diff options
author | Damien Zammit <damien@zamaudio.com> | 2023-09-17 19:19:02 +1000 |
---|---|---|
committer | Damien Zammit <damien@zamaudio.com> | 2023-09-17 19:19:02 +1000 |
commit | a0a0eb2b879f9f0cce813b3a1065897812224533 (patch) | |
tree | eb8ad651e54dd02390d9c288284eb9e05f3ac671 /ruffsort.cpp |
Initial commit, creates suffix array and searches one query
Diffstat (limited to 'ruffsort.cpp')
-rw-r--r-- | ruffsort.cpp | 156 |
1 files changed, 156 insertions, 0 deletions
diff --git a/ruffsort.cpp b/ruffsort.cpp new file mode 100644 index 0000000..d1b4189 --- /dev/null +++ b/ruffsort.cpp @@ -0,0 +1,156 @@ +// Copyright (C) 2023 Damien Zammit <damien@zamaudio.com> +// Copyright (C) 2010 Herman Stehouwer +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see <http://www.gnu.org/licenses/>. + +#include <sstream> +#include <iostream> +#include <string> +#include "suffixarray.h" +#include "fastastring.h" +#include <vector> +#include <fstream> +#include <cstdlib> +#include <limits> + +#define HAVE_GETOPT_H + +#ifdef HAVE_GETOPT_H +#define GNU_SOURCE +#include <getopt.h> +#else +extern "C" { + char* optarg; + extern int optind, opterr, optopt; + struct option { const char *name; int has_arg; int *flag; int val; }; +#define no_argument 0 +#define required_argument 1 +#define optional_argument 2 +#ifdef HAVE_GETOPT_LONG_ONLY + extern int getopt_long_only (int argc, char * const argv[], + const char *optstring, const struct option *longopts, int + *longindex); +#else +#warning \ +Gnu Getopt Library not found: \ +cannot implement long option handling + + extern int getopt(int argc, char* const argv[], const char* + optstring); + inline int getopt_long_only(int argc, char * const argv[], const char + *optstring, const struct option *longopts, int *longindex) { + return getopt(argc, argv, optstring); + } +#endif +} // extern "C" +#endif // HAVE_GETOPT_H + +static struct option long_options[] = { + {"help", no_argument, 0, 'h'}, + {"ref", required_argument, 0, 'r'}, + {"query", required_argument, 0, 'q'}, + {0, 0, 0, 0} +}; + +using namespace std; +using namespace ns_suffixarray; + +typedef suffixarray<fastastring> sa_fasta; + +string program_name; + +void usage() +{ + cerr << "ruffsort -r (reference fasta) -q (query fastq to split by chr)" << endl; + exit(0); +} + +int +main(int argc, char* argv[]) { + program_name = argv[0]; + + ifstream is, iq; + string refseq; + string refseqsa; + + // Handle arguments + int opt; + int option_index; + const char* optstring="hr:q:"; + while ((opt = getopt_long_only(argc, argv, optstring, long_options, + &option_index)) !=-1){ + switch (opt) { + case 'h': + usage(); + break; + case 'r': + refseq = string(optarg); + refseqsa = refseq + string(".sa"); + is.open (refseqsa.c_str(), ios::in); + if (!is.good()) { + cerr << "reference has no suffix array generated, indexing now" << endl; + is.open (refseq.c_str(), ios::in); + if (!is.good()) { + cerr << "cannot open input file " << optarg << endl; + exit (-1); + } + fastastring seqs; + is >> seqs; + seqs.push_back('~'); + sa_fasta tree(seqs); + is.close(); + tree.savesarray(refseqsa); + cout << "Suffix array generated" << endl; + } + is.close(); + break; + case 'q': + iq.open (optarg, ios::in); + if (!iq.good()) { + cerr << "query fastq missing" << endl; + exit (-1); + } + break; + default: + cerr << "unknown argument " << opt << endl; + exit (-1); + break; + } + } + + is.open (refseq.c_str(), ios::in); + if (!is.good()) { + cerr << "cannot open ref file" << endl; + exit (-1); + } + + cerr << "Loading reference sequence" << endl; + fastastring ref; + is >> ref; + ref.push_back('~'); + + cerr << "Loading suffix array" << endl; + sa_fasta tree(refseqsa, ref); + + sa_fasta::size_type result; + fastastring totest; + iq >> totest; + try { + result = tree.find_position(totest, 'N'); + } catch (StringNotFound) { + cerr << "Not found" << endl; + exit (0); + } + cout << result << endl; +} |