summaryrefslogtreecommitdiff
path: root/ruffsort.cpp
diff options
context:
space:
mode:
authorDamien Zammit <damien@zamaudio.com>2023-09-17 19:19:02 +1000
committerDamien Zammit <damien@zamaudio.com>2023-09-17 19:19:02 +1000
commita0a0eb2b879f9f0cce813b3a1065897812224533 (patch)
treeeb8ad651e54dd02390d9c288284eb9e05f3ac671 /ruffsort.cpp
Initial commit, creates suffix array and searches one query
Diffstat (limited to 'ruffsort.cpp')
-rw-r--r--ruffsort.cpp156
1 files changed, 156 insertions, 0 deletions
diff --git a/ruffsort.cpp b/ruffsort.cpp
new file mode 100644
index 0000000..d1b4189
--- /dev/null
+++ b/ruffsort.cpp
@@ -0,0 +1,156 @@
+// Copyright (C) 2023 Damien Zammit <damien@zamaudio.com>
+// Copyright (C) 2010 Herman Stehouwer
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+#include <sstream>
+#include <iostream>
+#include <string>
+#include "suffixarray.h"
+#include "fastastring.h"
+#include <vector>
+#include <fstream>
+#include <cstdlib>
+#include <limits>
+
+#define HAVE_GETOPT_H
+
+#ifdef HAVE_GETOPT_H
+#define GNU_SOURCE
+#include <getopt.h>
+#else
+extern "C" {
+ char* optarg;
+ extern int optind, opterr, optopt;
+ struct option { const char *name; int has_arg; int *flag; int val; };
+#define no_argument 0
+#define required_argument 1
+#define optional_argument 2
+#ifdef HAVE_GETOPT_LONG_ONLY
+ extern int getopt_long_only (int argc, char * const argv[],
+ const char *optstring, const struct option *longopts, int
+ *longindex);
+#else
+#warning \
+Gnu Getopt Library not found: \
+cannot implement long option handling
+
+ extern int getopt(int argc, char* const argv[], const char*
+ optstring);
+ inline int getopt_long_only(int argc, char * const argv[], const char
+ *optstring, const struct option *longopts, int *longindex) {
+ return getopt(argc, argv, optstring);
+ }
+#endif
+} // extern "C"
+#endif // HAVE_GETOPT_H
+
+static struct option long_options[] = {
+ {"help", no_argument, 0, 'h'},
+ {"ref", required_argument, 0, 'r'},
+ {"query", required_argument, 0, 'q'},
+ {0, 0, 0, 0}
+};
+
+using namespace std;
+using namespace ns_suffixarray;
+
+typedef suffixarray<fastastring> sa_fasta;
+
+string program_name;
+
+void usage()
+{
+ cerr << "ruffsort -r (reference fasta) -q (query fastq to split by chr)" << endl;
+ exit(0);
+}
+
+int
+main(int argc, char* argv[]) {
+ program_name = argv[0];
+
+ ifstream is, iq;
+ string refseq;
+ string refseqsa;
+
+ // Handle arguments
+ int opt;
+ int option_index;
+ const char* optstring="hr:q:";
+ while ((opt = getopt_long_only(argc, argv, optstring, long_options,
+ &option_index)) !=-1){
+ switch (opt) {
+ case 'h':
+ usage();
+ break;
+ case 'r':
+ refseq = string(optarg);
+ refseqsa = refseq + string(".sa");
+ is.open (refseqsa.c_str(), ios::in);
+ if (!is.good()) {
+ cerr << "reference has no suffix array generated, indexing now" << endl;
+ is.open (refseq.c_str(), ios::in);
+ if (!is.good()) {
+ cerr << "cannot open input file " << optarg << endl;
+ exit (-1);
+ }
+ fastastring seqs;
+ is >> seqs;
+ seqs.push_back('~');
+ sa_fasta tree(seqs);
+ is.close();
+ tree.savesarray(refseqsa);
+ cout << "Suffix array generated" << endl;
+ }
+ is.close();
+ break;
+ case 'q':
+ iq.open (optarg, ios::in);
+ if (!iq.good()) {
+ cerr << "query fastq missing" << endl;
+ exit (-1);
+ }
+ break;
+ default:
+ cerr << "unknown argument " << opt << endl;
+ exit (-1);
+ break;
+ }
+ }
+
+ is.open (refseq.c_str(), ios::in);
+ if (!is.good()) {
+ cerr << "cannot open ref file" << endl;
+ exit (-1);
+ }
+
+ cerr << "Loading reference sequence" << endl;
+ fastastring ref;
+ is >> ref;
+ ref.push_back('~');
+
+ cerr << "Loading suffix array" << endl;
+ sa_fasta tree(refseqsa, ref);
+
+ sa_fasta::size_type result;
+ fastastring totest;
+ iq >> totest;
+ try {
+ result = tree.find_position(totest, 'N');
+ } catch (StringNotFound) {
+ cerr << "Not found" << endl;
+ exit (0);
+ }
+ cout << result << endl;
+}