Squashed commit of the following:

commit 1c07e03013ddf80c01b78419c98fde60122a65da Author: Benjamin Buchfink <[email protected]> Date: Thu Feb 16 16:41:35 2023 +0100 Updated version. commit d655b42d095fd5fc187b7734a17b360b897e63dd Author: Benjamin Buchfink <[email protected]> Date: Thu Feb 16 12:22:35 2023 +0100 Use linsearch for backmapping. commit 330becf3ffca6bbef91c33490dbb2c55adb5379c Author: Benjamin Buchfink <[email protected]> Date: Thu Feb 16 10:17:38 2023 +0100 Added linsearch option. commit c7d7cffa920950a6927a91ff1903cb795076f104 Author: Benjamin Buchfink <[email protected]> Date: Wed Feb 15 20:51:35 2023 +0100 Allow lin suffix. commit b7e01fc21aa2adee3a6f8d08a59c828e2d0c74c1 Author: Benjamin Buchfink <[email protected]> Date: Wed Feb 15 10:19:52 2023 +0100 Use linear clustering only for linclust. commit 2e82ce30f52bc4d76a02290c3641e9f1761f15a8 Author: Benjamin Buchfink <[email protected]> Date: Tue Feb 14 16:49:58 2023 +0100 Added check. commit 46966470bf2e2bd6ba4e5fce9a6f73fb4ee53007 Author: Benjamin Buchfink <[email protected]> Date: Tue Feb 14 16:02:42 2023 +0100 Added linear search to --iterate. commit 746d8a4addc0c9b5357c120f1768368380181140 Author: Benjamin Buchfink <[email protected]> Date: Tue Feb 14 12:39:31 2023 +0100 Fixed timer. commit 30233debd687e414d51e5f1bc904648b5540f06f Author: Benjamin Buchfink <[email protected]> Date: Tue Feb 14 11:15:01 2023 +0100 Added linear stage for targets. commit 1c9bf349fb07ba92b46a1f7ab719d54fb81e8b11 Author: Benjamin Buchfink <[email protected]> Date: Mon Feb 13 16:51:30 2023 +0100 No extra for lin-stage1 option. commit 190cef799c9877603f3f86b62515090323c3b909 Author: Benjamin Buchfink <[email protected]> Date: Mon Feb 13 16:34:00 2023 +0100 Added linclust command. commit fa888a19422daea83e8124ea3092aa7c3d0b180e Author: Benjamin Buchfink <[email protected]> Date: Mon Feb 13 14:59:01 2023 +0100 Fixed linclust bug. 
commit 20a9bd2efe2a78f4c24cbd9317b4537c7d292dc5 Author: Benjamin Buchfink <[email protected]> Date: Mon Feb 13 13:33:01 2023 +0100 Add tests to cmake. commit a95bfc459b79ddd42911f0df7ac6441d1d7961e5 Author: Benjamin Buchfink <[email protected]> Date: Mon Feb 13 13:11:17 2023 +0100 Added ctest file. commit c73bc9d8c1f6648e350c5e81b7b43048c3715f63 Merge: d8475271 1de45b3 Author: Benjamin Buchfink <[email protected]> Date: Mon Feb 13 13:10:04 2023 +0100 Merge branch 'master' into dev commit d847527139fcbb9a63ebb35a8a80586d4ef7f937 Merge: 06361689 579b497 Author: Benjamin Buchfink <[email protected]> Date: Mon Feb 13 10:36:45 2023 +0100 merged w/master commit 06361689da3aab9cbb094559e990c1e508e5e8dc Author: Benjamin Buchfink <[email protected]> Date: Mon Feb 13 10:30:58 2023 +0100 Fixed macos errors. commit 65911a2cc276db2f50dbbdc55ea1a37dcf1ff325 Author: Benjamin Buchfink <[email protected]> Date: Fri Feb 10 17:26:36 2023 +0100 Use temp files for db blocks. commit 5350c59abf3d2e5be661edc56f0fa33d5ab0d1dd Author: Benjamin Buchfink <[email protected]> Date: Fri Feb 10 15:16:10 2023 +0100 Fixed -k for view. commit f6cc746b206be49939eef8c339448d0c8b220c2b Author: Benjamin Buchfink <[email protected]> Date: Fri Feb 10 15:13:23 2023 +0100 Added merge-daa command. commit fb573774f12267218e80d2691bbbc8843b93dfa1 Author: Benjamin Buchfink <[email protected]> Date: Fri Feb 10 15:03:34 2023 +0100 Fixed config, commit 0da23199907de5f01abeb453a17e34d4d7903d31 Author: Benjamin Buchfink <[email protected]> Date: Fri Feb 10 14:42:03 2023 +0100 Added merge-daa command. commit de5db9380d527cfaeb63cc729389070fedf94e1d Author: Benjamin Buchfink <[email protected]> Date: Fri Feb 10 14:22:56 2023 +0100 Fixed error in view. commit 3e265c1b226020aa3c18e326de726730246b55e3 Author: Benjamin Buchfink <[email protected]> Date: Fri Feb 10 13:59:45 2023 +0100 Added option to turn off avx2.
bbuchfink · Feb 16, 2023 · 9006cd7 · 9006cd7
1 parent 1de45b3
commit 9006cd7
Show file tree

Hide file tree

Showing 24 changed files with 230 additions and 69 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -509,4 +509,8 @@ target_link_libraries(diamond ${ZLIB_LIBRARY} ${CMAKE_THREAD_LIBS_INIT})
 
 install(TARGETS diamond DESTINATION bin)
 
-add_test(NAME diamond COMMAND "diamond test")
+enable_testing()
+add_test(
+ NAME diamond
+ COMMAND diamond test
+)
diff --git a/src/ChangeLog b/src/ChangeLog
@@ -1,4 +1,9 @@
 [2.1.2]
+- The iterated search mode (option `--iterate`) now uses a linear-time search as
+ the first search round.
+- Added the `linclust` command to cluster using only a single linear-time search
+ round.
+- Fixed compiler errors on macOS.
 - Fixed a bug that caused invalid alignment traceback output for the DAA `view`
  workflow.
 - Added the `merge-daa` workflow to merge DAA files.
@@ -9,6 +14,15 @@
 - Permitted the `--ignore-warnings` option for the `cluster` and `deepclust`
  workflows.
 - Use unlinked temporary files for database blocks in clustering workflows.
+- Fixed a bug that could cause invalid results when using a clustering step with
+ linearization as the final round in combination with database processing in
+ multiple super blocks.
+- The `--lin-stage1` option can now be used without compiling with the
+ `-DEXTRA=ON` cmake option.
+- Added the option to specify the `_lin` suffix for sensitivity keywords for the
+ `--iterate` option to activate the linear-time search.
+- Added the option `--linsearch` to enable linear-time search in the search
+ workflows.
 
 [2.1.1]
 - Fixed compilation errors on non-x86 systems and for the clang compiler.
@@ -50,7 +64,8 @@
 - Added the output fields `approx_pident` and `corrected_bitscore` to the tabular
  format.
 - Added the `--lin-stage1` option to linearize comparisons in the seeding stage
- by only considering hits against the longest query sequence for identical seeds.
+ by only considering hits against the longest query sequence for identical seeds
+ (only supported when compiled with `-DEXTRA=ON`).
 - Added the `--kmer-ranking` option to rank sequences when `--lin-stage1` is used
  (only supported when compiled with `-DKEEP_TARGET_ID=ON`).
 - Added the option `--no-block-size-limit` to deactivate upper limits for the block

diff --git a/src/basic/basic.cpp b/src/basic/basic.cpp
@@ -29,7 +29,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
 #include "../util/util.h"
 #include "../stats/standard_matrix.h"
 
-const char* Const::version_string = "2.1.1";
+const char* Const::version_string = "2.1.2";
 using std::string;
 using std::vector;
 using std::count;

diff --git a/src/basic/config.cpp b/src/basic/config.cpp
@@ -200,6 +200,7 @@ Config::Config(int argc, const char **argv, bool check_io, CommandLineParser& pa
  .add_command("blastp", "Align amino acid query sequences against a protein reference database", blastp)
  .add_command("blastx", "Align DNA query sequences against a protein reference database", blastx)
  .add_command("cluster", "Cluster protein sequences", cluster)
+ .add_command("linclust", "Cluster protein sequences in linear time", LINCLUST)
  .add_command("realign", "Realign clustered sequences against their centroids", CLUSTER_REALIGN)
  .add_command("recluster", "Recompute clustering to fix errors", RECLUSTER)
  .add_command("reassign", "Reassign clustered sequences to the closest centroid", CLUSTER_REASSIGN)
@@ -248,7 +249,7 @@ Config::Config(int argc, const char **argv, bool check_io, CommandLineParser& pa
 #endif
  ;
 
- auto& general = parser.add_group("General options", { makedb, blastp, blastx, cluster, view, prep_db, getseq, dbinfo, makeidx, CLUSTER_REALIGN, GREEDY_VERTEX_COVER, DEEPCLUST, RECLUSTER, MERGE_DAA });
+ auto& general = parser.add_group("General options", { makedb, blastp, blastx, cluster, view, prep_db, getseq, dbinfo, makeidx, CLUSTER_REALIGN, GREEDY_VERTEX_COVER, DEEPCLUST, RECLUSTER, MERGE_DAA, LINCLUST });
  general.add()
  ("threads", 'p', "number of CPU threads", threads_)
  ("db", 'd', "database file", database)
@@ -266,7 +267,7 @@ Config::Config(int argc, const char **argv, bool check_io, CommandLineParser& pa
  ("taxonnodes", 0, "taxonomy nodes.dmp from NCBI", nodesdmp)
  ("taxonnames", 0, "taxonomy names.dmp from NCBI", namesdmp);
 
- auto& align_clust = parser.add_group("Aligner/Clustering options", { blastp, blastx, cluster, RECLUSTER, CLUSTER_REASSIGN, DEEPCLUST, CLUSTER_REALIGN });
+ auto& align_clust = parser.add_group("Aligner/Clustering options", { blastp, blastx, cluster, RECLUSTER, CLUSTER_REASSIGN, DEEPCLUST, CLUSTER_REALIGN, LINCLUST });
  align_clust.add()
  ("evalue", 'e', "maximum e-value to report alignments (default=0.001)", max_evalue, 0.001)
  ("tmpdir", 't', "directory for temporary files", tmpdir)
@@ -381,7 +382,7 @@ Config::Config(int argc, const char **argv, bool check_io, CommandLineParser& pa
 \tqstrand means Query strand\n\
 \n\tDefault: qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore", output_format);
 
- auto& cluster_opt = parser.add_group("Clustering options", { cluster, RECLUSTER, CLUSTER_REASSIGN, GREEDY_VERTEX_COVER, DEEPCLUST });
+ auto& cluster_opt = parser.add_group("Clustering options", { cluster, RECLUSTER, CLUSTER_REASSIGN, GREEDY_VERTEX_COVER, DEEPCLUST, LINCLUST });
  kmer_ranking = false;
  cluster_opt.add()
  ("member-cover", 0, "Minimum coverage% of the cluster member sequence (default=80.0)", member_cover, 80.0)
@@ -400,7 +401,7 @@ Config::Config(int argc, const char **argv, bool check_io, CommandLineParser& pa
 
  string algo_str;
 
- auto& advanced = parser.add_group("Advanced options", { blastp, blastx, makeidx, CLUSTER_REASSIGN, regression_test, cluster, DEEPCLUST });
+ auto& advanced = parser.add_group("Advanced options", { blastp, blastx, makeidx, CLUSTER_REASSIGN, regression_test, cluster, DEEPCLUST, LINCLUST });
  advanced.add()
  ("algo", 0, "Seed search algorithm (0=double-indexed/1=query-indexed/ctg=contiguous-seed)", algo_str)
  ("bin", 0, "number of query bins for seed search", query_bins_)
@@ -409,6 +410,8 @@ Config::Config(int argc, const char **argv, bool check_io, CommandLineParser& pa
  ("freq-masking", 0, "mask seeds based on frequency", freq_masking)
  ("freq-sd", 0, "number of standard deviations for ignoring frequent seeds", freq_sd_, 0.0)
  ("id2", 0, "minimum number of identities for stage 1 hit", min_identities_)
+ ("linsearch", 0, "only consider seed hits against longest target for identical seeds", linsearch)
+ ("lin-stage1", 0, "only consider seed hits against longest query for identical seeds", lin_stage1)
  ("xdrop", 'x', "xdrop for ungapped alignment", ungapped_xdrop, 12.3)
  ("gapped-filter-evalue", 0, "E-value threshold for gapped filter (auto)", gapped_filter_evalue_, -1.0)
  ("band", 0, "band for dynamic programming computation", padding)
@@ -597,7 +600,6 @@ Config::Config(int argc, const char **argv, bool check_io, CommandLineParser& pa
  ("chaining-stacked-hsp-ratio", 0, "", chaining_stacked_hsp_ratio, 0.5)
  ("swipe-task-size", 0, "", swipe_task_size, (int64_t)100000000)
  ("minimizer-window", 0, "", minimizer_window_)
- ("lin-stage1", 0, "", lin_stage1)
  ("min_task_trace_pts", 0, "", min_task_trace_pts, (int64_t)1024)
  ("sketch-size", 0, "", sketch_size)
  ("oid-list", 0, "", oid_list)
@@ -780,6 +782,7 @@ Config::Config(int argc, const char **argv, bool check_io, CommandLineParser& pa
  case Config::view:
  case Config::cluster:
  case Config::DEEPCLUST:
+ case Config::LINCLUST:
  case Config::regression_test:
  case Config::compute_medoids:
  case Config::CLUSTER_REASSIGN:
@@ -800,6 +803,7 @@ Config::Config(int argc, const char **argv, bool check_io, CommandLineParser& pa
  case Config::makedb:
  case Config::cluster:
  case Config::DEEPCLUST:
+ case Config::LINCLUST:
  case Config::regression_test:
  case Config::compute_medoids:
  case Config::LIST_SEEDS:
@@ -828,7 +832,7 @@ Config::Config(int argc, const char **argv, bool check_io, CommandLineParser& pa
 
  if (command == Config::blastp || command == Config::blastx || command == Config::blastn || command == Config::benchmark || command == Config::model_sim || command == Config::opt
  || command == Config::mask || command == Config::cluster || command == Config::compute_medoids || command == Config::regression_test || command == Config::CLUSTER_REASSIGN
- || command == Config::RECLUSTER || command == Config::DEEPCLUST) {
+ || command == Config::RECLUSTER || command == Config::DEEPCLUST || command == Config::LINCLUST) {
  if (tmpdir == "")
  tmpdir = extract_dir(output_file);
 

diff --git a/src/basic/config.h b/src/basic/config.h
@@ -335,6 +335,7 @@ struct Config
  bool recluster_bd;
  bool pipeline_short;
  string graph_algo;
+ bool linsearch;
 
  SequenceType dbtype;
 
@@ -350,7 +351,7 @@ struct Config
  match_file_stat = 14, model_seqs = 15, opt = 16, mask = 17, fastq2fasta = 18, dbinfo = 19, test_extra = 20, test_io = 21, db_annot_stats = 22, read_sim = 23, info = 24, seed_stat = 25,
  smith_waterman = 26, cluster = 27, translate = 28, filter_blasttab = 29, show_cbs = 30, simulate_seqs = 31, split = 32, upgma = 33, upgma_mc = 34, regression_test = 35,
  reverse_seqs = 36, compute_medoids = 37, mutate = 38, rocid = 40, makeidx = 41, find_shapes, prep_db, composition, JOIN, HASH_SEQS, LIST_SEEDS, CLUSTER_REALIGN,
- GREEDY_VERTEX_COVER, INDEX_FASTA, FETCH_SEQ, CLUSTER_REASSIGN, blastn, RECLUSTER, LENGTH_SORT, MERGE_DAA, DEEPCLUST
+ GREEDY_VERTEX_COVER, INDEX_FASTA, FETCH_SEQ, CLUSTER_REASSIGN, blastn, RECLUSTER, LENGTH_SORT, MERGE_DAA, DEEPCLUST, LINCLUST
  };
  unsigned command;
 

diff --git a/src/basic/const.h b/src/basic/const.h
@@ -25,7 +25,7 @@ struct Const
 {
 
  enum {
- build_version = 155,
+ build_version = 156,
 #ifdef SINGLE_THREADED
  seedp_bits = 0,
 #else

diff --git a/src/cluster/cascaded/cascaded.cpp b/src/cluster/cascaded/cascaded.cpp
@@ -1,6 +1,9 @@
 /****
 DIAMOND protein aligner
-Copyright (C) 2013-2018 Benjamin Buchfink <[email protected]>
+Copyright (C) 2016-2023 Max Planck Society for the Advancement of Science e.V.
+ Benjamin Buchfink
+
+Code developed by Benjamin Buchfink <[email protected]>
 
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -66,6 +69,7 @@ vector<SuperBlockId> cluster(shared_ptr<SequenceFile>& db, const shared_ptr<BitV
  config.self = true;
  config.iterate.unset();
  config.mapany = false;
+ config.linsearch = false;
  tie(config.chunk_size, config.lowmem_) = block_size(Util::String::interpret_number(config.memory_limit.get(DEFAULT_MEMORY_LIMIT)), config.sensitivity, config.lin_stage1);
 
  shared_ptr<Callback> callback(new Callback);
@@ -99,10 +103,10 @@ static pair<vector<SuperBlockId>, BitVector> update_clustering(const BitVector&
  return { current_centroids, oid_filter };
 }
 
-vector<SuperBlockId> cascaded(shared_ptr<SequenceFile>& db) {
+vector<SuperBlockId> cascaded(shared_ptr<SequenceFile>& db, bool linear) {
  if (db->sequence_count() > (int64_t)numeric_limits<SuperBlockId>::max())
  throw runtime_error("Workflow supports a maximum of " + to_string(numeric_limits<SuperBlockId>::max()) + " input sequences.");
- const auto steps = cluster_steps(config.approx_min_id);
+ const auto steps = cluster_steps(config.approx_min_id, linear);
  shared_ptr<BitVector> oid_filter(new BitVector);
  int64_t cluster_count = db->sequence_count();
  vector<SuperBlockId> centroids(cluster_count);

diff --git a/src/cluster/cascaded/cascaded.h b/src/cluster/cascaded/cascaded.h
@@ -1,6 +1,9 @@
 /****
 DIAMOND protein aligner
-Copyright (C) 2013-2018 Benjamin Buchfink <[email protected]>
+Copyright (C) 2016-2023 Max Planck Society for the Advancement of Science e.V.
+ Benjamin Buchfink
+
+Code developed by Benjamin Buchfink <[email protected]>
 
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -30,8 +33,8 @@ struct Cascaded : public ClusteringAlgorithm {
  }
 };
 
-std::vector<SuperBlockId> cascaded(std::shared_ptr<SequenceFile>& db);
-std::vector<std::string> cluster_steps(double approx_id);
+std::vector<SuperBlockId> cascaded(std::shared_ptr<SequenceFile>& db, bool linear);
+std::vector<std::string> cluster_steps(double approx_id, bool linear);
 
 struct Callback : public Consumer {
  using Edge = Util::Algo::Edge<SuperBlockId>;

diff --git a/src/cluster/cascaded/helpers.cpp b/src/cluster/cascaded/helpers.cpp
@@ -1,14 +1,37 @@
+/****
+DIAMOND protein aligner
+Copyright (C) 2019-2023 Max Planck Society for the Advancement of Science e.V.
+
+Code developed by Benjamin Buchfink <[email protected]>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+****/
+
 #include "cascaded.h"
 
 using std::string;
 using std::vector;
 
 namespace Cluster {
 
-vector<string> cluster_steps(double approx_id) {
+vector<string> cluster_steps(double approx_id, bool linear) {
  if (!config.cluster_steps.empty())
  return config.cluster_steps;
- vector<string> v = { "faster_lin", "fast" };
+ vector<string> v = { "faster_lin" };
+ if (linear)
+ return v;
+ v.push_back("fast");
  if (approx_id < 90)
  v.push_back("default");
  if (approx_id < 50)

diff --git a/src/cluster/cascaded/recluster.cpp b/src/cluster/cascaded/recluster.cpp
@@ -1,8 +1,8 @@
 /****
 DIAMOND protein aligner
-Copyright (C) 2022 Max Planck Society for the Advancement of Science e.V.
+Copyright (C) 2022-2023 Max Planck Society for the Advancement of Science e.V.
 
-Code developed by Benjamin Buchfink <benjamin.buchfink@tue.mpg.de>
+Code developed by Benjamin Buchfink <buchfink@gmail.com>
 
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -83,10 +83,12 @@ static vector<OId> recluster(shared_ptr<SequenceFile>& db, const vector<OId>& cl
  config.query_cover = config.recluster_bd ? 0 : config.member_cover;
  config.subject_cover = 0;
  config.query_or_target_cover = config.recluster_bd ? config.member_cover : 0;
- config.sensitivity = from_string<Sensitivity>(cluster_steps(config.approx_min_id).back());
+ config.sensitivity = from_string<Sensitivity>(cluster_steps(config.approx_min_id, false).back());
  //tie(config.chunk_size, config.lowmem_) = block_size(Util::String::interpret_number(config.memory_limit.get(DEFAULT_MEMORY_LIMIT)), Search::iterated_sens.at(config.sensitivity).front(), false);
  config.lowmem_ = 1;
  config.chunk_size = 4.0;
+ config.lin_stage1 = false;
+ config.linsearch = false;
  shared_ptr<Mapback> mapback = make_shared<Mapback>(unal_members.size());
  Search::run(centroid_db, unaligned, mapback);
 
@@ -157,7 +159,7 @@ static vector<OId> recluster(shared_ptr<SequenceFile>& db, const vector<OId>& cl
  unaligned.reset();
  timer.finish();
 
- const vector<OId> reclust = recluster(unmapped, convert_mapping(cascaded(unmapped), OId()), iteration + 1);
+ const vector<OId> reclust = recluster(unmapped, convert_mapping(cascaded(unmapped, false), OId()), iteration + 1);
 
  timer.go("Deallocating memory");
  unmapped.reset(); 

diff --git a/src/cluster/cascaded/wrapper.cpp b/src/cluster/cascaded/wrapper.cpp
@@ -1,3 +1,23 @@
+/****
+DIAMOND protein aligner
+Copyright (C) 2019-2023 Max Planck Society for the Advancement of Science e.V.
+
+Code developed by Benjamin Buchfink <[email protected]>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+****/
+
 #include <numeric>
 #include "cascaded.h"
 #include "../data/fasta/fasta_file.h"
@@ -26,16 +46,18 @@ namespace Cluster {
 
 struct Config {
  Config(shared_ptr<SequenceFile>& db) :
+ linclust(config.command == ::Config::LINCLUST),
  message_stream(true),
  verbosity(1),
- sens(from_string<Sensitivity>(rstrip(cluster_steps(config.approx_min_id).back(), "_lin"))),
+ sens(from_string<Sensitivity>(rstrip(cluster_steps(config.approx_min_id, linclust).back(), "_lin"))),
  output_format(init_output(-1)),
  centroids(new FastaFile("", true, FastaFile::WriteAccess())),
  seqs_processed(0),
  letters_processed(0),
  oid_to_centroid_oid(new File(Schema{ Type::INT64, Type::INT64 }, "", Flags::TEMP))
  {
  }
+ bool linclust;
  MessageStream message_stream;
  int verbosity;
  Sensitivity sens;
@@ -78,7 +100,15 @@ static vector<SuperBlockId> search_vs_centroids(shared_ptr<FastaFile>& super_blo
  config.query_cover = config.member_cover;
  config.subject_cover = 0;
  config.query_or_target_cover = 0;
- config.iterate = vector<string>();
+ if (cfg.linclust) {
+ config.iterate.unset();
+ config.linsearch = true;
+ }
+ else {
+ config.iterate = vector<string>();
+ config.linsearch = false;
+ }
+ config.lin_stage1 = false;
  tie(config.chunk_size, config.lowmem_) = block_size(Util::String::interpret_number(config.memory_limit.get(DEFAULT_MEMORY_LIMIT)), cfg.sens, false);
  cfg.centroids->set_seqinfo_ptr(0);
  shared_ptr<BestCentroid> best_centroid(new BestCentroid(super_block->sequence_count()));
@@ -119,7 +149,7 @@ void Cascaded::run() {
  unique_ptr<Util::Tsv::File> out(open_out_tsv());
 
  if (block_size >= (double)db->letters() && db->sequence_count() < numeric_limits<SuperBlockId>::max()) {
- const auto centroids = cascaded(db);
+ const auto centroids = cascaded(db, config.command == ::Config::LINCLUST);
  timer.go("Generating output");
  output_mem<SuperBlockId>(*out, *db, centroids);
  }
@@ -159,7 +189,7 @@ void Cascaded::run() {
  seqs.reset();
  timer.finish();
  }
- const vector<SuperBlockId> clustering = cascaded(unaligned_db);
+ const vector<SuperBlockId> clustering = cascaded(unaligned_db, cfg.linclust);
  timer.go("Updating clustering");
  vector<SuperBlockId> centroids;
  for (SuperBlockId i = 0; i < (SuperBlockId)unaligned.size(); ++i) {