diff --git a/lib/miga/cli/action/add_result.rb b/lib/miga/cli/action/add_result.rb index 20504f50..63194aa0 100644 --- a/lib/miga/cli/action/add_result.rb +++ b/lib/miga/cli/action/add_result.rb @@ -5,13 +5,17 @@ class MiGA::Cli::Action::AddResult < MiGA::Cli::Action def parse_cli - cli.defaults = { force: false } + cli.defaults = { force: false, stdin_versions: false } cli.parse do |opt| cli.opt_object(opt, [:project, :dataset_opt, :result]) opt.on( '-f', '--force', 'Force re-indexing of the result even if it\'s already registered' ) { |v| cli[:force] = v } + opt.on( + '--stdin-versions', + 'Read Software versions from STDIN' + ) { |v| cli[:stdin_versions] = v } end end @@ -21,5 +25,22 @@ def perform cli.say "Registering result: #{cli[:result]}" r = obj.add_result(cli[:result], true, force: cli[:force]) raise 'Cannot add result, incomplete expected files' if r.nil? + + # Add Software version data + if cli[:stdin_versions] + versions = {} + sw = nil + $stdin.each do |ln| + ln = ln.chomp.strip + if ln =~ /^=> (.*)/ + sw = $1 + versions[sw] = '' + else + versions[sw] += ln + end + end + r.add_versions(versions) + r.save + end end end diff --git a/lib/miga/cli/action/download/gtdb.rb b/lib/miga/cli/action/download/gtdb.rb index 9a70f1d1..10ed5706 100644 --- a/lib/miga/cli/action/download/gtdb.rb +++ b/lib/miga/cli/action/download/gtdb.rb @@ -31,7 +31,7 @@ def sanitize_cli def remote_list cli.say 'Downloading genome list' - extra = ['sp_reps_only=' + cli[:reference].to_s] + extra = { sp_reps_only: cli[:reference].to_s } json = MiGA::RemoteDataset.download( :gtdb, :taxon, cli[:taxon], :genomes, nil, extra ) diff --git a/lib/miga/cli/action/download/ncbi.rb b/lib/miga/cli/action/download/ncbi.rb index 0248fa54..238b799c 100644 --- a/lib/miga/cli/action/download/ncbi.rb +++ b/lib/miga/cli/action/download/ncbi.rb @@ -34,11 +34,8 @@ def cli_name_modifiers(opt) 'Do not add sequence version to the dataset name', 'Only affects --complete and --chromosome' ) { |v| cli[:add_version] = v } - cli.opt_flag( - opt, 'legacy-name', - 'Use dataset names based on chromosome entries instead of assembly', - :legacy_name - ) + # For backwards compatibility + cli.opt_flag(opt, 'legacy-name', '::HIDE::', :legacy_name) end def sanitize_cli @@ -52,89 +49,67 @@ def sanitize_cli end def remote_list - doc = - if cli[:ncbi_table_file] - cli.say 'Reading genome list from file' - File.open(cli[:ncbi_table_file], 'r') - else - cli.say 'Downloading genome list' - url = remote_list_url - MiGA::RemoteDataset.download_url(url) - end - ds = parse_csv_as_datasets(doc) - doc.close if cli[:ncbi_table_file] - ds + list = {} + query = remote_list_query + loop do + # Query the remote collection + page = MiGA::Json.parse( + MiGA::RemoteDataset.download(:ncbi_datasets, :genome, query, :json), + contents: true + ) + break unless page&.any? && page[:reports]&.any? + + # Process reports in this page + list.merge!(parse_reports_as_datasets(page[:reports])) + + # Next page + break unless page[:next_page_token] + query[:page_token] = page[:next_page_token] + end + list end - def parse_csv_as_datasets(doc) + def parse_reports_as_datasets(reports) ds = {} - CSV.parse(doc, headers: true).each do |r| - asm = r['assembly'] + reports.each do |r| + asm = r[:accession] next if asm.nil? || asm.empty? || asm == '-' - rep = remote_row_replicons(r) - n = remote_row_name(r, rep, asm) - # Register for download + n = remote_report_name(r, asm) ds[n] = { ids: [asm], db: :assembly, universe: :ncbi, md: { - type: :genome, ncbi_asm: asm, strain: r['strain'] + type: :genome, ncbi_asm: asm, strain: r.dig(:organism, :infraspecific_names, :strain) } } - ds[n][:md][:ncbi_nuccore] = rep.join(',') unless rep.nil? - unless r['release_date'].nil? - ds[n][:md][:release_date] = Time.parse(r['release_date']).to_s - end + date = r.dig(:assembly_info, :release_date) + ds[n][:md][:release_date] = Time.parse(date).to_s if date + ds[n][:md][:ncbi_dataset] = r end ds end - def remote_row_replicons(r) - return if r['replicons'].nil? - - r['replicons'] - .split('; ') - .map { |i| i.gsub(/.*:/, '') } - .map { |i| i.gsub(%r{/.*}, '') } - end - - def remote_row_name(r, rep, asm) - return r['#organism'].miga_name if cli[:legacy_name] && cli[:reference] - - if cli[:legacy_name] && ['Complete', ' Chromosome'].include?(r['level']) - acc = rep.nil? ? '' : rep.first - else - acc = asm - end + def remote_report_name(r, asm) + acc = "#{asm}" acc.gsub!(/\.\d+\Z/, '') unless cli[:add_version] - "#{r['#organism']}_#{acc}".miga_name + org = r.dig(:organism, :organism_name) + acc = "#{org}_#{acc}" if org + acc.miga_name end - def remote_list_url - url_base = 'https://www.ncbi.nlm.nih.gov/genomes/solr2txt.cgi?' - url_param = { - q: '[display()].' \ - 'from(GenomeAssemblies).' \ - 'usingschema(/schema/GenomeAssemblies).' \ - 'matching(tab==["Prokaryotes"] and q=="' \ - "#{cli[:taxon]&.tr('"', "'")}\"", - fields: 'organism|organism,assembly|assembly,replicons|replicons,' \ - 'level|level,release_date|release_date,strain|strain', - nolimit: 'on' - } + def remote_list_query + q = { taxons: [cli[:taxon]], filters: {} } if cli[:reference] - url_param[:q] += ' and refseq_category==["representative"]' + q[:filters][:reference_only] = true else - status = { - complete: 'Complete', - chromosome: ' Chromosome', # <- The leading space is *VERY* important! - scaffold: 'Scaffold', - contig: 'Contig' - }.map { |k, v| '"' + v + '"' if cli[k] }.compact.join(',') - url_param[:q] += ' and level==[' + status + ']' + q[:assembly_level] = { + contig: 'contig', + scaffold: 'scaffold', + chromosome: 'chromosome', + complete: 'complete_genome' + }.map { |k, v| '"' + v + '"' if cli[k] }.compact end - url_param[:q] += ')' - url_base + URI.encode_www_form(url_param) + q end end diff --git a/lib/miga/cli/action/download/seqcode.rb b/lib/miga/cli/action/download/seqcode.rb index 0696d5fd..087f505b 100644 --- a/lib/miga/cli/action/download/seqcode.rb +++ b/lib/miga/cli/action/download/seqcode.rb @@ -29,8 +29,7 @@ def remote_list while current_page <= total_pages json = MiGA::RemoteDataset.download( - :seqcode, :'type-genomes', nil, :json, nil, - ["page=#{current_page}"] + :seqcode, :'type-genomes', nil, :json, nil, page: current_page ) doc = MiGA::Json.parse(json, contents: true) current_page = doc[:current_page] + 1 diff --git a/lib/miga/cli/action/ncbi_get.rb b/lib/miga/cli/action/ncbi_get.rb index d58c5380..859bcc3a 100644 --- a/lib/miga/cli/action/ncbi_get.rb +++ b/lib/miga/cli/action/ncbi_get.rb @@ -8,8 +8,7 @@ class MiGA::Cli::Action::NcbiGet < MiGA::Cli::Action def parse_cli cli.defaults = { - query: false, unlink: false, - reference: false, legacy_name: false, + query: false, unlink: false, reference: false, complete: false, chromosome: false, scaffold: false, contig: false, add_version: true, dry: false, get_md: false, only_md: false, save_every: 1 @@ -29,12 +28,6 @@ def parse_cli '--api-key STRING', '::HIDE::' # For backwards compatibility ) { |v| ENV['NCBI_API_KEY'] = v } - opt.on( - '--ncbi-table-file STRING', - '::HIDE::' # Only meant for debugging - # It can take the table returned by NCBI and parse it from a file - # instead of downloading it directly - ) { |v| cli[:ncbi_table_file] = v } opt.on( '--ncbi-api-key STRING', 'NCBI API key' diff --git a/lib/miga/cli/action/wf.rb b/lib/miga/cli/action/wf.rb index 2ad7375e..1265e570 100644 --- a/lib/miga/cli/action/wf.rb +++ b/lib/miga/cli/action/wf.rb @@ -8,7 +8,8 @@ def default_opts_for_wf cli.expect_files = true cli.defaults = { clean: false, project_type: :genomes, dataset_type: :popgenome, - ncbi_draft: true, min_qual: MiGA::Project.OPTIONS[:min_qual][:default], + ncbi_draft: true, ncbi_ref: false, + min_qual: MiGA::Project.OPTIONS[:min_qual][:default], prepare_and_exit: false } end @@ -39,14 +40,21 @@ def opts_for_wf(opt, files_desc, params = {}) '-T', '--ncbi-taxon STRING', 'Download all the genomes in NCBI classified as this taxon' ) { |v| cli[:ncbi_taxon] = v } + opt.on( + '--no-draft', '::HIDE::' # Deprecated + ) { |v| cli[:ncbi_draft] = v } + opt.on( + '--ncbi-complete', + 'Only download complete genomes, not drafts (requires -T)' + ) { |v| cli[:ncbi_draft] = !v } + opt.on( + '--ncbi-ref', + 'Only download RefSeq reference genomes (requires -T)' + ) { |v| cli[:ncbi_ref] = v } opt.on( '-G', '--gtdb-taxon STRING', 'Download all the genomes in GTDB classified as this taxon' ) { |v| cli[:gtdb_taxon] = v } - opt.on( - '--no-draft', - 'Only download complete genomes, not drafts (requires -T)' - ) { |v| cli[:ncbi_draft] = v } opt.on( '--gtdb-ref', 'Only download reference anchor genomes in GTDB (requires -G)' @@ -170,7 +178,8 @@ def initialize_empty_project(metadata) def download_datasets # Download datasets from NCBI unless cli[:ncbi_taxon].nil? - what = cli[:ncbi_draft] ? '--all' : '--complete' + what = cli[:ncbi_ref] ? '--reference' : + cli[:ncbi_draft] ? '--all' : '--complete' cmd = ['ncbi_get', '-P', cli[:outdir], '-T', cli[:ncbi_taxon], what] cmd += ['--max', cli[:max_download]] if cli[:max_download] call_cli(cmd) diff --git a/lib/miga/cli/objects_helper.rb b/lib/miga/cli/objects_helper.rb index d28e6f62..abda3213 100644 --- a/lib/miga/cli/objects_helper.rb +++ b/lib/miga/cli/objects_helper.rb @@ -60,6 +60,9 @@ def load_and_filter_datasets(silent = false) o &&= (d.ref? == self[:ref]) unless self[:ref].nil? o &&= (d.active? == self[:active]) unless self[:active].nil? o &&= (self[:multi] ? d.multi? : d.nonmulti?) unless self[:multi].nil? + unless self[:markers].nil? + o &&= (self[:markers] ? d.markers? : !d.markers?) + end unless self[:taxonomy].nil? o &&= !d.metadata[:tax].nil? && d.metadata[:tax].in?(self[:taxonomy]) end diff --git a/lib/miga/cli/opt_helper.rb b/lib/miga/cli/opt_helper.rb index e4d6c546..1a44e322 100644 --- a/lib/miga/cli/opt_helper.rb +++ b/lib/miga/cli/opt_helper.rb @@ -43,7 +43,7 @@ def opt_common(opt) '-h', '--help', 'Display this screen' ) do - puts opt.to_s.gsub(/^.*\s+::HIDE::\s*$/, '') + puts opt.to_a.select { |i| i !~ /\s::HIDE::\s/ } exit end opt.separator '' @@ -120,10 +120,11 @@ def opt_object(opt, what = %i[project dataset]) # as determined by +what+ an Array with any combination of: # - :ref To filter by reference (--ref) or query (--no-ref) # - :multi To filter by multiple (--multi) or single (--no-multi) species + # - :markers To filter by with (--markers) or without markers (--no-markers) # - :active To filter by active (--active) or inactive (--no-active) # - :taxonomy To filter by taxonomy (--taxonomy) # The "k-th" filter (--dataset-k) is always included - def opt_filter_datasets(opt, what = %i[ref multi active taxonomy]) + def opt_filter_datasets(opt, what = %i[ref multi markers active taxonomy]) what.each do |w| case w when :ref @@ -136,6 +137,11 @@ def opt_filter_datasets(opt, what = %i[ref multi active taxonomy]) '--[no-]multi', 'Use only multi-species (or only single-species) datasets' ) { |v| self[:multi] = v } + when :markers + opt.on( + '--[no-]markers', + 'Use only datasets with (or without) markers' + ) { |v| self[:markers] = v } when :active opt.on( '--[no-]active', diff --git a/lib/miga/common/net.rb b/lib/miga/common/net.rb index a2477497..bb903676 100644 --- a/lib/miga/common/net.rb +++ b/lib/miga/common/net.rb @@ -1,5 +1,6 @@ # frozen_string_literal: true +require 'net/http' require 'net/ftp' require 'open-uri' require 'fileutils' @@ -10,6 +11,8 @@ ## # General web-access functions shared throughout MiGA. module MiGA::Common::Net + attr_accessor :remote_connection_uri + ## # Returns the URL of the host +name+ (Symbol) def known_hosts(name) @@ -21,7 +24,7 @@ def known_hosts(name) when :miga_dist "ftp://#{main_server}/dist" else - raise "Unrecognized server name: #{host}" + raise "Unrecognized server name: #{name}" end end @@ -32,49 +35,128 @@ def main_server end ## - # Connect to an FTP +host+ (String) or a known host name (Symbol, see - # +.known_hosts+) + # Connect to an FTP +host+ (String), a known host name (Symbol, see + # +.known_hosts+), or a parsed +URI+ object + # + # Sets the attribute +remote_connection_uri+ to the parsed +URI+ object + # silently def remote_connection(host) host = known_hosts(host) if host.is_a?(Symbol) - uri = URI.parse(host) - raise 'Only FTP hosts are currently supported' unless uri.scheme == 'ftp' - - ftp = Net::FTP.new(uri.host) - ftp.passive = true - ftp.login - ftp.chdir(uri.path) - ftp + uri = host.is_a?(URI) ? host : URI.parse(host) + @remote_connection_uri = uri + + case uri.scheme + when 'ftp' + ftp = Net::FTP.new(uri.host) + ftp.passive = true + ftp.login + ftp.chdir(uri.path) unless host.is_a?(URI) + ftp + when 'http', 'https' + http = Net::HTTP.new(uri.host, uri.port) + http.read_timeout = 600 + http.use_ssl = uri.scheme == 'https' + http + else + raise 'Only FTP, HTTP, and HTTPS are currently supported' + end end ## # Download a file via FTP using the +connection+ (returned by - # +.remote_connection+) with remote name +file+ into local +target+. + # +.remote_connection+) with remote name +file+ into local +target+. If +file+ + # is +nil+, it tries to guess the file from +connection+. If +target+ is + # +nil+, it returns the read data instead # - # Alternatively, +connection+ can simply be the host (String) or a recognized - # Symbol (see +.remote_connection+), in which case the function opens the - # connection automatically + # Alternatively, +connection+ can simply be the host (String), a recognized + # Symbol (see +.remote_connection+), or a parsed +URI+ object, in which case + # the function opens the connection automatically # # Reports progress to the function block with two arguments: the # currently transferred size and the total file size - def download_file_ftp(connection, file, target) + def download_file_ftp(connection, file = nil, target = nil) # Open connection unless passed close_conn = false - if connection.is_a?(String) || connection.is_a?(Symbol) + if connection.is_a?(String) || connection.is_a?(Symbol) || + connection.is_a?(URI) connection = remote_connection(connection) + file ||= remote_connection_uri.path close_conn = true end # Prepare download - FileUtils.mkdir_p(File.dirname(target)) + FileUtils.mkdir_p(File.dirname(target)) if target filesize = connection.size(file) transferred = 0 # Get in chunks of 1KiB + ret = '' connection.getbinaryfile(file, target, 1024) do |data| yield(transferred += data.size, filesize) if block_given? + ret += data unless target end # Close connection if automatically opened connection.close if close_conn + ret unless target + end + + ## + # Submit an HTTP or HTTPS request using +url+, which should be a URL + # either as String or parsed URI. The request follows the +method+, which + # should be a Net::HTTP verb such as +:get+, +:post+, or +:patch+. All + # additional parameters for the corresponding method should be passed as + # +opts+. + def http_request(method, url, *opts) + doc = nil + remote_connection(url).start do |http| + res = http.send(method, remote_connection_uri.to_s, *opts) + if %w[301 302].include?(res.code) + DEBUG "REDIRECTION #{res.code}: #{res['location']}" + return http_request(method, res['location'], *opts) + end + res.value # To force exception unless success + doc = res.body + end + doc + end + + def net_method(method, uri, *opts) + attempts ||= 0 + DEBUG "#{method.to_s.upcase}: #{uri} #{opts}" + case method.to_sym + when :ftp + download_file_ftp(uri) + else + http_request(method, uri, *opts) + end + rescue => e + raise e if (attempts += 1) >= 3 + + sleep 5 # <- For: 429 Too Many Requests + DEBUG "RETRYING after: #{e}" + retry + end + + alias :https_request :http_request + + ## + # Normalize the encoding of +body+ to UTF-8 by attempting several + # common recodings. Code from https://github.com/seq-code/registry + def normalize_encoding(body) + # Test encodings + body.force_encoding('utf-8') + %w[iso8859-1 windows-1252 us-ascii ascii-8bit].each do |enc| + break if body.valid_encoding? + recode = body.force_encoding(enc).encode('utf-8') + body = recode if recode.valid_encoding? + end + # If nothing works, replace offending characters with '?' + unless body.valid_encoding? + body = body.encode( + 'utf-8', invalid: :replace, undef: :replace, replace: '?' + ) + end + body end end diff --git a/lib/miga/dataset.rb b/lib/miga/dataset.rb index b373091d..40ad40f5 100644 --- a/lib/miga/dataset.rb +++ b/lib/miga/dataset.rb @@ -6,6 +6,7 @@ require 'miga/metadata' require 'miga/dataset/result' require 'miga/dataset/status' +require 'miga/dataset/type' require 'miga/dataset/hooks' # This library is only required by +#closest_relatives+, so it is now @@ -18,6 +19,7 @@ class MiGA::Dataset < MiGA::MiGA include MiGA::Dataset::Result include MiGA::Dataset::Status + include MiGA::Dataset::Type include MiGA::Dataset::Hooks # Class-level @@ -56,6 +58,7 @@ def initialize(project, name, is_ref = true, metadata = {}) name.to_s @project, @name, @metadata = project, name, nil metadata[:ref] = is_ref + metadata[:type] ||= :empty @metadata_future = [ File.join(project.path, 'metadata', "#{name}.json"), metadata @@ -89,12 +92,6 @@ def save # +Project+ interface alias :save! :save - ## - # Get the type of dataset as Symbol - def type - metadata[:type] - end - ## # Delete the dataset with all it's contents (including results) and returns # nil @@ -146,22 +143,6 @@ def query? !metadata[:ref] end - ## - # Is this dataset known to be multi-organism? - def multi? - return false if metadata[:type].nil? || @@KNOWN_TYPES[type].nil? - - @@KNOWN_TYPES[type][:multi] - end - - ## - # Is this dataset known to be single-organism? - def nonmulti? - return false if metadata[:type].nil? || @@KNOWN_TYPES[type].nil? - - !@@KNOWN_TYPES[type][:multi] - end - ## # Is this dataset active? def active? diff --git a/lib/miga/dataset/base.rb b/lib/miga/dataset/base.rb index 51d7b422..c6e52977 100644 --- a/lib/miga/dataset/base.rb +++ b/lib/miga/dataset/base.rb @@ -32,6 +32,12 @@ def EXCLUDE_NOREF_TASKS @@EXCLUDE_NOREF_TASKS end + ## + # Tasks to be excluded from datasets without markers + def EXCLUDE_NOMARKER_TASKS + @@EXCLUDE_NOMARKER_TASKS + end + ## # Tasks to be executed only in datasets that are single-organism. These # tasks are ignored for multi-organism datasets or for unknown types @@ -81,45 +87,67 @@ module MiGA::Dataset::Base # Supported dataset types @@KNOWN_TYPES = { genome: { - description: 'The genome from an isolate', multi: false + description: 'The genome from an isolate', + multi: false, markers: true, + project_types: %i[mixed genomes clade] }, scgenome: { - description: 'A Single-cell Amplified Genome (SAG)', multi: false + description: 'A Single-cell Amplified Genome (SAG)', + multi: false, markers: true, + project_types: %i[mixed genomes clade] }, popgenome: { - description: 'A Metagenome-Assembled Genome (MAG)', multi: false + description: 'A Metagenome-Assembled Genome (MAG)', + multi: false, markers: true, + project_types: %i[mixed genomes clade] }, metagenome: { - description: 'A metagenome (excluding viromes)', multi: true + description: 'A metagenome (excluding viromes)', + multi: true, markers: true, + project_types: %i[mixed metagenomes] }, virome: { - description: 'A viral metagenome', multi: true + description: 'A viral metagenome', + multi: true, + markers: true, # <- We don't expect, but can be useful for contamination + project_types: %i[mixed metagenomes] + }, + plasmid: { + description: 'An individual plasmid', + multi: false, markers: false, + project_types: %i[mixed plasmids] } } ## # Returns an Array of tasks (Symbols) to be executed before project-wide tasks - @@PREPROCESSING_TASKS = [ - :raw_reads, :trimmed_reads, :read_quality, :trimmed_fasta, - :assembly, :cds, :essential_genes, :mytaxa, :mytaxa_scan, - :taxonomy, :distances, :ssu, :stats + @@PREPROCESSING_TASKS = %i[ + raw_reads trimmed_reads read_quality trimmed_fasta + assembly cds essential_genes mytaxa mytaxa_scan + taxonomy distances ssu stats ] ## # Tasks to be excluded from query datasets - @@EXCLUDE_NOREF_TASKS = [:mytaxa_scan, :taxonomy] + @@EXCLUDE_NOREF_TASKS = %i[mytaxa_scan taxonomy] @@_EXCLUDE_NOREF_TASKS_H = Hash[@@EXCLUDE_NOREF_TASKS.map { |i| [i, true] }] + ## + # Tasks to be excluded from datasets without markers + @@EXCLUDE_NOMARKER_TASKS = %i[essential_genes ssu] + @@_EXCLUDE_NOMARKER_TASKS_H = + Hash[@@EXCLUDE_NOMARKER_TASKS.map { |i| [i, true] }] + ## # Tasks to be executed only in datasets that are single-organism. These # tasks are ignored for multi-organism datasets or for unknown types - @@ONLY_NONMULTI_TASKS = [:mytaxa_scan, :taxonomy, :distances] + @@ONLY_NONMULTI_TASKS = %i[mytaxa_scan taxonomy distances] @@_ONLY_NONMULTI_TASKS_H = Hash[@@ONLY_NONMULTI_TASKS.map { |i| [i, true] }] ## # Tasks to be executed only in datasets that are multi-organism. These # tasks are ignored for single-organism datasets or for unknwon types - @@ONLY_MULTI_TASKS = [:mytaxa] + @@ONLY_MULTI_TASKS = %i[mytaxa] @@_ONLY_MULTI_TASKS_H = Hash[@@ONLY_MULTI_TASKS.map { |i| [i, true] }] ## diff --git a/lib/miga/dataset/hooks.rb b/lib/miga/dataset/hooks.rb index b3fc40c5..c3e3ae5a 100644 --- a/lib/miga/dataset/hooks.rb +++ b/lib/miga/dataset/hooks.rb @@ -15,6 +15,7 @@ # Supported hooks: # - run_lambda(lambda, args...) # - recalculate_status() +# - check_type() # - clear_run_counts() # - run_cmd(cmd) # Internal hooks: @@ -27,6 +28,7 @@ module MiGA::Dataset::Hooks def default_hooks { on_create: [[:recalculate_status]], + on_save: [[:check_type]], on_activate: [[:clear_run_counts], [:recalculate_status]], on_inactivate: [[:recalculate_status]], on_result_ready: [[:_pull_result_hooks]], @@ -51,6 +53,12 @@ def hook_recalculate_status(_hook_args, _event_args) recalculate_status end + ## + # Ensure that the dataset type exists and is compatible with the project type + def hook_check_type(_hook_args, _event_args) + check_type + end + ## # Run +cmd+ in the command-line with {{variables}}: # dataset, project, project_name, miga, object (if defined for the event) diff --git a/lib/miga/dataset/result/ignore.rb b/lib/miga/dataset/result/ignore.rb index 5b287dbd..9f0ad299 100644 --- a/lib/miga/dataset/result/ignore.rb +++ b/lib/miga/dataset/result/ignore.rb @@ -17,10 +17,14 @@ def ignore_task?(task) # - project: incompatible project # - noref: incompatible dataset, only for reference # - multi: incompatible dataset, only for multi + # - nomarkers: incompatible dataset, only for markers # - nonmulti: incompatible dataset, only for nonmulti # - complete: the task is already complete def ignore_reasons - %i[empty inactive upstream force project noref multi nonmulti complete] + %i[ + empty inactive upstream force project + noref multi nonmulti nomarkers complete + ] end ## @@ -91,9 +95,15 @@ def ignore_nonmulti?(task) ignore_by_type?(task, :nonmulti) end + ## + # Ignore +task+ because it's not a markers dataset + def ignore_nomarkers?(task) + ignore_by_type?(task, :nomarkers) + end + ## # Ignore +task+ by +type+ of dataset, one of: +:noref+, +:multi+, or - # +:nonmulti+ + # +:nonmulti+, +:nomarkers+ def ignore_by_type?(task, type) return false if force_task?(task) @@ -105,6 +115,8 @@ def ignore_by_type?(task, type) [:multi?, self.class.ONLY_MULTI_TASKS] when :nonmulti [:nonmulti?, self.class.ONLY_NONMULTI_TASKS] + when :nomarkers + [:markers?, self.class.EXCLUDE_NOMARKER_TASKS] else raise "Unexpected error, unknown type reason: #{type}" end diff --git a/lib/miga/dataset/type.rb b/lib/miga/dataset/type.rb new file mode 100644 index 00000000..358429b3 --- /dev/null +++ b/lib/miga/dataset/type.rb @@ -0,0 +1,51 @@ +## +# Helper module including specific functions for dataset type +module MiGA::Dataset::Type + ## + # Get the type of dataset as Symbol + def type + metadata[:type] + end + + ## + # Is this dataset known to be multi-organism? + def multi? + self.class.KNOWN_TYPES.dig(type, :multi) + end + + ## + # Is this dataset known to be single-organism? + def nonmulti? + y = self.class.KNOWN_TYPES.dig(type, :multi) + y.nil? ? nil : !y + end + + ## + # Are universal marker genes expected to be found in this dataset? + def markers? + self.class.KNOWN_TYPES.dig(type, :markers) + end + + ## + # Check that the dataset type is defined, known, and compatible with the + # project type and raise an exception if any of these checks fail + # + # If the dataset type is +:empty+, it returns +false+ without raising an + # exception, and true otherwise (and no tests are failed) + def check_type + raise MiGA::Error.new('Undefined dataset type') unless type + return false if type == :empty + + unless self.class.KNOWN_TYPES[type] + raise MiGA::Error.new("Unknown dataset type: #{type}") + end + unless self.class.KNOWN_TYPES[type][:project_types].include? project.type + raise MiGA::Error.new( + "Dataset type (#{type}) incompatible with project (#{project.type})" + ) + end + + true + end + +end diff --git a/lib/miga/json.rb b/lib/miga/json.rb index 65ee306a..3e2735b4 100644 --- a/lib/miga/json.rb +++ b/lib/miga/json.rb @@ -69,5 +69,14 @@ def generate(obj, path = nil) File.open(path, 'w') { |fh| fh.print y } unless path.nil? y end + + ## + # Generates and returns plain JSON to represent +obj+. + # If +path+ is passed, it saves the JSON in that file. + def generate_plain(obj, path = nil) + y = JSON.generate(obj) + File.open(path, 'w') { |fh| fh.print y } unless path.nil? + y + end end end diff --git a/lib/miga/project.rb b/lib/miga/project.rb index 02af6f6b..ad8ac211 100644 --- a/lib/miga/project.rb +++ b/lib/miga/project.rb @@ -98,7 +98,7 @@ def type ## # Is this a clade project? def clade? - type == :clade + %i[clade plasmids].include? type end ## @@ -115,6 +115,12 @@ def multi? # Same as multi? For backward compatibility alias is_multi? multi? + ## + # Does the project support the use of universal markers? + def markers? + @@KNOWN_TYPES[type][:markers] + end + ## # Is this project active? Currently a dummy function, returns # always true. diff --git a/lib/miga/project/base.rb b/lib/miga/project/base.rb index 5e046631..3dfcd5ad 100644 --- a/lib/miga/project/base.rb +++ b/lib/miga/project/base.rb @@ -89,32 +89,36 @@ module MiGA::Project::Base @@KNOWN_TYPES = { mixed: { description: 'Mixed collection of genomes, metagenomes, and viromes', - single: true, multi: true + single: true, multi: true, markers: true }, genomes: { description: 'Collection of genomes', - single: true, multi: false + single: true, multi: false, markers: true }, clade: { description: 'Collection of closely-related genomes (ANI >= 90%)', - single: true, multi: false + single: true, multi: false, markers: true }, metagenomes: { description: 'Collection of metagenomes and/or viromes', - single: false, multi: true + single: false, multi: true, markers: true + }, + plasmids: { + description: 'Collection of plasmids', + single: true, multi: false, markers: false } } ## # Project-wide distance estimations - @@DISTANCE_TASKS = [ - :project_stats, :haai_distances, :aai_distances, :ani_distances, - :clade_finding + @@DISTANCE_TASKS = %i[ + project_stats haai_distances aai_distances ani_distances + clade_finding ] ## # Project-wide tasks for :clade projects - @@INCLADE_TASKS = [:subclades, :ogs] + @@INCLADE_TASKS = %i[subclades ogs] ## # Options supported by projects @@ -131,7 +135,9 @@ module MiGA::Project::Base }, haai_p: { desc: 'Value of aai.rb -p on hAAI', type: String, - default: proc { |project| project.clade? ? 'no' : 'fastaai' }, + default: proc { |project| + project.clade? || !project.markers? ? 'no' : 'fastaai' + }, in: %w[blast+ blast blat diamond fastaai no] }, aai_p: { diff --git a/lib/miga/remote_dataset.rb b/lib/miga/remote_dataset.rb index 0659867d..a58be34a 100644 --- a/lib/miga/remote_dataset.rb +++ b/lib/miga/remote_dataset.rb @@ -193,7 +193,7 @@ def get_gtdb_taxonomy doc = MiGA::Json.parse( MiGA::RemoteDataset.download( - :gtdb, :genome, gtdb_genome, 'taxon-history', nil, [''] + :gtdb, :genome, gtdb_genome, 'taxon-history' ), contents: true ) @@ -237,15 +237,24 @@ def ncbi_asm_json_doc private def get_ncbi_taxid_from_web - return nil if ncbi_asm_json_doc.nil? + # Check first if metadata was pulled from NCBI already + taxid = metadata.dig(:ncbi_dataset, :organism, :tax_id) + return taxid if taxid - ncbi_asm_json_doc['taxid'] + # Otherwise, try to get the Assembly JSON document + ncbi_asm_json_doc&.dig('taxid') end def get_ncbi_taxid_from_ncbi + # Try first from Assembly data return get_ncbi_taxid_from_web if db == :assembly - doc = self.class.download(:ncbi, db, ids, :gb, nil, [], self).split(/\n/) + # Try from previously pulled NCBI data + taxid = metadata.dig(:ncbi_dataset, :organism, :tax_id) + return taxid if taxid + + # Try from GenBank document (obtain it) + doc = self.class.download(:ncbi, db, ids, :gb, nil, {}, self).split(/\n/) ln = doc.grep(%r{^\s+/db_xref="taxon:}).first return nil if ln.nil? @@ -283,14 +292,25 @@ def get_type_status_ncbi_nuccore(metadata) end def get_type_status_ncbi_asm(metadata) - return metadata if ncbi_asm_json_doc.nil? + from_type = nil + + # Try first from previously pulled NCBI metadata + if metadata[:ncbi_dataset] + from_type = metadata.dig( + :ncbi_dataset, :type_material, :type_display_text + ) + else + # Otherwise, check Assembly JSON document + return metadata if ncbi_asm_json_doc.nil? + + metadata[:suspect] = (ncbi_asm_json_doc['exclfromrefseq'] || []) + metadata[:suspect] = nil if metadata[:suspect].empty? + return metadata if metadata[:is_type] # If predefined, as in SeqCode - metadata[:suspect] = (ncbi_asm_json_doc['exclfromrefseq'] || []) - metadata[:suspect] = nil if metadata[:suspect].empty? - return metadata if metadata[:is_type] # If predefined, as in SeqCode + from_type = ncbi_asm_json_doc['from_type'] + from_type = ncbi_asm_json_doc['fromtype'] if from_type.nil? + end - from_type = ncbi_asm_json_doc['from_type'] - from_type = ncbi_asm_json_doc['fromtype'] if from_type.nil? case from_type when nil # Do nothing @@ -316,10 +336,11 @@ def save_assembly_to(project, name, udb) a_ctg = "#{base}.AllContigs.fna" File.open("#{base}.start", 'w') { |ofh| ofh.puts Time.now.to_s } if udb[:format] == :fasta_gz - download "#{l_ctg}.gz" - system "gzip -fd '#{l_ctg}.gz'" + l_ctg_gz = "#{l_ctg}.gz" + download(l_ctg_gz) + self.class.run_cmd(['gzip', '-f', '-d', l_ctg_gz]) else - download l_ctg + download(l_ctg) end File.unlink(a_ctg) if File.exist? a_ctg File.open("#{base}.done", 'w') { |ofh| ofh.puts Time.now.to_s } diff --git a/lib/miga/remote_dataset/base.rb b/lib/miga/remote_dataset/base.rb index 3f4100e9..94918297 100644 --- a/lib/miga/remote_dataset/base.rb +++ b/lib/miga/remote_dataset/base.rb @@ -1,4 +1,3 @@ -require 'open-uri' require 'cgi' class MiGA::RemoteDataset < MiGA::MiGA @@ -10,13 +9,24 @@ def UNIVERSE end end +def uri_safe_join(*parts) + safe = parts.map { |i| i.is_a?(Array) ? i.join(',') : i.to_s } + last = safe.pop + safe.map! { |i| i[-1] == '/' ? i : "#{i}/" } + safe << last + URI::join(*safe) +end + module MiGA::RemoteDataset::Base - @@_EUTILS = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/' - @@_EBI_API = 'https://www.ebi.ac.uk/Tools' - @@_GTDB_API = 'https://api.gtdb.ecogenomic.org' - @@_SEQCODE_API = 'https://disc-genomics.uibk.ac.at/seqcode' - @@_NCBI_API_KEY = lambda { |url| - ENV['NCBI_API_KEY'].nil? ? url : "#{url}&api_key=#{ENV['NCBI_API_KEY']}" + @@_NCBI_DATASETS = 'https://api.ncbi.nlm.nih.gov/datasets/v2alpha/' + @@_EUTILS = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/' + @@_EBI_API = 'https://www.ebi.ac.uk/Tools/' + @@_GTDB_API = 'https://api.gtdb.ecogenomic.org/' + @@_SEQCODE_API = 'https://disc-genomics.uibk.ac.at/seqcode/' + @@_EUTILS_BUILD = lambda { |service, q| + q[:api_key] = ENV['NCBI_API_KEY'] if ENV['NCBI_API_KEY'] + uri_safe_join(@@_EUTILS, "#{service}.fcgi") + .tap { |uri| uri.query = URI.encode_www_form(q) } } ## @@ -25,15 +35,13 @@ module MiGA::RemoteDataset::Base # supported keys as Symbol: # - +:dbs+ => Hash with keys being the database name and the values a Hash of # properties such as +stage+, +format+, +map_to+, and +getter+. - # - +url+ => Pattern of the URL where the data can be obtained, where +%1$s+ - # is the name of the database, +%2$s+ is the IDs, and +%3$s+ is format. - # Additional parameters can be passed to certain functions using the +extra+ - # option. + # - +uri+ => Function producing a parsed URI object, accepting one parameter: + # a Hash of options. # - +method+ => Method used to query the URL. Only +:rest+ and +:net+ are # currently supported. - # - +api_key+ => A lambda function that takes a URL as input and returns the - # URL to be downloaded with an API Key (if available). # - +map_to_universe+ => Universe where results map to. Currently unsupported. + # - +scheme+ => Function returning the scheme used as a String (ftp, http, + # https). Mandatory if method is :net. @@UNIVERSE = { web: { dbs: { @@ -41,13 +49,18 @@ module MiGA::RemoteDataset::Base assembly_gz: { stage: :assembly, format: :fasta_gz }, text: { stage: :metadata, format: :text } }, - url: '%2$s', + uri: lambda { |opts| URI.parse(opts[:ids][0]) }, + scheme: lambda { |opts| opts[:ids][0].split(':', 2)[0] }, method: :net }, ebi: { dbs: { embl: { stage: :assembly, format: :fasta } }, - url: "#{@@_EBI_API}/dbfetch/dbfetch/%1$s/%2$s/%3$s", - method: :rest + uri: lambda do |opts| + uri_safe_join( + @@_EBI_API, 'dbfetch', 'dbfetch', opts[:db], opts[:ids], opts[:format] + ) + end, + method: :get }, gtdb: { dbs: { @@ -56,15 +69,18 @@ module MiGA::RemoteDataset::Base # The 'taxon' namespace actually returns a list of genomes (+format+) taxon: { stage: :metadata, format: :genomes, map_to: [:assembly], - extra: ['sp_reps_only=false'] + extra: { sp_reps_only: false } }, # The 'genome' namespace actually returns the taxonomy (+format+) genome: { stage: :metadata, format: 'taxon-history' } }, - url: "#{@@_GTDB_API}/%1$s/%2$s/%3$s?%4$s", - method: :rest, + uri: lambda do |opts| + uri_safe_join(@@_GTDB_API, opts[:db], opts[:ids], opts[:format]) + .tap { |uri| uri.query = URI.encode_www_form(opts[:extra]) } + end, + method: :get, map_to_universe: :ncbi, - headers: 'accept: application/json' # < TODO not currently supported + headers: lambda { |_opts| { 'Accept' => 'application/json' } } }, seqcode: { dbs: { @@ -74,8 +90,11 @@ module MiGA::RemoteDataset::Base # This is the list of type genomes :'type-genomes' => { stage: :metadata, format: :json } }, - url: "#{@@_SEQCODE_API}/%1$s.json?%4$s", - method: :rest, + uri: lambda do |opts| + uri_safe_join(@@_SEQCODE_API, "#{opts[:db]}.json") + .tap { |uri| uri.query = URI.encode_www_form(opts[:extra]) } + end, + method: :get, map_to_universe: :ncbi }, ncbi: { @@ -84,9 +103,12 @@ module MiGA::RemoteDataset::Base assembly: { stage: :assembly, format: :fasta_gz, getter: :ncbi_asm }, taxonomy: { stage: :metadata, format: :xml } }, - url: "#{@@_EUTILS}efetch.fcgi?db=%1$s&id=%2$s&rettype=%3$s&retmode=text", - method: :rest, - api_key: @@_NCBI_API_KEY + uri: lambda do |opts| + @@_EUTILS_BUILD[:efetch, + db: opts[:db], id: opts[:ids], rettype: opts[:format], retmode: :text + ] + end, + method: :get }, ncbi_map: { dbs: { @@ -95,22 +117,81 @@ module MiGA::RemoteDataset::Base }, biosample: { stage: :metadata, map_to: [:assembly], format: :json } }, - url: "#{@@_EUTILS}elink.fcgi?dbfrom=%1$s&id=%2$s&db=%4$s&retmode=%3$s", - method: :net, - map_to_universe: :ncbi, - api_key: @@_NCBI_API_KEY + uri: lambda do |opts| + @@_EUTILS_BUILD[:elink, { + dbfrom: opts[:db], id: opts[:ids], retmode: opts[:format] + }.merge(opts[:extra])] + end, + method: :get, + map_to_universe: :ncbi }, ncbi_summary: { dbs: { assembly: { stage: :metadata, format: :json } }, - url: "#{@@_EUTILS}esummary.fcgi?db=%1$s&id=%2$s&retmode=%3$s", - method: :rest, - api_key: @@_NCBI_API_KEY + uri: lambda do |opts| + @@_EUTILS_BUILD[:esummary, + db: opts[:db], id: opts[:ids], retmode: opts[:format] + ] + end, + method: :get }, ncbi_search: { - dbs: { assembly: { stage: :metadata, format: :json } }, - url: "#{@@_EUTILS}esearch.fcgi?db=%1$s&term=%2$s&retmode=%3$s", - method: :rest, - api_key: @@_NCBI_API_KEY + dbs: { + assembly: { stage: :metadata, format: :json }, + taxonomy: { stage: :metadata, format: :json } + }, + uri: lambda do |opts| + @@_EUTILS_BUILD[:esearch, + db: opts[:db], term: opts[:ids], retmode: opts[:format] + ] + end, + method: :get + }, + ncbi_datasets_download: { + dbs: { genome: { stage: :assembly, format: :zip } }, + uri: lambda do |opts| + q = { include_annotation_type: 'GENOME_FASTA' } + uri_safe_join( + @@_NCBI_DATASETS, opts[:db], :accession, opts[:ids], :download + ).tap { |uri| uri.query = URI.encode_www_form(q) } + end, + method: :get, + headers: lambda do |opts| + {}.tap do |h| + h['Accept'] = 'application/zip' if opts[:format] == :zip + h['api-key'] = ENV['NCBI_API_KEY'] if ENV['NCBI_API_KEY'] + end + end + }, + ncbi_datasets: { + dbs: { + genome: { + stage: :metadata, format: :json, extra: { action: 'dataset_report' } + } + }, + uri: lambda do |opts| + uri_safe_join(@@_NCBI_DATASETS, opts[:db], opts[:extra][:action]) + end, + payload: lambda do |opts| + query = opts[:ids][0] + q = { + filters: { + assembly_version: 'current', + exclude_paired_reports: true + }.merge(query[:filters] || {}), + page_size: query[:page_size] || 1_000, + returned_content: 'COMPLETE' + } + q[:page_token] = query[:page_token] if query[:page_token] + q[:taxons] = query[:taxons] if query[:taxons] + MiGA::Json.generate_plain(q) + end, + headers: lambda do |opts| + {}.tap do |h| + h['api-key'] = ENV['NCBI_API_KEY'] if ENV['NCBI_API_KEY'] + h['Content-Type'] = 'application/json' if opts[:format] == :json + end + end, + method: :post } } end diff --git a/lib/miga/remote_dataset/download.rb b/lib/miga/remote_dataset/download.rb index 48548025..c635c273 100644 --- a/lib/miga/remote_dataset/download.rb +++ b/lib/miga/remote_dataset/download.rb @@ -6,28 +6,40 @@ class MiGA::RemoteDataset # Class-level class << self ## - # Download data from the +universe+ in the database +db+ with IDs +ids+ and - # in +format+. If passed, it saves the result in +file+. Additional - # parameters specific to the download method can be passed using +extra+. - # Returns String. The +obj+ can also be passed as MiGA::RemoteDataset or - # MiGA::Dataset. - def download(universe, db, ids, format, file = nil, extra = [], obj = nil) - ids = [ids] unless ids.is_a? Array - getter = @@UNIVERSE[universe][:dbs][db][:getter] || :download - method = @@UNIVERSE[universe][:method] - opts = { - universe: universe, - db: db, - ids: ids, - format: format, - file: file, - extra: extra, - obj: obj + # Return hash of options used internally for the getter methods, including + # by +download+. The prepared request is for data from the +universe+ in the + # database +db+ with IDs +ids+ and in +format+. If passed, it saves the + # result in +file+. Additional parameters specific to the download method + # can be passed using +extra+. The +obj+ can also be passed as + # MiGA::RemoteDataset or MiGA::Dataset + def download_opts( + universe, db, ids, format, file = nil, extra = {}, obj = nil) + universe_hash = @@UNIVERSE[universe] + database_hash = universe_hash.dig(:dbs, db) + getter = database_hash[:getter] || :download + action = database_hash[:method] || universe_hash[:method] + + { + universe: universe, db: db, ids: ids.is_a?(Array) ? ids : [ids], + format: format, file: file, obj: obj, + extra: (database_hash[:extra] || {}).merge(extra), + _fun: :"#{getter}_#{action}" } - doc = send("#{getter}_#{method}", opts) + end + + ## + # Returns String. The prequired parameters (+params+) are identical to those + # of +download_opts+ (see for details) + def download(*params) + opts = download_opts(*params) + doc = send(opts[:_fun], opts) + unless opts[:file].nil? ofh = File.open(opts[:file], 'w') - ofh.print doc.force_encoding('UTF-8') + unless opts[:file] =~ /\.([gb]?z|tar|zip|rar)$/i + doc = normalize_encoding(doc) + end + ofh.print doc ofh.close end doc @@ -39,9 +51,9 @@ def download(universe, db, ids, format, file = nil, extra = [], obj = nil) # +obj+ (mandatory): MiGA::RemoteDataset # +ids+ (mandatory): String or Array of String # +file+: String, passed to download - # +extra+: Array, passed to download + # +extra+: Hash, passed to download # +format+: String, passed to download - def ncbi_asm_rest(opts) + def ncbi_asm_get(opts) url_dir = opts[:obj].ncbi_asm_json_doc&.dig('ftppath_genbank') if url_dir.nil? || url_dir.empty? raise MiGA::RemoteDataMissingError.new( @@ -58,8 +70,8 @@ def ncbi_asm_rest(opts) ## # Download data from NCBI GenBank (nuccore) database using the REST method. - # Supported +opts+ (Hash) are the same as #download_rest and #ncbi_asm_rest. - def ncbi_gb_rest(opts) + # Supported +opts+ (Hash) are the same as #download_rest and #ncbi_asm_get. + def ncbi_gb_get(opts) # Simply use defaults, but ensure that the URL can be properly formed o = download_rest(opts.merge(universe: :ncbi, db: :nuccore)) return o unless o.strip.empty? @@ -70,54 +82,83 @@ def ncbi_gb_rest(opts) File.unlink(opts[:file]) if File.exist? opts[:file] opts[:file] = "#{opts[:file]}.gz" end - ncbi_asm_rest(opts) + ncbi_asm_get(opts) end ## - # Download data using the REST method. Supported +opts+ (Hash) include: + # Download data using the GET method. Supported +opts+ (Hash) include: # +universe+ (mandatory): Symbol - # +db+ (mandatory): Symbol - # +ids+ (mandatory): Array of String + # +db+: Symbol + # +ids+: Array of String # +format+: String - # +extra+: Array - def download_rest(opts) + # +extra+: Hash + def download_get(opts) u = @@UNIVERSE[opts[:universe]] - url = sprintf( - u[:url], opts[:db], opts[:ids].join(','), opts[:format], *opts[:extra] - ) - url = u[:api_key][url] unless u[:api_key].nil? - download_url url + download_uri(u[:uri][opts], u[:headers] ? u[:headers][opts] : {}) + end + + ## + # Download data using the POST method. Supported +opts+ (Hash) include: + # +universe+ (mandatory): Symbol + # +db+: Symbol + # +ids+: Array of String + # +format+: String + # +extra+: Hash + def download_post(opts) + u = @@UNIVERSE[opts[:universe]] + uri = u[:uri][opts] + payload = u[:payload] ? u[:payload][opts] : '' + headers = u[:headers] ? u[:headers][opts] : {} + net_method(:post, uri, payload, headers) + end + + ## + # Download data using the FTP protocol. Supported +opts+ (Hash) include: + # +universe+ (mandatory): Symbol + # +db+: Symbol + # +ids+: Array of String + # +format+: String + # +extra+: Hash + def download_ftp(opts) + u = @@UNIVERSE[opts[:universe]] + net_method(:ftp, u[:uri][opts]) + end + + ## + # Redirects to +download_get+ or +download_ftp+, depending on the URI's + # protocol + def download_net(opts) + u = @@UNIVERSE[opts[:universe]] + if u[:scheme][opts] == 'ftp' + download_ftp(opts) + else + download_get(opts) + end end ## # Alias of download_rest - alias download_net download_rest + alias download_rest download_get + + ## + # Download the given +URI+ and return the result regardless of response + # code. Attempts download up to three times before raising Net::ReadTimeout. + def download_uri(uri, headers = {}) + net_method(:get, uri, headers) + end ## # Download the given +url+ and return the result regardless of response # code. Attempts download up to three times before raising Net::ReadTimeout. - def download_url(url) - doc = '' - @timeout_try = 0 - begin - DEBUG 'GET: ' + url - URI.parse(url).open(read_timeout: 600) { |f| doc = f.read } - rescue => e - @timeout_try += 1 - raise e if @timeout_try >= 3 - - sleep 5 # <- For: 429 Too Many Requests - DEBUG "RETRYING after: #{e}" - retry - end - doc + def download_url(url, headers = {}) + download_uri(URI.parse(url), headers) end ## # Looks for the entry +id+ in +dbfrom+, and returns the linked # identifier in +db+ (or nil). def ncbi_map(id, dbfrom, db) - doc = download(:ncbi_map, dbfrom, id, :json, nil, [db]) + doc = download(:ncbi_map, dbfrom, id, :json, nil, db: db) return if doc.empty? tree = MiGA::Json.parse(doc, contents: true) @@ -134,8 +175,34 @@ module MiGA::RemoteDataset::Download ## # Download data into +file+ def download(file) - format = self.class.UNIVERSE[universe][:dbs][db][:format] - # MiGA::MiGA.DEBUG "download: #{universe}, #{db}, #{ids}, #{format}" - self.class.download(universe, db, ids, format, file, [], self) + self.class.download(*download_params(file)) + end + + def universe_hash + self.class.UNIVERSE[universe] + end + + def database_hash + universe_hash.dig(:dbs, db) + end + + def download_params(file = nil) + [universe, db, ids, database_hash[:format], file, {}, self] + end + + def download_opts(file = nil) + self.class.download_opts(*download_params(file)) + end + + def download_uri + universe_hash[:uri][download_opts] + end + + def download_headers + universe_hash[:headers][download_opts] + end + + def download_payload + universe_hash[:payload][download_opts] end end diff --git a/lib/miga/result.rb b/lib/miga/result.rb index f661095e..2d9cd454 100644 --- a/lib/miga/result.rb +++ b/lib/miga/result.rb @@ -3,6 +3,7 @@ require 'miga/result/dates' require 'miga/result/source' require 'miga/result/stats' +require 'miga/result/versions' ## # The result from a task run. It can be project-wide or dataset-specific. @@ -10,6 +11,7 @@ class MiGA::Result < MiGA::MiGA include MiGA::Result::Dates include MiGA::Result::Source include MiGA::Result::Stats + include MiGA::Result::Versions # Class-level class << self @@ -151,7 +153,11 @@ def add_files(files) ## # Initialize and #save empty result def create - @data = { created: Time.now.to_s, stats: {}, files: {} } + @data = { + created: Time.now.to_s, + stats: {}, files: {}, + versions: { 'MiGA' => MiGA::VERSION.join('.') } + } save end diff --git a/lib/miga/result/versions.rb b/lib/miga/result/versions.rb new file mode 100644 index 00000000..ea8aeb06 --- /dev/null +++ b/lib/miga/result/versions.rb @@ -0,0 +1,23 @@ +require 'miga/result/base' + +## +# Helper module including functions for results to handle software versions +module MiGA::Result::Versions + ## + # Return the versions hash + def versions + self[:versions] + end + + ## + # Add version information for the Software used by this result + def add_versions(versions) + versions.each { |k, v| self[:versions][k.to_sym] = v } + end + + ## + # Get list of software and their versions as raw text (Markdown) + def versions_md + versions.map { |k, v| "- #{k}: #{v}" }.join("\n") + end +end diff --git a/lib/miga/taxonomy/base.rb b/lib/miga/taxonomy/base.rb index d524132a..fd5806e8 100644 --- a/lib/miga/taxonomy/base.rb +++ b/lib/miga/taxonomy/base.rb @@ -6,14 +6,15 @@ class << self ## # Returns cannonical rank (Symbol) for the +rank+ String def normalize_rank(rank) + return unless rank return rank.to_sym if @@_KNOWN_RANKS_H[rank.to_sym] rank = rank.to_s.downcase - return nil if rank == 'no rank' + return if rank == 'no rank' rank = @@RANK_SYNONYMS[rank] unless @@RANK_SYNONYMS[rank].nil? rank = rank.to_sym - return nil unless @@_KNOWN_RANKS_H[rank] + return unless @@_KNOWN_RANKS_H[rank] rank end diff --git a/lib/miga/version.rb b/lib/miga/version.rb index 9a54a8cf..60a1d903 100644 --- a/lib/miga/version.rb +++ b/lib/miga/version.rb @@ -12,7 +12,7 @@ module MiGA # - String indicating release status: # - rc* release candidate, not released as gem # - [0-9]+ stable release, released as gem - VERSION = [1.3, 8, 3].freeze + VERSION = [1.3, 9, 0].freeze ## # Nickname for the current major.minor version. @@ -20,7 +20,7 @@ module MiGA ## # Date of the current gem relese. - VERSION_DATE = Date.new(2023, 10, 10) + VERSION_DATE = Date.new(2024, 1, 22) ## # References of MiGA diff --git a/scripts/assembly.bash b/scripts/assembly.bash index ffe6b095..ad66ae66 100755 --- a/scripts/assembly.bash +++ b/scripts/assembly.bash @@ -10,6 +10,7 @@ cd "$PROJECT/data/05.assembly" miga date > "$DATASET.start" # Interpose (if needed) +interpose=no TF="../04.trimmed_fasta" b=$DATASET if [[ -s "$TF/${b}.2.fasta" || -s "$TF/${b}.2.fasta.gz" ]] ; then @@ -22,6 +23,7 @@ if [[ -s "$TF/${b}.2.fasta" || -s "$TF/${b}.2.fasta.gz" ]] ; then gzip -cd "$TF/${b}.${s}.fasta.gz" > "${b}.${s}.tmp" fi done + interpose=yes FastA.interpose.pl "$cr" "$b".[12].tmp rm "$b".[12].tmp miga add_result -P "$PROJECT" -D "$DATASET" -r trimmed_fasta -f @@ -62,5 +64,17 @@ FastA.length.pl "$DATASET.AllContigs.fna" | awk '$2>=1000{print $1}' \ # Finalize miga date > "$DATASET.done" -miga add_result -P "$PROJECT" -D "$DATASET" -r "$SCRIPT" -f +cat < MiGA +$(miga --version) +$( + if [[ "$interpose" == "yes" ]] ; then + echo "=> Enveomics Collection: FastA.interpose.pl" + echo "version unknown" + fi +) +=> IDBA-UD +version unknown +VERSIONS diff --git a/scripts/cds.bash b/scripts/cds.bash index 5db4ba39..35ca5b6d 100755 --- a/scripts/cds.bash +++ b/scripts/cds.bash @@ -19,7 +19,7 @@ fi # Run Prodigal TYPE=$(miga ls -P "$PROJECT" -D "$DATASET" -m type | cut -f 2) case "$TYPE" in - metagenome|virome) + metagenome|virome|plasmid) prodigal -a "${DATASET}.faa" -d "${DATASET}.fna" -o "${DATASET}.gff3" \ -f gff -q -i "../05.assembly/${DATASET}.LargeContigs.fna" -p meta ;; @@ -68,6 +68,12 @@ for ext in gff3 faa fna ; do done # Finalize -miga date > "$DATASET.done" -miga add_result -P "$PROJECT" -D "$DATASET" -r "$SCRIPT" -f +miga date > "${DATASET}.done" +cat < MiGA +$(miga --version) +=> Prodigal +$(prodigal -v 2>&1 | grep . | perl -pe 's/^Prodigal //') +VERSIONS diff --git a/scripts/distances.bash b/scripts/distances.bash index e665e62e..7b3feb77 100755 --- a/scripts/distances.bash +++ b/scripts/distances.bash @@ -10,13 +10,111 @@ cd "$PROJECT/data/09.distances" miga date > "$DATASET.start" # Check quality -miga stats -P "$PROJECT" -D "$DATASET" -r essential_genes --compute-and-save -inactive=$(miga ls -P "$PROJECT" -D "$DATASET" -m inactive | cut -f 2) -[[ "$inactive" == "true" ]] && exit +MARKERS=$(miga ls -P "$PROJECT" -D "$DATASET" --markers \ + | wc -l | awk '{print $1}') +if [[ "$MARKERS" -eq "1" ]] ; then + miga stats -P "$PROJECT" -D "$DATASET" -r essential_genes --compute-and-save + inactive=$(miga ls -P "$PROJECT" -D "$DATASET" -m inactive | cut -f 2) + [[ "$inactive" == "true" ]] && exit +fi # Run distances ruby -I "$MIGA/lib" "$MIGA/utils/distances.rb" "$PROJECT" "$DATASET" # Finalize -miga date > "$DATASET.done" -miga add_result -P "$PROJECT" -D "$DATASET" -r "$SCRIPT" -f +fastaai=no +aai=no +ani=no +blast=no +blat=no +diamond=no +fastani=no +case $(miga option -P "$PROJECT" -k haai_p) in + fastaai) + fastaai=yes + ;; + diamond) + diamond=yes + aai=yes + ;; + blast) + blast=yes + aai=yes + ;; +esac + +case $(miga option -P "$PROJECT" -k aai_p) in + diamond) + diamond=yes + aai=yes + ;; + blast) + blast=yes + aai=yes + ;; +esac + +case $(miga option -P "$PROJECT" -k ani_p) in + blast) + blast=yes + ani=yes + ;; + blat) + blat=yes + ani=yes + ;; + fastani) + fastani=yes + ;; +esac + + +miga date > "${DATASET}.done" +cat < MiGA +$(miga --version) +$( + if [[ "$fastaai" == "yes" ]] ; then + echo "=> FastAAI" + fastaai version 2>&1 | perl -pe 's/.*=//' + fi +) +$( + if [[ "$fastani" == "yes" ]] ; then + echo "=> FastANI" + fastANI --version 2>&1 | grep . | perl -pe 's/^version //' + fi +) +$( + if [[ "$aai" == "yes" ]] ; then + echo "=> Enveomics Collection: aai.rb" + aai.rb --version 2>&1 | perl -pe 's/.*: //' + fi +) +$( + if [[ "$ani" == "yes" ]] ; then + echo "=> Enveomics Collection: ani.rb" + ani.rb --version 2>&1 | perl -pe 's/.*: //' + fi +) +$( + if [[ "$blast" == "yes" ]] ; then + echo "=> NCBI BLAST+" + blastp -version 2>&1 | tail -n 1 | perl -pe 's/.*: blast //' + fi +) +$( + if [[ "$blat" == "yes" ]] ; then + echo "=> BLAT" + blat 2>&1 | head -n 1 | perl -pe 's/.* v\. //' | perl -pe 's/ fast .*//' + fi +) +$( + if [[ "$diamond" == "yes" ]] ; then + echo "=> Diamond" + diamond --version 2>&1 | perl -pe 's/^diamond version //' + fi +) +VERSIONS + diff --git a/scripts/essential_genes.bash b/scripts/essential_genes.bash index 6e36f9e2..a5bb97ea 100755 --- a/scripts/essential_genes.bash +++ b/scripts/essential_genes.bash @@ -70,4 +70,17 @@ fi # Finalize miga date > "${DATASET}.done" -miga add_result -P "$PROJECT" -D "$DATASET" -r "$SCRIPT" -f +cat < MiGA +$(miga --version) +=> Enveomics Collection: HMM.essential.rb +$(HMM.essential.rb --version 2>&1 | perl -pe 's/.*: //') +$( + if [[ "$NOMULTI" -eq "1" ]] ; then + echo "=> FastAAI" + fastaai version 2>&1 | perl -pe 's/.*=//' + fi +) +VERSIONS + diff --git a/scripts/mytaxa.bash b/scripts/mytaxa.bash index d6ff4260..10c10485 100755 --- a/scripts/mytaxa.bash +++ b/scripts/mytaxa.bash @@ -15,7 +15,7 @@ if [[ "$MIGA_MYTAXA" == "no" ]] ; then > "$DATASET.nomytaxa.txt" else # Check type of dataset - MULTI=$(miga list_datasets -P "$PROJECT" -D "$DATASET" --multi \ + MULTI=$(miga ls -P "$PROJECT" -D "$DATASET" --multi \ | wc -l | awk '{print $1}') if [[ "$MULTI" -eq "1" ]] ; then # Check requirements @@ -98,5 +98,20 @@ else fi # Finalize -miga date > "$DATASET.done" -miga add_result -P "$PROJECT" -D "$DATASET" -r "$SCRIPT" -f +miga date > "${DATASET}.done" +cat < MiGA +$(miga --version) +$( + if [[ "$MIGA_MYTAXA" != "no" && "$MULTI" -eq "1" ]] ; then + echo "=> MyTaxa" + MyTaxa | grep Version: | perl -pe 's/.*: //' + echo "=> Diamond" + diamond --version 2>&1 | perl -pe 's/^diamond version //' + echo "=> Krona" + ktImportText | head -n 2 | tail -n 1 | awk '{ print $3 }' + fi +) +VERSIONS + diff --git a/scripts/mytaxa_scan.bash b/scripts/mytaxa_scan.bash index 4a64db27..41dcee0c 100755 --- a/scripts/mytaxa_scan.bash +++ b/scripts/mytaxa_scan.bash @@ -14,7 +14,7 @@ if [[ "$MIGA_MYTAXA" == "no" ]] ; then > "$DATASET.nomytaxa.txt" else # Check type of dataset - NOMULTI=$(miga list_datasets -P "$PROJECT" -D "$DATASET" --no-multi \ + NOMULTI=$(miga ls -P "$PROJECT" -D "$DATASET" --no-multi \ | wc -l | awk '{print $1}') if [[ "$NOMULTI" -eq "1" ]] ; then # Check requirements @@ -97,5 +97,18 @@ else fi # Finalize -miga date > "$DATASET.done" -miga add_result -P "$PROJECT" -D "$DATASET" -r "$SCRIPT" -f +miga date > "${DATASET}.done" +cat < MiGA +$(miga --version) +$( + if [[ "$MIGA_MYTAXA" != "no" && "$NOMULTI" -eq "1" ]] ; then + echo "=> MyTaxa" + MyTaxa | grep Version: | perl -pe 's/.*: //' + echo "=> Diamond" + diamond --version 2>&1 | perl -pe 's/^diamond version //' + fi +) +VERSIONS + diff --git a/scripts/read_quality.bash b/scripts/read_quality.bash index 2adfc225..7021d5f8 100755 --- a/scripts/read_quality.bash +++ b/scripts/read_quality.bash @@ -19,6 +19,10 @@ for s in 1 2 ; do done # Finalize -miga date > "$DATASET.done" -miga add_result -P "$PROJECT" -D "$DATASET" -r "$SCRIPT" -f +miga date > "${DATASET}.done" +cat < MiGA +$(miga --version) +VERSIONS diff --git a/scripts/ssu.bash b/scripts/ssu.bash index 6ae2abbd..f130b0ae 100755 --- a/scripts/ssu.bash +++ b/scripts/ssu.bash @@ -65,4 +65,22 @@ fi # Finalize miga date > "${DATASET}.done" -miga add_result -P "$PROJECT" -D "$DATASET" -r "$SCRIPT" -f +cat < MiGA +$(miga --version) +$( + if [[ -s $fa ]] ; then + echo "=> barrnap" + barrnap --version 2>&1 | perl -pe 's/^barrnap //' + echo "=> bedtools" + bedtools --version 2>&1 | perl -pe 's/^bedtools //' + echo "=> Enveomics Collection" + echo "version unknown" + echo "=> RDP Naive Bayes Classifier" + gzip -cd "${DATASET}.rdp.tsv.gz" | tail -n 1 | perl -pe 's/.*: //' + echo "=> tRNAscan-SE" + tRNAscan-SE -h 2>&1 | head -n 2 | tail -n 1 | perl -pe 's/^tRNAscan-SE //' + fi +) +VERSIONS diff --git a/scripts/stats.bash b/scripts/stats.bash index 8d5a2c70..b3c37ace 100755 --- a/scripts/stats.bash +++ b/scripts/stats.bash @@ -12,11 +12,17 @@ cd "$DIR" miga date > "${DATASET}.start" # Calculate statistics -for i in raw_reads trimmed_fasta assembly cds essential_genes distances taxonomy ssu ; do +for i in raw_reads trimmed_fasta assembly \ + cds essential_genes distances taxonomy ssu ; do echo "# $i" miga stats --compute-and-save --ignore-empty -P "$PROJECT" -D "$DATASET" -r $i done # Finalize -miga date > "$DATASET.done" -miga add_result -P "$PROJECT" -D "$DATASET" -r "$SCRIPT" -f +miga date > "${DATASET}.done" +cat < MiGA +$(miga --version) +VERSIONS + diff --git a/scripts/taxonomy.bash b/scripts/taxonomy.bash index 25bb8cf1..637a882e 100755 --- a/scripts/taxonomy.bash +++ b/scripts/taxonomy.bash @@ -16,5 +16,101 @@ ruby -I "$MIGA/lib" \ "$MIGA/utils/distances.rb" "$PROJECT" "$DATASET" run_taxonomy=1 # Finalize -miga date > "$DATASET.done" -miga add_result -P "$PROJECT" -D "$DATASET" -r "$SCRIPT" -f +fastaai=no +aai=no +ani=no +blast=no +blat=no +diamond=no +fastani=no +REF_PROJECT=$(miga option -P "$PROJECT" -k ref_project) +if [[ -S "$REF_PROJECT" ]] ; then + case $(miga option -P "$REF_PROJECT" -k haai_p) in + fastaai) + fastaai=yes + ;; + diamond) + diamond=yes + aai=yes + ;; + blast) + blast=yes + aai=yes + ;; + esac + + case $(miga option -P "$REF_PROJECT" -k aai_p) in + diamond) + diamond=yes + aai=yes + ;; + blast) + blast=yes + aai=yes + ;; + esac + + case $(miga option -P "$REF_PROJECT" -k ani_p) in + blast) + blast=yes + ani=yes + ;; + blat) + blat=yes + ani=yes + ;; + fastani) + fastani=yes + ;; + esac +fi + +miga date > "${DATASET}.done" +cat < MiGA +$(miga --version) +$( + if [[ "$fastaai" == "yes" ]] ; then + echo "=> FastAAI" + fastaai version 2>&1 | perl -pe 's/.*=//' + fi +) +$( + if [[ "$fastani" == "yes" ]] ; then + echo "=> FastANI" + fastANI --version 2>&1 | grep . | perl -pe 's/^version //' + fi +) +$( + if [[ "$aai" == "yes" ]] ; then + echo "=> Enveomics Collection: aai.rb" + aai.rb --version 2>&1 | perl -pe 's/.*: //' + fi +) +$( + if [[ "$ani" == "yes" ]] ; then + echo "=> Enveomics Collection: ani.rb" + ani.rb --version 2>&1 | perl -pe 's/.*: //' + fi +) +$( + if [[ "$blast" == "yes" ]] ; then + echo "=> NCBI BLAST+" + blastp -version 2>&1 | tail -n 1 | perl -pe 's/.*: blast //' + fi +) +$( + if [[ "$blat" == "yes" ]] ; then + echo "=> BLAT" + blat 2>&1 | head -n 1 | perl -pe 's/.* v\. //' | perl -pe 's/ fast .*//' + fi +) +$( + if [[ "$diamond" == "yes" ]] ; then + echo "=> Diamond" + diamond --version 2>&1 | perl -pe 's/^diamond version //' + fi +) +VERSIONS + diff --git a/scripts/trimmed_fasta.bash b/scripts/trimmed_fasta.bash index 978b1da5..4824b6b8 100755 --- a/scripts/trimmed_fasta.bash +++ b/scripts/trimmed_fasta.bash @@ -32,6 +32,14 @@ for x in 1.fasta 2.fasta SingleReads.fa CoupledReads.fa ; do done # Finalize -miga date > "$DATASET.done" -miga add_result -P "$PROJECT" -D "$DATASET" -r "$SCRIPT" -f +miga date > "${DATASET}.done" +cat < MiGA +$(miga --version) +=> Enveomics Collection: FastQ.maskQual.rb +$(FastQ.maskQual.rb --version | perl -pe 's/.* //') +=> Enveomics Collection: FastA.interpose.pl +version unknown +VERSIONS diff --git a/scripts/trimmed_reads.bash b/scripts/trimmed_reads.bash index 30fb508f..ef901524 100755 --- a/scripts/trimmed_reads.bash +++ b/scripts/trimmed_reads.bash @@ -36,15 +36,19 @@ if [[ -s "$b.2.fastq.gz" ]] ; then $CMD -1 "$b.1.fastq.gz" -2 "$b.2.fastq.gz" for s in 1 2 ; do mv "$b/${s}.post_trim_${b}.${s}.fq.gz" "${b}.${s}.clipped.fastq.gz" - mv "$b/${s}.pre_trim_QC_${b}.${s}.html" "../03.read_quality/${b}.pre.${s}.html" - mv "$b/${s}.post_trim_QC_${b}.${s}.html" "../03.read_quality/${b}.post.${s}.html" + mv "$b/${s}.pre_trim_QC_${b}.${s}.html" \ + "../03.read_quality/${b}.pre.${s}.html" + mv "$b/${s}.post_trim_QC_${b}.${s}.html" \ + "../03.read_quality/${b}.post.${s}.html" done else # Unpaired $CMD -u "$b.1.fastq.gz" mv "$b/unpaired.post_trim_${b}.1.fq.gz" "${b}.1.clipped.fastq.gz" - mv "$b/unpaired.pre_trim_QC_${b}.1.html" "../03.read_quality/${b}.pre.1.html" - mv "$b/unpaired.post_trim_QC_${b}.1.html" "../03.read_quality/${b}.post.1.html" + mv "$b/unpaired.pre_trim_QC_${b}.1.html" \ + "../03.read_quality/${b}.pre.1.html" + mv "$b/unpaired.post_trim_QC_${b}.1.html" \ + "../03.read_quality/${b}.post.1.html" fi mv "$b/Subsample_Adapter_Detection.stats.txt" \ "../03.read_quality/$b.adapters.txt" @@ -54,6 +58,22 @@ rm -r "$b" rm -f "$b".[12].fastq.gz # Finalize -miga date > "$DATASET.done" -miga add_result -P "$PROJECT" -D "$DATASET" -r "$SCRIPT" -f +miga date > "${DATASET}.done" +cat < MiGA +$(miga --version) +=> Enveomics Collection: FastQ.tag.rb +$(FastQ.tag.rb --version | perl -pe 's/.* //') +=> Multitrim +version unknown +=> FaQCs +$(FaQCs --version 2>&1 | perl -pe 's/.*: //') +=> Seqtk +$(seqtk 2>&1 | grep Version | perl -pe 's/.*: //') +=> Fastp +$(fastp --version 2>&1 | perl -pe 's/^fastp //') +=> Falco +$(falco -V 2>&1 | tee) +VERSIONS diff --git a/test/dataset_test.rb b/test/dataset_test.rb index 2a4baa91..52b7ce50 100644 --- a/test/dataset_test.rb +++ b/test/dataset_test.rb @@ -40,7 +40,7 @@ def test_save d2.save assert_not_predicate(d2, :multi?) assert_not_predicate(d2, :nonmulti?) - assert_nil(d2.metadata[:type]) + assert_equal(:empty, d2.metadata[:type]) d2.metadata[:type] = :metagenome d2.save assert_equal(:metagenome, d2.metadata[:type]) @@ -89,25 +89,40 @@ def test_preprocessing assert_equal(:trimmed_reads, d2.first_preprocessing(true)) assert_equal(:read_quality, d2.next_preprocessing(true)) assert { !d2.done_preprocessing?(true) } - # Ref and undeclared multi + + # Ref and undeclared type (empty) assert { d2.ignore_task?(:mytaxa) } assert { d2.ignore_task?(:mytaxa_scan) } assert { d2.ignore_task?(:distances) } + assert { d2.ignore_task?(:essential_genes) } + # Ref and multi d2.metadata[:type] = :metagenome assert { !d2.ignore_task?(:mytaxa) } assert { d2.ignore_task?(:mytaxa_scan) } assert { d2.ignore_task?(:distances) } + assert { !d2.ignore_task?(:essential_genes) } + # Ref and nonmulti d2.metadata[:type] = :genome assert { d2.ignore_task?(:mytaxa) } assert { !d2.ignore_task?(:mytaxa_scan) } assert { !d2.ignore_task?(:distances) } + assert { !d2.ignore_task?(:essential_genes) } + # Qry and nonmulti d2.metadata[:ref] = false assert { d2.ignore_task?(:mytaxa) } assert { d2.ignore_task?(:mytaxa_scan) } assert { !d2.ignore_task?(:distances) } + assert { !d2.ignore_task?(:essential_genes) } + + # Qry and plasmid + d2.metadata[:type] = :plasmid + assert { d2.ignore_task?(:mytaxa) } + assert { d2.ignore_task?(:mytaxa_scan) } + assert { !d2.ignore_task?(:distances) } + assert { d2.ignore_task?(:essential_genes) } end def test_profile_advance diff --git a/test/hook_test.rb b/test/hook_test.rb index c7227946..e743ccd4 100644 --- a/test/hook_test.rb +++ b/test/hook_test.rb @@ -9,9 +9,10 @@ def setup end def test_add_hook - assert_nil(dataset.hooks[:on_save]) - dataset.add_hook(:on_save, :run_lambda, Proc.new { $counter += 1 }) + assert_nil(dataset.hooks[:on_remove]) assert_equal(1, dataset.hooks[:on_save].size) + dataset.add_hook(:on_save, :run_lambda, Proc.new { $counter += 1 }) + assert_equal(2, dataset.hooks[:on_save].size) $counter = 1 dataset.save assert_equal(2, $counter) diff --git a/test/net_test.rb b/test/net_test.rb index 3ecdd74d..ef80008c 100644 --- a/test/net_test.rb +++ b/test/net_test.rb @@ -15,20 +15,36 @@ def test_remote_connection declare_remote_access m = MiGA::MiGA assert_raise { m.remote_connection(:bad_descriptor) } - assert_raise { m.remote_connection('http://microbial-genomes.org/') } + assert_raise { m.remote_connection('ssh://microbial-genomes.org/') } c = m.remote_connection(:miga_db) assert_equal(Net::FTP, c.class) c.close end - def test_download_file_ftp + def test_download_file_http declare_remote_access m = MiGA::MiGA + #o = m.http_request(:get, 'http://uibk.microbial-genomes.org/robots.txt') + o = m.http_request(:get, 'http://disc-genomics.uibk.ac.at/miga/robots.txt') + o = o.split(/\n/) + assert_equal(6, o.count) + assert_equal('#', o[1]) + assert_equal('User-agent: *', o[2]) + end + + def test_download_file_ftp + declare_remote_access f = tmpfile('t/test.txt') d = File.dirname(f) assert(!Dir.exist?(d)) - m.download_file_ftp(:miga_online_ftp, 'test.txt', f) - assert(Dir.exist?(d)) - assert_equal('miga', File.read(f).chomp) + # TODO + # Bring back when I can connect to the Gatech's FTP + ### m = MiGA::MiGA + ### m.download_file_ftp(:miga_online_ftp, 'api_test.txt', f) + ### assert(Dir.exist?(d)) + ### assert_equal('miga', File.read(f).chomp) + ### File.unlink(f) + ### m.download_file_ftp(:miga_db, '../api_test.txt', f) + ### assert_equal('miga', File.read(f).chomp) end end diff --git a/test/project_test.rb b/test/project_test.rb index 2c1f9fc6..7ff1857e 100644 --- a/test/project_test.rb +++ b/test/project_test.rb @@ -8,6 +8,12 @@ def setup initialize_miga_home end + def test_class_variables + assert(MiGA::Project.INCLADE_TASKS.is_a? Array) + assert(MiGA::Project.DISTANCE_TASKS.is_a? Array) + assert(MiGA::Project.KNOWN_TYPES.is_a? Hash) + end + def create_result_files(project, res, exts) d = MiGA::Project.RESULT_DIRS[res] (['.done'] + exts).each do |x| @@ -155,4 +161,11 @@ def test_force_result date3 = p1.add_result(:ogs, true, force: true)[:created] assert_not_equal(date1, date3) end + + def test_options + assert_equal('fastaai', project.option(:haai_p)) + assert_equal(false, project.option(:aai_save_rbm)) + project.metadata[:type] = 'clade' + assert_equal(true, project.option(:aai_save_rbm)) + end end diff --git a/test/remote_dataset_test.rb b/test/remote_dataset_test.rb index cbcf797f..54ec88eb 100644 --- a/test/remote_dataset_test.rb +++ b/test/remote_dataset_test.rb @@ -7,6 +7,7 @@ class RemoteDatasetTest < Test::Unit::TestCase def setup initialize_miga_home + ENV.delete('NCBI_API_KEY') end def test_class_universe @@ -19,7 +20,7 @@ def test_bad_remote_dataset assert_raise { MiGA::RemoteDataset.new('ids', :google, :ebi) } end - def test_rest + def test_get hiv2 = 'M30502.1' { embl: :ebi, nuccore: :ncbi }.each do |db, universe| rd = MiGA::RemoteDataset.new(hiv2, db, universe) @@ -47,7 +48,8 @@ def test_rest end def test_net_ftp - cjac = 'ftp://ftp.ebi.ac.uk/pub/databases/ena/tsa/public/gap/GAPJ01.fasta.gz' + cjac = 'ftp://ftp.ebi.ac.uk/pub/databases/ena/tsa/' \ + 'public/gap/GAPJ01.fasta.gz' n = 'Cjac_L14' rd = MiGA::RemoteDataset.new(cjac, :assembly_gz, :web) assert_equal([cjac], rd.ids) @@ -86,26 +88,56 @@ def test_update_metadata def test_type_status_asm declare_remote_access rd = MiGA::RemoteDataset.new('GCF_000018105.1', :assembly, :ncbi) - assert { rd.get_metadata[:is_type] } + md = rd.get_metadata + assert(md[:is_type]) end def test_nontype_status_asm declare_remote_access rd = MiGA::RemoteDataset.new('GCA_004684205.1', :assembly, :ncbi) - assert { !rd.get_metadata[:is_type] } + md = rd.get_metadata + assert(!md[:is_type]) end def test_type_status_nuccore declare_remote_access rd = MiGA::RemoteDataset.new('NC_019748.1', :nuccore, :ncbi) - assert { rd.get_metadata[:is_type] } + md = rd.get_metadata + assert(md[:is_type]) end def test_ref_type_status declare_remote_access rd = MiGA::RemoteDataset.new('GCA_003144295.1', :assembly, :ncbi) - assert { !rd.get_metadata[:is_type] } - assert { rd.get_metadata[:is_ref_type] } + md = rd.get_metadata + assert(!md[:is_type]) + assert(md[:is_ref_type]) + end + + def test_gtdb_taxonomy + declare_remote_access + rd = MiGA::RemoteDataset.new('GCA_018200315.1', :assembly, :gtdb) + md = rd.get_metadata + assert(!md[:is_type]) + assert_not_nil(md[:gtdb_release]) + assert(md[:tax].is_a? MiGA::Taxonomy) + assert_equal('GCA_018200315.1', md[:gtdb_assembly]) + assert_equal('gtdb', md[:tax][:ns]) + assert_equal('Bacteroidia', md[:tax][:c]) + end + + def test_gtdb_alt_taxonomy + declare_remote_access + rd = MiGA::RemoteDataset.new('GCA_018200315.1', :assembly, :gtdb) + rd.metadata[:get_ncbi_taxonomy] = true + md = rd.get_metadata + assert(md[:tax].is_a? MiGA::Taxonomy) + assert_equal('ncbi', md[:tax][:ns]) + assert_equal('Flavobacteriia', md[:tax][:c]) + assert(md[:tax].alternative(1).is_a? MiGA::Taxonomy) + assert(md[:tax].alternative(:gtdb).is_a? MiGA::Taxonomy) + assert_equal('gtdb', md[:tax].alternative(1)[:ns]) + assert_equal('gtdb', md[:tax].alternative(:gtdb)[:ns]) end def test_missing_data @@ -114,6 +146,73 @@ def test_missing_data assert_raise(MiGA::RemoteDataMissingError) { rd.save_to(project, 'bad') } end + def test_gtdb_request + # No remote access needed + rd = MiGA::RemoteDataset.new('g__Macondimonas', :taxon, :gtdb) + u = rd.download_uri + h = rd.download_headers + + assert(u.is_a? URI) + assert_equal('https', u.scheme) + assert_equal('genomes', File.basename(u.path)) + + assert(h.is_a? Hash) + assert_equal(1, h.size) + assert_equal('application/json', h['Accept']) + end + + def test_ncbi_datasets_download_request + # No remote access needed + rd = MiGA::RemoteDataset.new( + 'GCF_004684205.1', :genome, :ncbi_datasets_download + ) + u = rd.download_uri + h = rd.download_headers + + assert(u.is_a? URI) + assert_equal('https', u.scheme) + assert_equal('download', File.basename(u.path)) + + assert(h.is_a? Hash) + assert_equal(1, h.size) + assert_equal('application/zip', h['Accept']) + + ENV['NCBI_API_KEY'] = 'Not-a-real-key' + h = rd.download_headers + ENV.delete('NCBI_API_KEY') + assert_equal(2, h.size) + assert_equal('Not-a-real-key', h['api-key']) + end + + def test_seqcode_request + # No remote access needed + rd = MiGA::RemoteDataset.new(nil, 'type-genomes', :seqcode) + u = rd.download_uri + + assert(u.is_a? URI) + assert_equal('https', u.scheme) + assert_equal('type-genomes.json', File.basename(u.path)) + end + + def test_ncbi_datasets_request + rd = MiGA::RemoteDataset.new({ taxons: 'Bos' }, :genome, :ncbi_datasets) + u = rd.download_uri + h = rd.download_headers + p = rd.download_payload + + assert(u.is_a? URI) + assert_equal('https', u.scheme) + assert_equal('dataset_report', File.basename(u.path)) + + assert(h.is_a? Hash) + assert_equal(1, h.size) + assert_equal('application/json', h['Content-Type']) + + assert(p.is_a? String) + assert_equal('{', p[0]) + assert_equal('}', p[-1]) + end + # This test is too expensive (too much time to run it!) # def test_net_timeout # declare_remote_access diff --git a/test/result_test.rb b/test/result_test.rb index 4cf106e5..e16ec83d 100644 --- a/test/result_test.rb +++ b/test/result_test.rb @@ -6,27 +6,15 @@ class ResultTest < Test::Unit::TestCase def setup initialize_miga_home - FileUtils.touch( - File.join( - project.path, 'data', '02.trimmed_reads', - "#{dataset.name}.1.clipped.fastq" - ) - ) - FileUtils.touch( - File.join( - project.path, 'data', '02.trimmed_reads', "#{dataset.name}.done" - ) - ) - FileUtils.touch( - File.join( - project.path, 'data', '10.clades', '01.find', 'miga-project.empty' - ) - ) - FileUtils.touch( - File.join( - project.path, 'data', '10.clades', '01.find', 'miga-project.done' - ) - ) + to_touch = [ + ['02.trimmed_reads', "#{dataset.name}.1.clipped.fastq"], + ['02.trimmed_reads', "#{dataset.name}.done"], + ['10.clades', '01.find', 'miga-project.empty'], + ['10.clades', '01.find', 'miga-project.done'] + ] + to_touch.each do |path| + FileUtils.touch(File.join(project.path, 'data', *path)) + end end def test_add_result @@ -89,4 +77,42 @@ def test_dates r = dataset.add_result(:trimmed_reads) assert_equal(5.0, r.running_time) end + + def test_status + d = dataset + assert_equal(:ignore_empty, d.result_status(:trimmed_reads)) + d.add_result(:trimmed_reads) + assert_equal(:-, d.result_status(:raw_reads)) + assert_equal(:complete, d.result_status(:trimmed_reads)) + assert_equal(:pending, d.result_status(:read_quality)) + assert_equal(:pending, d.result_status(:assembly)) + + h = d.results_status + assert(h.is_a? Hash) + assert_equal(:-, h[:raw_reads]) + assert_equal(:complete, h[:trimmed_reads]) + assert_equal(:pending, h[:read_quality]) + + # Test the "advance" interface from Project + a = project.profile_datasets_advance + assert(a.is_a? Array) + assert_equal(1, a.size) + assert(a[0].is_a? Array) + assert_equal([0, 1, 2, 2], a[0][0..3]) + end + + def test_versions + r = dataset.add_result(:trimmed_reads) + assert_respond_to(r, :add_versions) + assert_respond_to(r, :versions_md) + assert_equal(MiGA::VERSION.join('.'), r.versions[:MiGA]) + assert_nil(r.versions[:GoodSoftware]) + + r.add_versions('GoodSoftware' => '1.2.3') + assert_equal('1.2.3', r.versions[:GoodSoftware]) + + md = r.versions_md + assert_equal('-', md[0]) + assert_equal(2, md.split("\n").size) + end end diff --git a/test/taxonomy_test.rb b/test/taxonomy_test.rb index 2b96255b..8522c58c 100644 --- a/test/taxonomy_test.rb +++ b/test/taxonomy_test.rb @@ -92,8 +92,12 @@ def test_alternative end def test_reset - tx = MiGA::Taxonomy.new('ns:Letters d:Latin s:A', nil, - ['ns:Words d:English s:A', 'ns:Music d:Tone s:A']) + tx = MiGA::Taxonomy.new( + 'ns:Letters d:Latin s:A', nil, + ['ns:Words d:English s:A', 'ns:Music d:Tone s:A'] + ) + assert_equal('Latin', tx.domain) + # Reset assert_equal(2, tx.alternative.size) assert_equal('Letters', tx.namespace) @@ -102,11 +106,13 @@ def test_reset assert_nil(tx.namespace) tx.reset('ns:Letters d:Latin s:A') assert_equal('Letters', tx.namespace) + # Change of alternative assert_equal('ns:Words d:English s:A', tx.alternative('Words').to_s) tx.add_alternative(MiGA::Taxonomy.new('ns:Words d:Spanish s:A')) assert_equal('ns:Words d:Spanish s:A', tx.alternative('Words').to_s) - # Change of master + + # Change of main assert_equal('ns:Letters d:Latin s:A', tx.to_s) tx.add_alternative(MiGA::Taxonomy.new('ns:Letters d:Unicode s:A')) assert_equal('ns:Letters d:Unicode s:A', tx.to_s) diff --git a/utils/distance/runner.rb b/utils/distance/runner.rb index 292c4ec5..c721b21d 100644 --- a/utils/distance/runner.rb +++ b/utils/distance/runner.rb @@ -54,7 +54,9 @@ def go_ref! # first-come-first-serve traverse sbj = [] ref_project.each_dataset do |ds| - sbj << ds if ds.ref? && !ds.multi? && ds.result(:essential_genes) + torun = ds.ref? && !ds.multi? + torun &&= ds.result(:essential_genes) || (!ds.markers? && ds.result(:cds)) + sbj << ds if torun end ani_after_aai(sbj) diff --git a/utils/distances.rb b/utils/distances.rb index 7931b2fd..cda2e26c 100755 --- a/utils/distances.rb +++ b/utils/distances.rb @@ -4,6 +4,6 @@ project = ARGV.shift dataset = ARGV.shift -opts = Hash[ARGV.map { |i| i.split("=", 2).tap { |j| j[0] = j[0].to_sym } }] +opts = Hash[ARGV.map { |i| i.split('=', 2).tap { |j| j[0] = j[0].to_sym } }] runner = MiGA::DistanceRunner.new(project, dataset, opts) runner.go!