Skip to content

Commit

Permalink
Remote code overhaul
Browse files Browse the repository at this point in the history
- Remote access code greatly improved
- Moved some remote code to `miga/common/net`
- Started move towards NCBI Datasets
- `MiGA::Json` can now `generate_plain`
- Test coverage increased
  • Loading branch information
lmrodriguezr committed Jan 22, 2024
1 parent f1daf2f commit 8228f92
Show file tree
Hide file tree
Showing 20 changed files with 651 additions and 248 deletions.
2 changes: 1 addition & 1 deletion lib/miga/cli/action/download/gtdb.rb
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def sanitize_cli

def remote_list
cli.say 'Downloading genome list'
extra = ['sp_reps_only=' + cli[:reference].to_s]
extra = { sp_reps_only: cli[:reference].to_s }
json = MiGA::RemoteDataset.download(
:gtdb, :taxon, cli[:taxon], :genomes, nil, extra
)
Expand Down
111 changes: 43 additions & 68 deletions lib/miga/cli/action/download/ncbi.rb
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,8 @@ def cli_name_modifiers(opt)
'Do not add sequence version to the dataset name',
'Only affects --complete and --chromosome'
) { |v| cli[:add_version] = v }
cli.opt_flag(
opt, 'legacy-name',
'Use dataset names based on chromosome entries instead of assembly',
:legacy_name
)
# For backwards compatibility
cli.opt_flag(opt, 'legacy-name', '::HIDE::', :legacy_name)
end

def sanitize_cli
Expand All @@ -52,89 +49,67 @@ def sanitize_cli
end

def remote_list
doc =
if cli[:ncbi_table_file]
cli.say 'Reading genome list from file'
File.open(cli[:ncbi_table_file], 'r')
else
cli.say 'Downloading genome list'
url = remote_list_url
MiGA::RemoteDataset.download_url(url)
end
ds = parse_csv_as_datasets(doc)
doc.close if cli[:ncbi_table_file]
ds
list = {}
query = remote_list_query
loop do
# Query the remote collection
page = MiGA::Json.parse(
MiGA::RemoteDataset.download(:ncbi_datasets, :genome, query, :json),
contents: true
)
break unless page&.any? && page[:reports]&.any?

# Process reports in this page
list.merge!(parse_reports_as_datasets(page[:reports]))

# Next page
break unless page[:next_page_token]
query[:page_token] = page[:next_page_token]
end
list
end

def parse_csv_as_datasets(doc)
def parse_reports_as_datasets(reports)
ds = {}
CSV.parse(doc, headers: true).each do |r|
asm = r['assembly']
reports.each do |r|
asm = r[:accession]
next if asm.nil? || asm.empty? || asm == '-'

rep = remote_row_replicons(r)
n = remote_row_name(r, rep, asm)

# Register for download
n = remote_report_name(r, asm)
ds[n] = {
ids: [asm], db: :assembly, universe: :ncbi,
md: {
type: :genome, ncbi_asm: asm, strain: r['strain']
type: :genome, ncbi_asm: asm, strain: r.dig(:organism, :infraspecific_names, :strain)
}
}
ds[n][:md][:ncbi_nuccore] = rep.join(',') unless rep.nil?
unless r['release_date'].nil?
ds[n][:md][:release_date] = Time.parse(r['release_date']).to_s
end
date = r.dig(:assembly_info, :release_date)
ds[n][:md][:release_date] = Time.parse(date).to_s if date
ds[n][:md][:ncbi_dataset] = r
end
ds
end

# Extract the replicon accessions from the 'replicons' field of the row +r+.
#
# The field is split on '; '. From every resulting entry, any text up to and
# including a colon is removed (greedy match), and then everything from the
# first slash onwards is removed.
#
# Returns an Array of Strings, or nil when +r+ has no 'replicons' value.
def remote_row_replicons(r)
  field = r['replicons']
  return nil if field.nil?

  field.split('; ').map do |entry|
    unlabeled = entry.gsub(/.*:/, '')
    unlabeled.gsub(%r{/.*}, '')
  end
end

def remote_row_name(r, rep, asm)
return r['#organism'].miga_name if cli[:legacy_name] && cli[:reference]

if cli[:legacy_name] && ['Complete', ' Chromosome'].include?(r['level'])
acc = rep.nil? ? '' : rep.first
else
acc = asm
end
def remote_report_name(r, asm)
acc = "#{asm}"
acc.gsub!(/\.\d+\Z/, '') unless cli[:add_version]
"#{r['#organism']}_#{acc}".miga_name
org = r.dig(:organism, :organism_name)
acc = "#{org}_#{acc}" if org
acc.miga_name
end

def remote_list_url
url_base = 'https://www.ncbi.nlm.nih.gov/genomes/solr2txt.cgi?'
url_param = {
q: '[display()].' \
'from(GenomeAssemblies).' \
'usingschema(/schema/GenomeAssemblies).' \
'matching(tab==["Prokaryotes"] and q=="' \
"#{cli[:taxon]&.tr('"', "'")}\"",
fields: 'organism|organism,assembly|assembly,replicons|replicons,' \
'level|level,release_date|release_date,strain|strain',
nolimit: 'on'
}
def remote_list_query
q = { taxons: [cli[:taxon]], filters: {} }
if cli[:reference]
url_param[:q] += ' and refseq_category==["representative"]'
q[:filters][:reference_only] = true
else
status = {
complete: 'Complete',
chromosome: ' Chromosome', # <- The leading space is *VERY* important!
scaffold: 'Scaffold',
contig: 'Contig'
}.map { |k, v| '"' + v + '"' if cli[k] }.compact.join(',')
url_param[:q] += ' and level==[' + status + ']'
q[:assembly_level] = {
contig: 'contig',
scaffold: 'scaffold',
chromosome: 'chromosome',
complete: 'complete_genome'
}.map { |k, v| '"' + v + '"' if cli[k] }.compact
end
url_param[:q] += ')'
url_base + URI.encode_www_form(url_param)
q
end
end
3 changes: 1 addition & 2 deletions lib/miga/cli/action/download/seqcode.rb
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,7 @@ def remote_list

while current_page <= total_pages
json = MiGA::RemoteDataset.download(
:seqcode, :'type-genomes', nil, :json, nil,
["page=#{current_page}"]
:seqcode, :'type-genomes', nil, :json, nil, page: current_page
)
doc = MiGA::Json.parse(json, contents: true)
current_page = doc[:current_page] + 1
Expand Down
9 changes: 1 addition & 8 deletions lib/miga/cli/action/ncbi_get.rb
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,7 @@ class MiGA::Cli::Action::NcbiGet < MiGA::Cli::Action

def parse_cli
cli.defaults = {
query: false, unlink: false,
reference: false, legacy_name: false,
query: false, unlink: false, reference: false,
complete: false, chromosome: false,
scaffold: false, contig: false, add_version: true, dry: false,
get_md: false, only_md: false, save_every: 1
Expand All @@ -29,12 +28,6 @@ def parse_cli
'--api-key STRING',
'::HIDE::' # For backwards compatibility
) { |v| ENV['NCBI_API_KEY'] = v }
opt.on(
'--ncbi-table-file STRING',
'::HIDE::' # Only meant for debugging
# It can take the table returned by NCBI and parse it from a file
# instead of downloading it directly
) { |v| cli[:ncbi_table_file] = v }
opt.on(
'--ncbi-api-key STRING',
'NCBI API key'
Expand Down
21 changes: 15 additions & 6 deletions lib/miga/cli/action/wf.rb
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ def default_opts_for_wf
cli.expect_files = true
cli.defaults = {
clean: false, project_type: :genomes, dataset_type: :popgenome,
ncbi_draft: true, min_qual: MiGA::Project.OPTIONS[:min_qual][:default],
ncbi_draft: true, ncbi_ref: false,
min_qual: MiGA::Project.OPTIONS[:min_qual][:default],
prepare_and_exit: false
}
end
Expand Down Expand Up @@ -39,14 +40,21 @@ def opts_for_wf(opt, files_desc, params = {})
'-T', '--ncbi-taxon STRING',
'Download all the genomes in NCBI classified as this taxon'
) { |v| cli[:ncbi_taxon] = v }
opt.on(
'--no-draft', '::HIDE::' # Deprecated
) { |v| cli[:ncbi_draft] = v }
opt.on(
'--ncbi-complete',
'Only download complete genomes, not drafts (requires -T)'
) { |v| cli[:ncbi_draft] = !v }
opt.on(
'--ncbi-ref',
'Only download RefSeq reference genomes (requires -T)'
) { |v| cli[:ncbi_ref] = v }
opt.on(
'-G', '--gtdb-taxon STRING',
'Download all the genomes in GTDB classified as this taxon'
) { |v| cli[:gtdb_taxon] = v }
opt.on(
'--no-draft',
'Only download complete genomes, not drafts (requires -T)'
) { |v| cli[:ncbi_draft] = v }
opt.on(
'--gtdb-ref',
'Only download reference anchor genomes in GTDB (requires -G)'
Expand Down Expand Up @@ -170,7 +178,8 @@ def initialize_empty_project(metadata)
def download_datasets
# Download datasets from NCBI
unless cli[:ncbi_taxon].nil?
what = cli[:ncbi_draft] ? '--all' : '--complete'
what = cli[:ncbi_ref] ? '--reference' :
cli[:ncbi_draft] ? '--all' : '--complete'
cmd = ['ncbi_get', '-P', cli[:outdir], '-T', cli[:ncbi_taxon], what]
cmd += ['--max', cli[:max_download]] if cli[:max_download]
call_cli(cmd)
Expand Down
2 changes: 1 addition & 1 deletion lib/miga/cli/opt_helper.rb
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def opt_common(opt)
'-h', '--help',
'Display this screen'
) do
puts opt.to_s.gsub(/^.*\s+::HIDE::\s*$/, '')
puts opt.to_a.select { |i| i !~ /\s::HIDE::\s/ }
exit
end
opt.separator ''
Expand Down
Loading

0 comments on commit 8228f92

Please sign in to comment.