Skip to content

Commit

Permalink
Merge pull request #172 from bio-miga/plasmids
Browse files Browse the repository at this point in the history
Plasmids and Remote Overhaul
  • Loading branch information
lmrodriguezr authored Jan 22, 2024
2 parents 3240538 + 8228f92 commit a9c90f5
Show file tree
Hide file tree
Showing 45 changed files with 1,211 additions and 334 deletions.
23 changes: 22 additions & 1 deletion lib/miga/cli/action/add_result.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,17 @@

class MiGA::Cli::Action::AddResult < MiGA::Cli::Action
# Declare the command-line interface of the add_result action.
#
# Defaults: +force+ and +stdin_versions+ are both off. Besides the object
# selectors (project, optional dataset, result), two flags are registered:
# - -f / --force: stored in cli[:force]
# - --stdin-versions: stored in cli[:stdin_versions]
def parse_cli
  # Single defaults assignment: the earlier `{ force: false }` assignment was
  # fully superseded by this one
  cli.defaults = { force: false, stdin_versions: false }
  cli.parse do |opt|
    cli.opt_object(opt, [:project, :dataset_opt, :result])
    opt.on(
      '-f', '--force',
      'Force re-indexing of the result even if it\'s already registered'
    ) { |v| cli[:force] = v }
    opt.on(
      '--stdin-versions',
      'Read Software versions from STDIN'
    ) { |v| cli[:stdin_versions] = v }
  end
end

Expand All @@ -21,5 +25,22 @@ def perform
cli.say "Registering result: #{cli[:result]}"
r = obj.add_result(cli[:result], true, force: cli[:force])
raise 'Cannot add result, incomplete expected files' if r.nil?

# Add Software version data
#
# STDIN is consumed line by line: a line of the form "=> NAME" opens the
# entry for NAME, and each following line (whitespace-trimmed) is appended
# to that entry until the next "=> " header
if cli[:stdin_versions]
  versions = {}
  sw = nil
  $stdin.each_line do |ln|
    ln = ln.strip
    if (header = ln.match(/^=> (.*)/))
      sw = header[1]
      versions[sw] = ''
    elsif sw
      versions[sw] += ln
    end
    # Lines seen before the first header (sw is still nil) have no entry to
    # be attached to and are ignored; appending them used to fail on nil
  end
  r.add_versions(versions)
  r.save
end
end
end
2 changes: 1 addition & 1 deletion lib/miga/cli/action/download/gtdb.rb
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def sanitize_cli

def remote_list
cli.say 'Downloading genome list'
extra = ['sp_reps_only=' + cli[:reference].to_s]
extra = { sp_reps_only: cli[:reference].to_s }
json = MiGA::RemoteDataset.download(
:gtdb, :taxon, cli[:taxon], :genomes, nil, extra
)
Expand Down
111 changes: 43 additions & 68 deletions lib/miga/cli/action/download/ncbi.rb
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,8 @@ def cli_name_modifiers(opt)
'Do not add sequence version to the dataset name',
'Only affects --complete and --chromosome'
) { |v| cli[:add_version] = v }
cli.opt_flag(
opt, 'legacy-name',
'Use dataset names based on chromosome entries instead of assembly',
:legacy_name
)
# For backwards compatibility
cli.opt_flag(opt, 'legacy-name', '::HIDE::', :legacy_name)
end

def sanitize_cli
Expand All @@ -52,89 +49,67 @@ def sanitize_cli
end

# Build the list of remote datasets by paging through the :ncbi_datasets
# genome reports matching +remote_list_query+.
#
# Each page is downloaded as JSON, parsed, and its +:reports+ entries are
# converted with +parse_reports_as_datasets+ and merged into a single Hash.
# Paging stops on an empty page, a page without reports, or a page without
# +:next_page_token+.
#
# Returns the merged Hash of datasets (empty if nothing was found).
def remote_list
  list = {}
  query = remote_list_query
  loop do
    # Query the remote collection
    page = MiGA::Json.parse(
      MiGA::RemoteDataset.download(:ncbi_datasets, :genome, query, :json),
      contents: true
    )
    break unless page&.any? && page[:reports]&.any?

    # Process reports in this page
    list.merge!(parse_reports_as_datasets(page[:reports]))

    # Next page: the same query is re-sent with the token of the next page
    break unless page[:next_page_token]

    query[:page_token] = page[:next_page_token]
  end
  list
end

# Convert an Array of genome reports (Hashes with Symbol keys) into a Hash
# of datasets to download, keyed by the name from +remote_report_name+.
#
# Reports whose +:accession+ is nil, empty, or '-' are skipped. For every
# other report the entry holds the assembly accession as the only ID
# (db: :assembly, universe: :ncbi) and metadata with:
# - type: always :genome
# - ncbi_asm: the assembly accession
# - strain: organism -> infraspecific_names -> strain (nil if absent)
# - release_date: assembly_info -> release_date re-formatted through
#   Time.parse, only when present
# - ncbi_dataset: the complete original report
def parse_reports_as_datasets(reports)
  ds = {}
  reports.each do |r|
    asm = r[:accession]
    next if asm.nil? || asm.empty? || asm == '-'

    # Register for download
    n = remote_report_name(r, asm)
    md = {
      type: :genome, ncbi_asm: asm,
      strain: r.dig(:organism, :infraspecific_names, :strain)
    }
    date = r.dig(:assembly_info, :release_date)
    md[:release_date] = Time.parse(date).to_s if date
    md[:ncbi_dataset] = r
    ds[n] = { ids: [asm], db: :assembly, universe: :ncbi, md: md }
  end
  ds
end

# Extract the replicon accessions from the 'replicons' field of a table
# row +r+. The field is split on '; ', and from each item everything up to
# the last ':' and everything from the first '/' onwards is removed.
#
# Returns an Array of Strings, or nil if the row has no 'replicons' value.
def remote_row_replicons(r)
  field = r['replicons']
  return if field.nil?

  field.split('; ').map do |item|
    item.gsub(/.*:/, '').gsub(%r{/.*}, '')
  end
end

# Build the dataset name for the genome report +r+ with assembly
# accession +asm+.
#
# The trailing version suffix (".<digits>") is dropped from the accession
# unless cli[:add_version] is set. When the report carries
# organism -> organism_name, it is prepended as "<organism>_<accession>".
# The result is passed through String#miga_name.
def remote_report_name(r, asm)
  # Non-mutating: +asm+ itself is never modified
  acc = asm.to_s
  acc = acc.sub(/\.\d+\Z/, '') unless cli[:add_version]
  org = r.dig(:organism, :organism_name)
  acc = "#{org}_#{acc}" if org
  acc.miga_name
end

# Build the query Hash used by +remote_list+ to list remote genomes.
#
# The query always carries the requested taxon (cli[:taxon]) and an empty
# +:filters+ Hash. With cli[:reference], the +reference_only+ filter is
# set; otherwise +:assembly_level+ lists the level of every enabled CLI
# flag among :contig, :scaffold, :chromosome, and :complete.
def remote_list_query
  q = { taxons: [cli[:taxon]], filters: {} }
  if cli[:reference]
    q[:filters][:reference_only] = true
  else
    levels = {
      contig: 'contig',
      scaffold: 'scaffold',
      chromosome: 'chromosome',
      complete: 'complete_genome'
    }
    # NOTE(review): each level is wrapped in literal double quotes and the
    # list is stored at the top level of the query rather than under
    # :filters -- confirm both against what the remote API expects
    q[:assembly_level] = levels.map { |k, v| "\"#{v}\"" if cli[k] }.compact
  end
  q
end
end
3 changes: 1 addition & 2 deletions lib/miga/cli/action/download/seqcode.rb
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,7 @@ def remote_list

while current_page <= total_pages
json = MiGA::RemoteDataset.download(
:seqcode, :'type-genomes', nil, :json, nil,
["page=#{current_page}"]
:seqcode, :'type-genomes', nil, :json, nil, page: current_page
)
doc = MiGA::Json.parse(json, contents: true)
current_page = doc[:current_page] + 1
Expand Down
9 changes: 1 addition & 8 deletions lib/miga/cli/action/ncbi_get.rb
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,7 @@ class MiGA::Cli::Action::NcbiGet < MiGA::Cli::Action

def parse_cli
cli.defaults = {
query: false, unlink: false,
reference: false, legacy_name: false,
query: false, unlink: false, reference: false,
complete: false, chromosome: false,
scaffold: false, contig: false, add_version: true, dry: false,
get_md: false, only_md: false, save_every: 1
Expand All @@ -29,12 +28,6 @@ def parse_cli
'--api-key STRING',
'::HIDE::' # For backwards compatibility
) { |v| ENV['NCBI_API_KEY'] = v }
opt.on(
'--ncbi-table-file STRING',
'::HIDE::' # Only meant for debugging
# It can take the table returned by NCBI and parse it from a file
# instead of downloading it directly
) { |v| cli[:ncbi_table_file] = v }
opt.on(
'--ncbi-api-key STRING',
'NCBI API key'
Expand Down
21 changes: 15 additions & 6 deletions lib/miga/cli/action/wf.rb
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ def default_opts_for_wf
cli.expect_files = true
cli.defaults = {
clean: false, project_type: :genomes, dataset_type: :popgenome,
ncbi_draft: true, min_qual: MiGA::Project.OPTIONS[:min_qual][:default],
ncbi_draft: true, ncbi_ref: false,
min_qual: MiGA::Project.OPTIONS[:min_qual][:default],
prepare_and_exit: false
}
end
Expand Down Expand Up @@ -39,14 +40,21 @@ def opts_for_wf(opt, files_desc, params = {})
'-T', '--ncbi-taxon STRING',
'Download all the genomes in NCBI classified as this taxon'
) { |v| cli[:ncbi_taxon] = v }
opt.on(
'--no-draft', '::HIDE::' # Deprecated
) { |v| cli[:ncbi_draft] = v }
opt.on(
'--ncbi-complete',
'Only download complete genomes, not drafts (requires -T)'
) { |v| cli[:ncbi_draft] = !v }
opt.on(
'--ncbi-ref',
'Only download RefSeq reference genomes (requires -T)'
) { |v| cli[:ncbi_ref] = v }
opt.on(
'-G', '--gtdb-taxon STRING',
'Download all the genomes in GTDB classified as this taxon'
) { |v| cli[:gtdb_taxon] = v }
opt.on(
'--no-draft',
'Only download complete genomes, not drafts (requires -T)'
) { |v| cli[:ncbi_draft] = v }
opt.on(
'--gtdb-ref',
'Only download reference anchor genomes in GTDB (requires -G)'
Expand Down Expand Up @@ -170,7 +178,8 @@ def initialize_empty_project(metadata)
def download_datasets
# Download datasets from NCBI
unless cli[:ncbi_taxon].nil?
what = cli[:ncbi_draft] ? '--all' : '--complete'
what = cli[:ncbi_ref] ? '--reference' :
cli[:ncbi_draft] ? '--all' : '--complete'
cmd = ['ncbi_get', '-P', cli[:outdir], '-T', cli[:ncbi_taxon], what]
cmd += ['--max', cli[:max_download]] if cli[:max_download]
call_cli(cmd)
Expand Down
3 changes: 3 additions & 0 deletions lib/miga/cli/objects_helper.rb
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,9 @@ def load_and_filter_datasets(silent = false)
o &&= (d.ref? == self[:ref]) unless self[:ref].nil?
o &&= (d.active? == self[:active]) unless self[:active].nil?
o &&= (self[:multi] ? d.multi? : d.nonmulti?) unless self[:multi].nil?
unless self[:markers].nil?
o &&= (self[:markers] ? d.markers? : !d.markers?)
end
unless self[:taxonomy].nil?
o &&= !d.metadata[:tax].nil? && d.metadata[:tax].in?(self[:taxonomy])
end
Expand Down
10 changes: 8 additions & 2 deletions lib/miga/cli/opt_helper.rb
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def opt_common(opt)
'-h', '--help',
'Display this screen'
) do
puts opt.to_s.gsub(/^.*\s+::HIDE::\s*$/, '')
puts opt.to_a.select { |i| i !~ /\s::HIDE::\s/ }
exit
end
opt.separator ''
Expand Down Expand Up @@ -120,10 +120,11 @@ def opt_object(opt, what = %i[project dataset])
# as determined by +what+ an Array with any combination of:
# - :ref To filter by reference (--ref) or query (--no-ref)
# - :multi To filter by multiple (--multi) or single (--no-multi) species
# - :markers To filter by with (--markers) or without markers (--no-markers)
# - :active To filter by active (--active) or inactive (--no-active)
# - :taxonomy To filter by taxonomy (--taxonomy)
# The "k-th" filter (--dataset-k) is always included
def opt_filter_datasets(opt, what = %i[ref multi active taxonomy])
def opt_filter_datasets(opt, what = %i[ref multi markers active taxonomy])
what.each do |w|
case w
when :ref
Expand All @@ -136,6 +137,11 @@ def opt_filter_datasets(opt, what = %i[ref multi active taxonomy])
'--[no-]multi',
'Use only multi-species (or only single-species) datasets'
) { |v| self[:multi] = v }
when :markers
opt.on(
'--[no-]markers',
'Use only datasets with (or without) markers'
) { |v| self[:markers] = v }
when :active
opt.on(
'--[no-]active',
Expand Down
Loading

0 comments on commit a9c90f5

Please sign in to comment.