Skip to content

Commit

Permalink
Create sequencing experiments
Browse files Browse the repository at this point in the history
- Improve linking of source metadata
- Centralize long (modal-extensible) lists
- Test for raw data availability of genomes
- Reduce requests to IRNMG (when output is empty)
  • Loading branch information
lmrodriguezr committed Nov 11, 2024
1 parent 3e099c4 commit 79071af
Show file tree
Hide file tree
Showing 25 changed files with 463 additions and 86 deletions.
2 changes: 1 addition & 1 deletion app/controllers/genomes_controller.rb
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def update_accession

# POST /genomes/1/update_external
def update_external
if @genome.queue_for_external_resources
if @genome.queue_for_external_resources(true) # Force: trust curators
flash[:notice] = 'Update has been queued'
sleep(2)
else
Expand Down
19 changes: 19 additions & 0 deletions app/helpers/application_helper.rb
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,25 @@ def list_type_selector
end
end

##
# Render +elements+ as an unordered list (<ul class="mb-1">). If the list has
# more than +hide_over+ entries, only the first one is displayed, followed by
# an "And N more…" entry that triggers a modal (titled +title+) containing the
# complete list.
#
# - title: Title passed to +modal+ for the complete list
# - hide_over: Maximum number of entries displayed inline, or +nil+ to always
#   display all entries
# - elements: Indexable collection of entries (already renderable content).
#   If a block is passed, its return value is used instead
#
# Returns the markup of the list. A missing collection (+nil+) is rendered
# as an empty list
def longer_list(title = '', hide_over = 3, elements = nil, &blk)
  # The block is evaluated inside a throw-away +content_tag+; only its return
  # value is kept
  content_tag(:div) { elements = blk.call } if block_given?
  elements ||= []

  content_tag(:ul, class: 'mb-1') do
    if hide_over && elements.count > hide_over
      # The complete list is rendered recursively without a limit
      id = modal(title) { longer_list('', nil, elements) }
      first_item = content_tag(:li) { elements[0] }
      more_item = content_tag(:li) do
        modal_button(id, as_anchor: true) do
          "And #{elements.count - 1} more…".html_safe
        end
      end
      first_item + more_item
    else
      safe_join(elements.map { |content| content_tag(:li) { content } })
    end
  end
end

def current_contributor?
current_user.try :contributor?
end
Expand Down
45 changes: 45 additions & 0 deletions app/models/genome.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,12 @@ class Genome < ApplicationRecord
:updated_by, optional: true,
class_name: 'User', foreign_key: 'updated_by_id'
)
has_many(:genome_sequencing_experiments, dependent: :destroy)
has_many(:sequencing_experiments, through: :genome_sequencing_experiments)

before_validation(:standardize_source)
after_save(:monitor_source_changes)
after_save(:link_sequencing_experiments!)

validates(:database, presence: true)
validates(:accession, presence: true)
Expand Down Expand Up @@ -255,6 +258,30 @@ def source_attributes
@source_attributes
end

##
# BioSample accessions associated with the source of the genome, as Array.
#
# - For SRA sources: the keys of the +:samples+ entry in +source_hash+
# - For BioSample sources: the source accessions themselves
# - For any other (or missing) source database: an empty Array
#
# Always returns an Array (never +nil+), so the result can be safely used
# with +include?+ or in database queries
def biosample_accessions
  case source_database.try(:to_sym)
  when :sra
    # NOTE(review): assumes the sample keys are comparable to the accessions
    # stored in the sequencing experiments (String) — confirm
    source_hash.try(:dig, :samples).try(:keys) || []
  when :biosample
    source_accessions || []
  else
    []
  end
end

##
# SRA accessions associated with the source of the genome, as Array without
# duplicates.
#
# - For SRA sources: the source accessions themselves
# - For BioSample sources: the SRA accessions of all the registered
#   sequencing experiments with any of the source BioSample accessions
# - For any other (or missing) source database: an empty Array
def sra_accessions
  case source_database.try(:to_sym)
  when :sra
    (source_accessions || []).uniq
  when :biosample
    SequencingExperiment
      .by_biosample(source_accessions)
      .pluck(:sra_accession)
      .uniq
  else
    []
  end
end

#def sequencing_experiments
# @sequencing_experiments ||=
# SequencingExperiment.by_biosample(biosample_accessions)
#end

def link(acc = nil)
acc ||= accession
case database
Expand Down Expand Up @@ -369,6 +396,24 @@ def recalculate_miga!
update(auto_scheduled_at: nil, auto_failed: nil, auto_check: false)
end

##
# Synchronize the sequencing experiments linked to this genome with its
# current BioSample accessions (see +biosample_accessions+): experiments with
# a BioSample accession that is not in the list are unlinked, and registered
# experiments with a listed BioSample accession are linked. Both steps are
# executed within a single database transaction
def link_sequencing_experiments!
  # Evaluated only once, and never +nil+: a +nil+ value would fail on
  # +include?+ and would match experiments without BioSample in the query
  accessions = biosample_accessions || []

  self.class.transaction do
    # Unlink experiments that shouldn't be here
    sequencing_experiments.each do |experiment|
      next if accessions.include?(experiment.biosample_accession)

      GenomeSequencingExperiment
        .where(genome: self, sequencing_experiment: experiment)
        .each(&:destroy!)
    end

    # Link experiments that should be here
    self.sequencing_experiments +=
      SequencingExperiment
        .where(biosample_accession: accessions)
        .where.not(id: sequencing_experiments.pluck(:id))
  end
end

private

def standardize_source
Expand Down
71 changes: 28 additions & 43 deletions app/models/genome/external_resources.rb
Original file line number Diff line number Diff line change
Expand Up @@ -22,62 +22,47 @@ def reload_source_json!
case source_database.to_sym
when :sra
source_accessions.each do |acc|
external_sra_to_biosamples(acc).each do |biosample|
data[biosample] ||=
{ from_sra: [] }.merge(external_biosample_hash(biosample))
data[biosample][:from_sra] << acc
end
biosample = external_sra_to_biosample(acc)
data[biosample] ||=
{ from_sra: [] }.merge(external_biosample_hash(biosample))
data[biosample][:from_sra] << acc
end
when :biosample
source_accessions.each do |acc|
data[acc] = external_biosample_hash(acc)
external_biosample_to_sra(acc)
end
end

update_column(:queued_external, nil)
update_column(
:source_json, { retrieved_at: DateTime.now, samples: data }.to_json
)
self.queued_external = nil
self.source_json = { retrieved_at: DateTime.now, samples: data }.to_json
save
end

##
# Find all BioSample accessions linked to the SRA entry +acc+ and return as
# Array (typically one value)
def external_sra_to_biosamples(acc)
uri = "https://www.ebi.ac.uk/ena/browser/api/xml/#{acc}?includeLinks=false"
# Find BioSample accession linked to the SRA entry +acc+ and return as
# String (or +nil+)
def external_sra_to_biosample(acc)
  # The lookup is delegated to the sequencing experiment registered with this
  # SRA accession (see +SequencingExperiment.by_sra+)
  experiment = SequencingExperiment.by_sra(acc)
  experiment.try(:biosample_accession)
end

##
# Find SRA entries linked to the BioSample +acc+ and return as Array
def external_biosample_to_sra(acc)
uri = "https://www.ebi.ac.uk/ena/browser/api/xml/ebisearch?" +
"query=BIOSAMPLE:#{acc}&includeLinks=true&domain=sra-experiment"
body = external_request(uri)
return [] unless body && body != '{}'
return unless body.present?

ng = Nokogiri::XML(body)
if ng.xpath('//RUN_SET').present?
ng.xpath(
'//RUN_SET/RUN/RUN_LINKS/RUN_LINK/' \
'XREF_LINK[DB[text() = "ENA-SAMPLE"]]/ID'
).map(&:text)
elsif ng.xpath('//EXPERIMENT_SET').present?
# Unfortunately, we should prefer external IDs over primary IDs because
# NCBI E-Utils has a strange tendency to return the wrong biosample when
# using SRS... accessions. For example, see:
#
# - https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=biosample
# &id=SRS22988103&rettype=xml&retmode=text
# - https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=biosample
# &id=SAMN13193749&rettype=xml&retmode=text
#
# The first is using the accession SRS22988103 but it (wrongly) retrieves
# data for SAMN22988103 (= SRS11001113). Apparently the backend code
# simply strips off the alphabetic prefix and uses the numeric part
# without checking
sample_id =
ng.xpath(
'//EXPERIMENT_SET/EXPERIMENT/DESIGN/SAMPLE_DESCRIPTOR/IDENTIFIERS'
)
biosample_id =
sample_id.xpath('EXTERNAL_ID[@namespace="BioSample"]').map(&:text)
biosample_id.present? ? biosample_id :
sample_id.xpath('PRIMARY_ID').map(&:text)
else
[] # Unknown XML specification
ng.xpath('//EXPERIMENT_SET/EXPERIMENT').map do |exp|
sra_acc = exp['accession'] || exp.xpath('IDENTIFIERS/PRIMARY_ID').text
SequencingExperiment.find_or_create_by(sra_accession: sra_acc) do |se|
se.external_reuse_metadata_xml = true
se.queued_external = nil
se.retrieved_at = DateTime.now
se.metadata_xml = "<EXPERIMENT_SET>\n#{exp.to_s}\n</EXPERIMENT_SET>"
end
end
end

Expand Down
4 changes: 4 additions & 0 deletions app/models/genome_sequencing_experiment.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
##
# Join model linking one genome with one sequencing experiment
class GenomeSequencingExperiment < ApplicationRecord
  belongs_to(:genome)
  belongs_to(:sequencing_experiment)
end
22 changes: 16 additions & 6 deletions app/models/has_external_resources.rb
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,27 @@ def queued_for_external_resources
end

##
# Queue name for +NameExternalResourcesJob+
def queue_for_external_resources
# Queue name for +NameExternalResourcesJob+, possibly forcing queue even if
# it was recently submitted (+force+)
def queue_for_external_resources(force = false)
return if Rails.configuration.bypass_external_apis
return if queued_for_external_resources
return if !force && queued_for_external_resources

external_resources_job.perform_later(self)
update_column(:queued_external, DateTime.now)
end

##
# Forget about interrupted updates
def force_reset_external_resources
  # Clear the queueing timestamp directly in the database
  update_columns(queued_external: nil)
end

##
# Generate a request to the external +uri+, and return the response body
# if successful or +nil+ otherwise (fails silently)
def external_request(uri)
# if successful or +nil+ otherwise (fails silently). If the return code
# is 204 (empty contents), returns +empty+
def external_request(uri, empty = nil)
return if Rails.configuration.bypass_external_apis

require 'uri'
Expand All @@ -30,7 +38,9 @@ def external_request(uri)
Rails.logger.error "External Request #{uri} returned #{res}"
return nil
end
res.body ? normalize_encoding(res.body) : nil

res.is_a?(Net::HTTPNoContent) ? empty :
res.body ? normalize_encoding(res.body) : nil
rescue
nil
end
Expand Down
2 changes: 1 addition & 1 deletion app/models/name/external_resources.rb
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def external_search(service)
end

uri = send("#{service}_search_uri")
body = external_request(uri)
body = external_request(uri, '{}')

if body.present?
send("#{service}_json=", body)
Expand Down
27 changes: 27 additions & 0 deletions app/models/name/quality_checks.rb
Original file line number Diff line number Diff line change
Expand Up @@ -633,6 +633,11 @@ class QcWarning
message: 'The longest contig in the type genome should have at least ' \
'100 kbp',
recommendations: %w[appendix-i]
}.merge(@@link_to_edit_genome),
missing_source_data: {
message: 'The raw data should be available in INSDC databases ' \
'(e.g., Sequence Read Archive)',
rules: %w[appendix-i]
}.merge(@@link_to_edit_genome)
}

Expand Down Expand Up @@ -910,6 +915,28 @@ def qc_warnings
@qc_warnings.add(:short_largest_contig)
end

if type_genome.sequencing_experiments.empty?
# Before this date, source metadata is not linked to SRA and it should
# be re-retrieved
link_date = [DateTime.parse('2024-11-12'), 2.hours.ago].min
if type_genome.source_hash.present? &&
type_genome.source_hash[:retrieved_at] > link_date
if proposed_in.present? && proposed_in.journal_date.year < 2023
# Only a warning for publications before 1st January 2023
@qc_warnings.add(
:missing_source_data,
rules: [],
recommendations: %w[appendix-i]
)
else
@qc_warnings.add(:missing_source_data)
end
else
# Update the metadata and try again later if it has a source
type_genome.queue_for_external_resources if type_genome.source?
end
end

# Measure discrepancy with automated checks
Genome.fields_with_auto.each do |field|
next if field == :quality
Expand Down
62 changes: 62 additions & 0 deletions app/models/sequencing_experiment.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
##
# A sequencing experiment identified by a unique SRA accession
# (+sra_accession+), described by the XML document stored in +metadata_xml+,
# and linked to genomes through +GenomeSequencingExperiment+
class SequencingExperiment < ApplicationRecord
  has_many(:genome_sequencing_experiments, dependent: :destroy)
  has_many(:genomes, through: :genome_sequencing_experiments)

  validates(:sra_accession, presence: true, uniqueness: true)
  before_validation(:load_from_sra_accession)

  include HasExternalResources
  include SequencingExperiment::ExternalResources

  class << self
    ##
    # All the experiments with BioSample accession +acc+
    def by_biosample(acc)
      SequencingExperiment.where(biosample_accession: acc)
    end

    ##
    # Find the experiment with SRA accession +acc+, or create it if it
    # doesn't exist yet
    def by_sra(acc)
      SequencingExperiment.find_or_create_by(sra_accession: acc)
    end
  end

  ##
  # URL of the experiment in the NCBI SRA website, or +nil+ if the accession
  # is not set
  def link
    return unless sra_accession.present?

    "https://www.ncbi.nlm.nih.gov/sra/#{sra_accession}[accn]"
  end

  ##
  # Parsed (and memoized) XML document of the metadata, or +nil+ if no
  # metadata is available
  def metadata_dom
    return unless metadata_xml

    @metadata_dom ||= Nokogiri::XML(metadata_xml)
  end

  ##
  # Nodes of the metadata document matching the XPath expression +path+, or
  # +nil+ if no metadata is available
  def metadata_xpath(path)
    metadata_dom&.xpath(path)
  end

  ##
  # Title of the experiment as registered in the metadata, or the SRA
  # accession if the metadata is unavailable or the title is empty
  def title
    metadata_xpath('//EXPERIMENT_SET/EXPERIMENT/TITLE')&.text.presence ||
      sra_accession
  end

  ##
  # BioProject accession of the study referenced by the experiment, or an
  # empty String if it cannot be determined. The value is memoized
  def bioproject_accession
    @bioproject_accession ||=
      begin
        study = metadata_xpath('//EXPERIMENT_SET/EXPERIMENT/STUDY_REF')&.first
        external_id = study&.xpath(
          'IDENTIFIERS/EXTERNAL_ID[@namespace="BioProject"]'
        )&.first
        external_id&.text || ''
      end
  end

  ##
  # URL of the BioProject in the NCBI website, or +nil+ if the BioProject
  # accession is unknown
  def bioproject_link
    return unless bioproject_accession.present?

    "https://www.ncbi.nlm.nih.gov/bioproject/#{bioproject_accession}"
  end

  private

  ##
  # Callback executed before validation.
  # NOTE(review): +reload_metadata!+ is not defined in this class, presumably
  # provided by +SequencingExperiment::ExternalResources+ — confirm
  def load_from_sra_accession
    reload_metadata!
  end
end
Loading

0 comments on commit 79071af

Please sign in to comment.