From 10beb0cdcbccd3216999665666a720eb6236a4ca Mon Sep 17 00:00:00 2001 From: colin combe Date: Thu, 8 Aug 2024 11:21:33 +0100 Subject: [PATCH 1/2] Add get_peptides2 as a different way of getting peptides; drop rank = 1 filter from get_matches --- app/routes/xiview.py | 66 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 64 insertions(+), 2 deletions(-) diff --git a/app/routes/xiview.py b/app/routes/xiview.py index 79d5a3f..e01ce48 100644 --- a/app/routes/xiview.py +++ b/app/routes/xiview.py @@ -126,7 +126,7 @@ async def get_data_object(ids, pxid): data["matches"] = await get_matches(cur, ids) # data["peptides"] = await get_peptides(cur, data["matches"], ids) # data["proteins"] = await get_proteins(cur, data["peptides"]) - data["peptides"] = await get_all_peptides(cur, ids) + data["peptides"] = await get_peptides2(cur, ids) data["proteins"] = await get_all_proteins(cur, ids) cur.close() except (Exception, psycopg2.DatabaseError) as e: @@ -241,7 +241,6 @@ async def get_matches(cur, ids): INNER JOIN submodpep mp2 ON si.pep2_id = mp2.id AND si.upload_id = mp2.upload_id WHERE si.upload_id = ANY(%s) AND si.pass_threshold = TRUE - AND si.rank = 1 AND mp1.link_site1 > 0 AND mp2.link_site1 > 0;""" cur.execute(query, [ids, ids]) @@ -435,6 +434,69 @@ async def get_all_peptides(cur, ids): cur.execute(query, [ids, ids]) return cur.fetchall() +@log_execution_time_async +@xiview_data_router.get('/get_xiview_peptides2', tags=["xiVIEW"]) +async def get_xiview_peptides2(project, file=None): + """ + Get all the peptides. + URLs have the following structure: + https: // www.ebi.ac.uk / pride / archive / xiview / get_xiview_peptides?project=PXD020453&file=Cullin_SDA_1pcFDR.mzid + Users may provide only projects, meaning we need to have an aggregated view. 
+ https: // www.ebi.ac.uk / pride / archive / xiview / get_xiview_peptides?project=PXD020453 + + :return: json of the peptides + """ + logger.info(f"get_xiview_peptides for {project}, file: {file}") + most_recent_upload_ids = await get_most_recent_upload_ids(project, file) + + conn = None + data = {} + error = None + + try: + conn = await get_db_connection() + cur = conn.cursor(cursor_factory=RealDictCursor) + data = await get_peptides2(cur, most_recent_upload_ids) + cur.close() + except (Exception, psycopg2.DatabaseError) as e: + logger.error(e) + return {"error": "Database error"}, 500 + finally: + if conn is not None: + conn.close() + + start_time = time.time() + json_bytes = orjson.dumps(data) + logger.info(f'peptides json dump time: {time.time() - start_time}') + log_json_size(json_bytes, "peptides") # slows things down a little + return Response(json_bytes, media_type='application/json') + + +@log_execution_time_async +async def get_peptides2(cur, ids): + query = """with submatch as (select pep1_id, pep2_id, upload_id from match where upload_id = ANY(%s) and pass_threshold = true), +pep_ids as (select pep1_id, upload_id from submatch union select pep2_id, upload_id from submatch), +subpp AS (select * from peptideevidence WHERE upload_id = ANY(%s)) +select mp.id, + cast(mp.upload_id as text) AS u_id, + mp.base_sequence AS seq, + array_agg(pp.dbsequence_id) AS prt, + array_agg(pp.pep_start) AS pos, + array_agg(pp.is_decoy) AS dec, + mp.link_site1 AS ls1, + mp.link_site2 AS ls2, + mp.mod_accessions as m_as, + mp.mod_positions as m_ps, + mp.mod_monoiso_mass_deltas as m_ms, + mp.crosslinker_modmass as cl_m from pep_ids pi +inner join modifiedpeptide mp on mp.upload_id = pi.upload_id and pi.pep1_id = mp.id + JOIN subpp AS pp + ON mp.id = pp.peptide_id AND mp.upload_id = pp.upload_id + GROUP BY mp.id, mp.upload_id, mp.base_sequence;""" + + cur.execute(query, [ids, ids]) + return cur.fetchall() + @log_execution_time_async 
@xiview_data_router.get('/get_xiview_proteins', tags=["xiVIEW"]) From 58e882120cfe6c8e2ceac955e2da6c6c5c28d5f6 Mon Sep 17 00:00:00 2001 From: colin combe Date: Tue, 13 Aug 2024 11:22:35 +0100 Subject: [PATCH 2/2] Make xiVIEW load matches with link_site1 = 0 (possible bug fix); pre-filter submodpep CTE on link_site1 > -1, which might speed up the matches query --- app/routes/xiview.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/app/routes/xiview.py b/app/routes/xiview.py index e01ce48..0e04781 100644 --- a/app/routes/xiview.py +++ b/app/routes/xiview.py @@ -224,7 +224,7 @@ async def get_results_metadata(cur, ids): async def get_matches(cur, ids): # todo - check whats going on with this rank =1 and pass_threshold = True in mascot data, rank =1 condition seems to speeds things up (but should be redundant) # todo - rename 'si' to 'm' - query = """WITH submodpep AS (SELECT * FROM modifiedpeptide WHERE upload_id = ANY(%s)) + query = """WITH submodpep AS (SELECT * FROM modifiedpeptide WHERE upload_id = ANY(%s) AND link_site1 > -1) SELECT si.id AS id, si.pep1_id AS pi1, si.pep2_id AS pi2, si.scores AS sc, cast (si.upload_id as text) AS si, @@ -241,8 +241,8 @@ async def get_matches(cur, ids): INNER JOIN submodpep mp2 ON si.pep2_id = mp2.id AND si.upload_id = mp2.upload_id WHERE si.upload_id = ANY(%s) AND si.pass_threshold = TRUE - AND mp1.link_site1 > 0 - AND mp2.link_site1 > 0;""" + AND mp1.link_site1 > -1 + AND mp2.link_site1 > -1;""" cur.execute(query, [ids, ids]) return cur.fetchall()