# extract_uniprot_info.py
import xml.etree.ElementTree as ET
import pandas as pd
import requests
import gzip
import io
import os

# input settings: the folder path must end with "/" (paths are joined by plain concatenation)
path = 'path/to/folder/'
file = "interactors_stringDB_ID.xlsx"
column = "uniprot_ID_proteinB"
def get_rest_api(api_url):
    """Download a gzip-compressed TSV from a REST endpoint and return it as a dataframe."""
    call = requests.get(api_url, headers={'Accept-Encoding': 'gzip'})
    # fail loudly on a bad status code instead of silently returning None
    call.raise_for_status()
    # decompress the gzipped payload and decode it to text
    decompressed_content = gzip.decompress(call.content)
    data = decompressed_content.decode('utf-8')
    # parse the tab-separated text into a dataframe
    df = pd.read_csv(io.StringIO(data), sep="\t")
    return df
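# Example usage (a sketch; the URL follows the same stream-endpoint pattern used below
# and is illustrative only):
# df = get_rest_api("https://rest.uniprot.org/keywords/stream?compressed=true"
#                   "&fields=id%2Cname&format=tsv&query=%28%2A%29")
# print(df.head())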
# keyword API from UniProt
print("Getting Keywords from Uniprot ...")
keyword_url = "https://rest.uniprot.org/keywords/stream?compressed=true&fields=id%2Cname%2Ccategory%2Cgene_ontologies&format=tsv&query=%28%2A%29%20AND%20%28category%3A"
biol_processes = get_rest_api(keyword_url + "biological_process%29")
mol_function = get_rest_api(keyword_url + "molecular_function%29")
cell_comp = get_rest_api(keyword_url + "cellular_component%29")
# join the three dataframes + export
selected_keywords = pd.concat([biol_processes, mol_function, cell_comp])
selected_keywords.to_excel(path + "selected_keywords.xlsx")
# create list with keyword IDs
kw_list = selected_keywords["Keyword ID"]
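# kw_list now holds keyword accessions in UniProt's "KW-####" format
# (e.g. "KW-0297"; the specific value here is illustrative only).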
# function to import the xlsx file with accession numbers; returns all gathered info
def uniprotlist(path_to_folder, filename, column_name):
    print("Gathering info from Uniprot ...")
    df_og = pd.read_excel(io=path_to_folder + filename, engine="openpyxl")
    # remove duplicates (use the caller-supplied column instead of a hard-coded name)
    df = df_og.drop_duplicates(subset=column_name)
    # remove rows with missing values
    df = df.dropna(axis='index')
    # reset index; the original index is kept ("string_index")
    df = df.reset_index()
    # list of accession numbers
    id_list = df[column_name]
    # get info for each ID
    result_df = get_all_info(id_list)
    # join the results with the input df (skip the two leading index columns of the
    # input and the "ID" column of the results)
    export_df = df.iloc[:, 2:].join(result_df.iloc[:, 1:])
    return export_df
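# Example call (sketch, using the module-level settings defined above):
# annotated = uniprotlist(path, file, column)
# annotated.head()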
# function to get information from a list of accession numbers
def get_all_info(id_list):
    # collect one Series per accession and concatenate once at the end;
    # concatenating inside the loop worked for small datasets but triggered a
    # performance warning for large ones
    series_list = []
    for ID in id_list:
        print(ID)
        series_list.append(pd.Series(import_xml(ID), name=ID))
    result_df = pd.concat(series_list, axis=1)
    # transpose df -> one row per accession number, then add column titles
    result_df = result_df.transpose()
    result_df = result_df.reset_index(level=0)
    result_df.columns = ["ID",
                         "Gene Name",
                         "Protein Name",
                         "Species",
                         "EC Number",
                         "Uniprot Keyword",
                         "GO IDs",
                         "GO terms",
                         "dummy"]  # !!! edit this according to the extracted info!
    return result_df
# function to import the XML record for one accession number
def import_xml(accession_number, keyword_list=kw_list):
    # current UniProtKB REST endpoint (the legacy www.uniprot.org/uniprot/ URL redirects here)
    url = "https://rest.uniprot.org/uniprotkb/" + accession_number + ".xml"
    r = requests.get(url)
    r.raise_for_status()
    # parse the XML data
    root = ET.fromstring(r.text)
    # find the entry element
    entry = root.find(add_uniprot_url('./entry'))
    # extract selected information based on its path within the XML
    gene_name = get_info(entry, add_uniprot_url('./gene/name[@type="primary"]'))
    protein_name = get_info(entry, add_uniprot_url('./protein/recommendedName/fullName'))
    species = get_info(entry, add_uniprot_url('./organism/name[@type="scientific"]'))
    ec_number = get_info(entry, add_uniprot_url('./protein/recommendedName/ecNumber'))
    # an entry can carry several keywords; keep only those listed in keyword_list
    keywords = str()
    for key in keyword_list:
        temp = get_info(entry, add_uniprot_url('./keyword[@id="' + key + '"]'))
        # if there is a match, append it to the keywords string
        if not pd.isna(temp):
            keywords = keywords + "; " + temp
    # remove the leading "; "
    keywords = keywords[2:]
    # an entry can carry several GO terms; collect GO IDs and term names
    # (the term names are restricted to GO cross-references explicitly)
    GO_ids = get_GO_terms(entry, add_uniprot_url('./dbReference[@type="GO"]'), "id")
    GO_terms = get_GO_terms(entry, add_uniprot_url('./dbReference[@type="GO"]/property[@type="term"]'), "term")
    add_info = "dummy"
    return [gene_name, protein_name, species, ec_number, keywords, GO_ids, GO_terms, add_info]
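# Example (sketch; "P12345" stands in for any valid accession):
# import_xml("P12345") -> [gene name, protein name, species, EC number,
# "keyword1; keyword2", "GO:...; GO:...", "term1; term2", "dummy"],
# with NaN for fields that are absent from the entry.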
# function to prepend "{http://uniprot.org/uniprot}" to every element of an XML path
def add_uniprot_url(xml_path):
    long_path = xml_path.replace("/", "/{http://uniprot.org/uniprot}")
    return long_path
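# e.g. add_uniprot_url('./gene/name') returns
# './{http://uniprot.org/uniprot}gene/{http://uniprot.org/uniprot}name'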
# general function to get info from a UniProt entry based on the given path
def get_info(entry, xml_path):
    # no match, e.g. no ecNumber because the protein is not an enzyme
    if not entry.findall(xml_path):
        extracted_info = float("NaN")
    else:
        # if there are several matches, the last one is kept
        for info in entry.findall(xml_path):
            extracted_info = info.text
    return extracted_info
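# Example (sketch): get_info(entry, add_uniprot_url('./organism/name[@type="scientific"]'))
# returns the scientific species name, e.g. "Homo sapiens", or NaN if absent.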
def get_GO_terms(entry, xml_path, info_type):
    # gather all GO IDs or GO term names in one string
    # (info_type replaces the former parameter name "type", which shadowed the builtin)
    retrieved_GO = str()
    if not entry.findall(xml_path):
        retrieved_GO = float("NaN")
    else:
        for info in entry.findall(xml_path):
            if info_type == "id":
                retrieved_GO += "; " + info.get('id')
            if info_type == "term":
                retrieved_GO += "; " + info.get('value')
        # remove the leading "; " (only when matches were found; slicing NaN would fail)
        retrieved_GO = retrieved_GO[2:]
    return retrieved_GO
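# Example (sketch): with info_type="id" this returns a string like
# "GO:0004930; GO:0005886" (illustrative IDs); with "term" the matching term names.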
# function to extract all rows whose value in a given column contains a given string,
# used to identify GPCRs
def extract_rows(df, column, value):
    return df[df[column].notnull() & df[column].str.contains(value, na=False)]

# function to remove all rows whose value in a given column contains a given string,
# used to remove GPCRs from "all"
def remove_rows(df, column, value):
    return df[df[column].notnull() & ~df[column].str.contains(value, na=False)]
# main function: gather the info and export the result tables
def export_xlsx(path_to_folder, filename, column_name):
    result_df = uniprotlist(path_to_folder, filename, column_name)
    print("Saving results ...")
    # split the table into GPCR and non-GPCR proteins
    gpcr_df = extract_rows(result_df, 'Uniprot Keyword', 'G-protein coupled receptor')
    nogpcr_df = remove_rows(result_df, 'Uniprot Keyword', 'G-protein coupled receptor')
    # mirror the input folder structure for the output
    new_folder = path_to_folder.replace("/uniprot_ID/", "/uniprot_details/")
    os.makedirs(new_folder, exist_ok=True)
    # filename[:-5] strips the ".xlsx" extension
    result_df.to_excel(new_folder + filename[:-5] + "_all.xlsx")
    gpcr_df.to_excel(new_folder + filename[:-5] + "_gpcrs.xlsx")
    nogpcr_df.to_excel(new_folder + filename[:-5] + "_nogpcrs.xlsx")
    print("Finished")
# execute
export_xlsx(path, file, column)