-
Notifications
You must be signed in to change notification settings - Fork 2
/
COVID19DataPortal_XMLFromBBMRIDirectory.py
executable file
·131 lines (104 loc) · 6.54 KB
/
COVID19DataPortal_XMLFromBBMRIDirectory.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#!/usr/local/bin/python3.6
"""
Script for creating XML for COVID19DataPortal from Directory collections
"""
#############
## Imports ##
#############
# External
import pprint
import argparse
import xml.etree.ElementTree as ET
# Internal
from directory import Directory
# Names of the caches that are handed to Directory (as purgeCaches) when
# --purge-all-caches is given on the command line.
cachesList = ['directory', 'geocoding']

#####################
## Parse arguments ##
#####################
parser = argparse.ArgumentParser()
# NOTE(review): the help text of --purge-all-caches talks about disabling remote
# checks, while the option actually stores the list of caches to purge - confirm wording.
parser.add_argument('--purge-all-caches', dest='purgeCaches', action='store_const', const=cachesList, help='disable all long remote checks (directory and geocoding)')
parser.add_argument('-o', '--outName', dest='outNameExcludedBiobanks', default='bbmri-directory-missing-COVID19DataPortal-Values.tsv', help='Output file name')
parser.add_argument('-x', '--outNameXML', dest='outXML', default='bbmriDirectory_Covid19DataPortal.xml', help='Output XML name')
parser.add_argument('-d', '--debug', dest='debug', action='store_true', help='debug information on progress of the data checks')
parser.add_argument('-v', '--verbose', dest='verbose', action='store_true', help='verbose information on progress of the data checks')
parser.add_argument('-p', '--password', dest='password', help='Password of the account used to login to the Directory')
parser.add_argument('-u', '--username', dest='username', help='Username of the account used to login to the Directory')
parser.add_argument('-P', '--package', dest='package', default='eu_bbmri_eric', help='MOLGENIS Package that contains the data (default eu_bbmri_eric).')
parser.set_defaults(disableChecksRemote=[], disablePlugins=[], purgeCaches=[])
args = parser.parse_args()

# Open output tsv file (did not add the path as option, so it always writes in the working directory).
# The encoding is fixed to UTF-8 so that non-ASCII biobank/collection names can
# always be written, independently of the locale of the machine running the script.
# The handle stays open on purpose: the main loop further down writes to it.
outFile = open(args.outNameExcludedBiobanks, 'w', encoding='utf-8')

# Get info from Directory
pp = pprint.PrettyPrinter(indent=4)
_directory_kwargs = dict(package=args.package, purgeCaches=args.purgeCaches, debug=args.debug, pp=pp)
# Credentials are only forwarded when both username and password were supplied.
if args.username is not None and args.password is not None:
    _directory_kwargs.update(username=args.username, password=args.password)
# NOTE: 'dir' shadows the builtin of the same name; the name is kept because
# the main loop of this script refers to it.
dir = Directory(**_directory_kwargs)
### Functions
# From https://stackoverflow.com/questions/749796/pretty-printing-xml-in-python/38573964#38573964
def prettify(element, indent=' '):
    """Indent an ElementTree element in place so it serialises on multiple lines.

    The ``text`` of every element that has children and the ``tail`` of every
    element in the tree (including ``element`` itself) are overwritten with a
    newline followed by the indentation of whatever tag comes next in the
    serialised document.

    Parameters:
        element: root ``xml.etree.ElementTree.Element`` of the (sub)tree.
        indent: string used for one indentation level.

    Returns:
        None, the tree is modified in place.
    """
    # Depth-first walk in document order: the top of the stack is always the
    # element whose opening tag follows the current one (if any is left).
    pending = [(0, element)]  # (level, element)
    while pending:
        level, node = pending.pop()
        children = list(node)
        if children:
            # Whitespace before the opening tag of the first child.
            node.text = '\n' + indent * (level + 1)
        # The next pending element is a sibling only when it sits on the same
        # level; a shallower one belongs to an ancestor, which means that this
        # node is the last child and is followed by its parent's closing tag.
        followed_by_sibling = bool(pending) and pending[-1][0] == level
        if followed_by_sibling:
            node.tail = '\n' + indent * level  # for sibling open
        else:
            node.tail = '\n' + indent * (level - 1)  # for parent close
        # Push children reversed so that the first child is handled next.
        pending.extend((level + 1, child) for child in reversed(children))
### Main
# Builds the COVID19DataPortal XML from the Directory collections. Collections
# lacking a mandatory value are skipped and listed in the tsv file instead.

# XML Level 1 - Database
xml_doc = ET.Element('database')
# Inside level 1
ET.SubElement(xml_doc, 'name').text = 'BBMRI-ERIC Directory'
ET.SubElement(xml_doc, 'description').text = 'The BBMRI-ERIC Directory is a tool that collects and makes available information about biobanks throughout Europe that are willing to share their data and/or samples, and to collaborate with other research groups. The Directory is one of the services offered by BBMRI-ERIC Common Services for IT (CS-IT) to the global biobank community and was created in collaboration with the BBMRI National Nodes and partners. The Directory provides a central listing of biobanks and their collections in the BBMRI-ERIC member states. For researchers, the Directory offers a means of finding samples and data, while for biobanks it offers a platform to share the existence of their holdings and services and to connect with researchers interested in them.'
# Level 2 - entries
entries = ET.SubElement(xml_doc, 'entries')
# Inside level 2
# Level 3 - entry (one per exported collection)
counter = 0
# The tsv handle is used as a context manager so that it is flushed and closed
# as soon as the last collection has been processed (or if the loop fails).
with outFile:
    for coll in dir.getCollections():
        # Only Covid-19 collections are exported.
        # For the moment capability collections are not included:
        if 'Covid-19' not in str(coll['categories']) or 'Ability to ' in coll['name']:
            continue
        # Inside level 3
        # For the moment, only collections with values for mandatory fields are
        # going to be included. If the collection is excluded its country,
        # biobank and name are written in a separate file.
        try:
            colldescrip = coll['description'].strip().replace('\n', ' ')
            colldate = coll['timestamp']
            try:
                collurl = coll['url']
            except KeyError:
                # No collection URL: fall back to the URL of its biobank.
                collurl = coll['biobank']['url']
        except KeyError:
            outFile.write(coll['country']['id'] + '\t' + coll['biobank']['name'] + '\t' + coll['name'] + '\n')
            continue
        counter += 1
        entry = ET.SubElement(entries, 'entry', id=coll['id'])
        ET.SubElement(entry, 'name').text = coll['name']
        ET.SubElement(entry, 'description').text = colldescrip
        dates = ET.SubElement(entry, 'dates')
        # Keep only the part of the timestamp that precedes the 'T' separator.
        ET.SubElement(dates, 'date', type='submission').text = colldate.split('T')[0]
        ET.SubElement(entry, 'repository').text = coll['biobank']['name']
        # EG-A: from the COVID19DataPortal documentation, I understand this as a external link, thus this is the one I am including here.
        # Alternative that was considered: 'https://directory.bbmri-eric.eu/#/collection/' + coll['id']
        ET.SubElement(entry, 'full_dataset_link').text = collurl

# Remaining level 1 elements; entry_count is the number of exported collections.
ET.SubElement(xml_doc, 'entry_count').text = str(counter)
ET.SubElement(xml_doc, 'keywords').text = 'biobank,samples,bbmri-directory'
ET.SubElement(xml_doc, 'url').text = 'https://directory.bbmri-eric.eu/'

# Write XML output
prettify(xml_doc)
tree = ET.ElementTree(xml_doc)
tree.write(args.outXML, encoding='UTF-8', xml_declaration=True)