#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# File: parseJson2.py
# Author: Pavel Raur (xraurp00@stud.fit.vutbr.cz)
# Project: wikidata2
# Description: Parses a wikidata json dump to tsv. Uses classes as entity ids.
# Builds a relation graph of entities and sorts types according to the most specific class.
import argparse
import hashlib # generate temp file name
import json # load data from dump
import os # filesystem
import re # find ids for substitution
import sys # stderr, exit, ...
import time # generate temp file name, timestamps
import traceback # for printing exceptions
# get script name
SCRIPT_NAME = os.path.basename(sys.argv[0])
def get_args():
"""
Parses arguments from command line.
:return: parsed arguments
"""
argparser = argparse.ArgumentParser(
description="Parses a wikidata json dump to tsv and extracts wikidata class relations."
)
argparser.add_argument(
"-f",
"--input-file",
help="Input file to process.",
required=True,
type=argparse.FileType("r"),
)
argparser.add_argument(
"-o",
"--output-file",
help="TSV output file.",
required=True,
type=argparse.FileType("w"),
)
argparser.add_argument(
"-d",
"--dict-file",
help="Output dictionary file, with entity names.",
required=False,
)
argparser.add_argument(
"-c",
"--class-relations-dump",
help="Class relations dump file.",
required=False,
)
argparser.add_argument(
"--show-missing",
help="Display ids with missing translation.",
required=False,
default=False,
action="store_true",
)
argparser.add_argument(
"-b",
"--buffer-level",
help="Specifies how much memory buffering is used. "
"Level 0: No buffering. "
"Level 1: Buffer dictionary. "
"Level 2: Buffer dictionary and parsed entities.",
required=False,
default=0,
type=int,
)
argparser.add_argument(
"--full-paths",
help="Sets type of entities to show full path to entity class instead of just list of all types.",
required=False,
default=False,
action="store_true",
)
argparser.add_argument(
"--keep-root-class",
help="If this option is used, root class is not removed from list of ids in type of entity.",
required=False,
default=False,
action="store_true",
)
argparser.add_argument(
"--root-class-id",
help="Defines root class id that will be removed from list of ids in type of entity."
" Default: Q35120 (entity)",
required=False,
default="Q35120",
)
argparser.add_argument(
"--no-cleanup",
help="Disable automatic removal of temporary files.",
required=False,
default=False,
action="store_true",
)
# create group for arguments that should not be given together
# this automatically handles bad argument combinations
parsing_restrictions_group = argparser.add_mutually_exclusive_group()
parsing_restrictions_group.add_argument(
"-n",
"--number-of-entities",
help="Parse only given number of entities.",
required=False,
type=int,
default=None,
)
parsing_restrictions_group.add_argument(
"-l",
"--line",
help="Parse only line with given number.",
required=False,
type=int,
default=None,
)
# mutually exclusive group for parsing sections
parsing_sections = argparser.add_mutually_exclusive_group()
parsing_sections.add_argument(
"--parse-only",
help="Only parse wikidata dump to tsv, without type and name substitution. Can also generate class relations "
"dump and dictionary file.",
default=False,
action="store_true",
)
parsing_sections.add_argument(
"--substitute-type-only",
help="Only substitute type of entities. Input must be already parsed dump without translated names. "
"Class relations dump must be specified!",
default=False,
action="store_true",
)
parsing_sections.add_argument(
"--substitute-names-only",
help="Translates wikidata ids for entity names. Input file must be already parsed dump. Dictionary file must "
"be specified!",
default=False,
action="store_true",
)
return argparser.parse_args()
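# Example invocation (hypothetical file paths):
#   ./parseJson2.py -f wikidata-dump.json -o entities.tsv -d dict.tsv -c classes.json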
class WikidataDumpManipulator:
"""
Includes common helper functions for manipulating the wikidata dump
"""
@staticmethod
def gen_multival_field(*args):
"""
Generates multiple value field
:param args: list of input arguments
:return: string where arguments are separated by |
"""
return "|".join(arg for arg in args if arg)
@staticmethod
def write_entity_to_tsv(entity, file):
"""
Writes entity to tsv file
:param entity: entity to write
:param file: file where entity will be written to
"""
for i in range(0, len(entity) - 1):
file.write(entity[i] + "\t") # add tabs between fields
file.write(entity[-1] + "\n") # add eol after last field
class WikidataDumpParser(WikidataDumpManipulator):
"""
Parses wikidata dump and builds class relations
"""
def __init__(
self,
input_file,
output_file=None,
dict_file=None,
lang="en",
class_relations_builder=None,
):
# files
self.input_file = input_file
self.output_file = output_file
self.dict_file = dict_file
# data structures
self.dictionary = {} # dictionary for name substitution
self.entities = [] # list of entities used if buffer_entities is true
self.class_relations_builder = (
class_relations_builder # class relations builder instance
)
# counters
self.line_number = 1
self.processed_records = 0
self.corrupted_records = 0
# setup
self.lang = lang # parsed labels language
self.buffer_entities = False # store parsed entities in memory
self.buffer_dictionary = False # store dictionary in memory
self.max_entities = 0 # maximum number of entities to parse (0 = disabled)
self.dump_line = 0 # dump single line (0 = disabled)
@property
def max_entities(self):
try:
return self._max_entities
except AttributeError: # if used before value is set
return 0
@max_entities.setter
def max_entities(self, value):
if type(value) != int:
raise TypeError("Value has to be int!")
if self.dump_line:
raise ValueError("dump_line and max_entities are mutually exclusive!")
self._max_entities = value
@property
def dump_line(self):
try:
return self._dump_line
except AttributeError: # if used before value is set
return 0
@dump_line.setter
def dump_line(self, value):
if type(value) != int:
raise TypeError("Value has to be int!")
if self.max_entities:
raise ValueError("dump_line and max_entities are mutually exclusive!")
if value < 0:
raise ValueError("Value has to be greater than or equal to 0!")
self._dump_line = value
def parse_record(self, record):
"""
Parses one json record from wikidata dump file
:param record: record converted to dict
:return: parsed entity
Parsed data:
0 ID
1 INSTANCE OF (MULTIPLE VALUES) (TYPE of entity)
2 NAME
3 DISAMBIGUATION NAME
4 ALIASES (MULTIPLE VALUES)
5 DESCRIPTION
6 ROLES (MULTIPLE VALUES)
7 FICTIONAL
8 WIKIPEDIA URL
9 WIKIDATA URL
10 DBPEDIA URL
11 IMAGES (MULTIPLE VALUES)
"""
entity = ["", "", "", "", "", "", "", "", "", "", "", ""]
# id
if "id" in record:
entity[0] = record["id"]
if not entity[0]: # sometimes id is empty
return None
else: # fail if identifier is missing
return None
# name
if "labels" in record and self.lang in record["labels"]:
entity[2] = record["labels"][self.lang]["value"]
# aliases
if "aliases" in record and self.lang in record["aliases"]:
for value in record["aliases"][self.lang]:
entity[4] = self.gen_multival_field(entity[4], value["value"])
# description
if "descriptions" in record and self.lang in record["descriptions"]:
entity[5] = record["descriptions"][self.lang]["value"]
# instance of and image
if "claims" in record:
if "P31" in record["claims"]: # P31 == instance of
for statement in record["claims"]["P31"]:
try:
if (
statement["mainsnak"]["datavalue"]["value"]["entity-type"]
== "item"
):
entity[1] = self.gen_multival_field(
entity[1],
statement["mainsnak"]["datavalue"]["value"]["id"],
)
except KeyError:
# no such value in the current statement - skip
pass
if "P18" in record["claims"]: # P18 == image
for picture in record["claims"]["P18"]:
try:
# commonsMedia is the datatype for names of media files (pictures)
if (
picture["mainsnak"]["datavalue"]["datatype"]
== "commonsMedia"
):
entity[11] = self.gen_multival_field(
entity[11], picture["mainsnak"]["datavalue"]["value"]
)
except KeyError:
# can't extract image name - value not present - skip
pass
# add relation to relations builder
if "P279" in record["claims"]: # P279 == subclass of
for statement in record["claims"]["P279"]:
try:
# subclass of
if (
statement["mainsnak"]["datavalue"]["value"]["entity-type"]
== "item"
):
if self.class_relations_builder:
self.class_relations_builder.add_ancestor(
record["id"], # entity id
statement["mainsnak"]["datavalue"]["value"]["id"],
) # id of related entity
except KeyError:
# can't extract related class - value is not present - skip
pass
# wikipedia url
if "sitelinks" in record and self.lang + "wiki" in record["sitelinks"]:
try:
entity[8] = (
"http://"
+ self.lang
+ ".wikipedia.org/wiki/"
+ "_".join(record["sitelinks"][self.lang + "wiki"]["title"].split())
)
except KeyError: # title not found, record is corrupted
entity[8] = ""
# wikidata url
entity[9] = "https://www.wikidata.org/wiki/" + entity[0]
return entity
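# Illustrative example: a minimal record such as
#   {"id": "Q42", "labels": {"en": {"value": "Douglas Adams"}}}
# parses to an entity with entity[0] == "Q42", entity[2] == "Douglas Adams" and
# entity[9] == "https://www.wikidata.org/wiki/Q42"; all other fields stay empty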
def parse_wikidump(self):
"""
Parses wikidata dump file to tsv and creates dictionary for name substitution
:return: execution status
"""
for line in self.input_file:
if self.dump_line: # dump single line
if self.line_number != self.dump_line:
self.line_number += 1
continue
if len(line) > 2:  # skip the first and last lines, which contain only a bracket
if line[-2] == ",":
line = line[:-2]  # remove trailing comma and newline
else:
line = line[:-1]  # last record has no trailing comma, remove only the newline
try:
record = json.loads(line) # convert to dictionary
except json.JSONDecodeError:
self.corrupted_records += 1
else:
entity = self.parse_record(record)
if entity is None: # entity missing id
self.corrupted_records += 1
else:
if self.output_file: # write entity to output file
self.write_entity_to_tsv(entity, self.output_file)
if self.buffer_entities: # add to memory list of entities
self.entities.append(entity)
# generate dictionary file to replace IDs for names
# written fields: entity[0] == id, entity[2] == name
if entity[2]: # check if name is not empty
if self.dict_file: # add to dictionary file
self.write_entity_to_tsv(
[entity[0], entity[2]], self.dict_file
)
if (
self.buffer_dictionary
): # add to memory dictionary for name substitution
self.dictionary[entity[0]] = entity[2]
self.processed_records += 1
# if dumping single line, break cycle after done
if self.dump_line:
break
# maximum number of entities reached
elif self.max_entities and self.max_entities <= self.processed_records:
break
self.line_number += 1
return 0
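# Usage sketch (hypothetical file names):
#   with open("dump.json") as inp, open("out.tsv", "w") as out:
#       parser = WikidataDumpParser(input_file=inp, output_file=out)
#       parser.parse_wikidump()
#       print(parser.processed_records, parser.corrupted_records)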
class WikidataNameInterchanger(WikidataDumpManipulator):
"""
Substitutes wikidata ids for names in parsed dump
"""
def __init__(
self,
output_file,
input_file=None,
dict_file=None,
dictionary=None,
dump=None,
show_missing=False,
exclude=(0, 8, 9, 10, 11),
remove_missing=False,
):
# files
self.output_file = (
output_file  # output file where translated data will be written
)
self.input_file = input_file # input file with entities for translation
self.dict_file = dict_file # file with dictionary used for translation
# memory
self.dictionary = dictionary # dictionary buffered in memory
self.dump = dump # dump buffered in memory
self.show_missing = show_missing # enables additional info collecting
# ids without translation are added to this list if show_missing == True
# an id without translation is any id from the KB that is not found in the dictionary
self.ids_without_translation = []
# removes ids with missing name instead of keeping them in KB
self.remove_missing = remove_missing
# excluded field indexes
self.excluded = exclude # array or tuple of indexes of fields where names will not be substituted
# self.excluded: useful for site urls, ids, picture/file names, etc.
@property
def show_missing(self):
return self._show_missing
@show_missing.setter
def show_missing(self, value):
if type(value) is not bool:
raise TypeError("Value must be bool!")
else:
self._show_missing = value
def substitute_names(self):
"""
Substitutes ids in parsed records for names matching this id from dictionary file.
Note that if both the input_file and dump variables are set, the function will use the file
rather than the in-memory dump buffer! To use the buffer, don't set the input_file variable!
:raise ValueError if data source is not set
:return: execution status
"""
if self.input_file: # input is read from file
input_data = self.input_file
elif self.dump: # input is read from list in memory
input_data = self.dump
else:
raise ValueError("Input data source is not set up!")
if not self.dictionary: # load dictionary from file
self.load_dict_from_file()
for line in input_data:
if type(line) == str: # if input is tsv file line, split it to list by tabs
line = line[:-1] # remove newline from end of the line
line = line.split("\t")
for i in range(len(line)):
if (
i in self.excluded
): # skip entity id, urls, file names, etc. (indexes defined in self.excluded)
continue
values = line[i].split("|") # split multiple value fields
results = [] # id translations
for value in values:
# values that are not wikidata ids are directly added to the results
if not re.fullmatch(r"^Q\d+$", value):
results.append(value)
continue
try: # translate id
results.append(self.dictionary[value])
except KeyError: # entity name is not in dictionary
if (
self.show_missing
and value not in self.ids_without_translation
):
self.ids_without_translation.append(value)
# add id to result if ids with missing translation should not be removed
if not self.remove_missing:
results.append(value)
line[i] = "|".join(results) # join results back to line field
self.write_entity_to_tsv(line, self.output_file)
return 0
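# Example: with dictionary {"Q5": "human"} and the default excluded fields, an input
# line "Q42\tQ5\tDouglas Adams\t..." is written out with field 1 translated to "human",
# while field 0 ("Q42") is left untouched because index 0 is excluded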
def load_dict_from_file(self):
"""
Loads dictionary from file
:raise ValueError if dictionary file is not set or if dictionary is loaded already
"""
if not self.dict_file:
raise ValueError("Dictionary file not set!")
if self.dictionary:
raise ValueError("Dictionary already full!")
else:
self.dictionary = {}
for line in self.dict_file:
line = line[:-1] # remove newline
line = line.split("\t")
self.dictionary[line[0]] = line[1]
return 0
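# Expected dictionary file format: one "id<TAB>name" pair per line, e.g. "Q42\tDouglas Adams"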
class ClassRelationsBuilder(WikidataDumpManipulator):
"""
Builds class relation graph and replaces type of entities by class paths.
"""
def __init__(self, classes=None, dump_file=None, output_file=None):
"""
Initializes class dictionary
:param classes: dictionary with existing class relations
:param dump_file: file with dump of class relations
:param output_file: file where to put processed entities
"""
self.classes = classes if classes else {} # dictionary with class relations
self.dump_file = dump_file # file with dump of class relations
self.output_file = output_file # file where to put processed entities
def add_class(self, class_id):
"""
Adds new class to dictionary.
:param class_id: id of the class
"""
if class_id not in self.classes:
self.classes[class_id] = {}
self.classes[class_id]["successors"] = []
self.classes[class_id]["ancestors"] = []
def add_ancestor(self, class_id, ancestor_id):
"""
Adds new ancestor to class relations.
:param class_id: id of the class the ancestor will be added to
:param ancestor_id: ancestor class id
"""
if class_id not in self.classes: # create new class item in dictionary
self.add_class(class_id)
# add ancestor class to list of ancestors
self.classes[class_id]["ancestors"].append(ancestor_id)
def add_successor(self, class_id, successor_id):
"""
Adds new successor to class relations.
:param class_id: id of the class the successor will be added to
:param successor_id: successor class id
"""
if class_id not in self.classes: # create new class item in dictionary
self.add_class(class_id)
# add successor class to list of successors
self.classes[class_id]["successors"].append(successor_id)
def clear_successors(self, class_id):
"""
Removes all successors of the class.
:param class_id: id of the class whose successors will be removed
"""
if class_id in self.classes:
self.classes[class_id]["successors"] = []
def clear_ancestors(self, class_id):
"""
Removes all ancestors of the class.
:param class_id: id of the class whose ancestors will be removed
"""
if class_id in self.classes:
self.classes[class_id]["ancestors"] = []
def remove_class(self, class_id):
"""
Removes class from dictionary.
:param class_id: id of the class to remove
"""
if class_id in self.classes:
self.classes.pop(class_id)
def dump(self):
"""
Writes class relations dictionary to json file.
Doesn't do anything if file is not set.
:raise IOError if fails to write to file
"""
if self.dump_file:
json.dump(self.classes, self.dump_file, indent=4, sort_keys=True)
def load_dump(self, dump_file):
"""
Loads class relations from dump file
:param dump_file: dump_file descriptor
:raise IOError if fails to read from file
"""
self.classes = json.load(dump_file)
def get_path_to_class(self, current_class, closed_nodes):
"""
Returns path from root class to current class (recursively)
:param current_class: class for which path is returned
:param closed_nodes: classes that are successors of current class, used for cyclic dependency check
:raise NameError if class is not found
:return: list of all paths to current class from root entity
"""
# class data are not parsed / class is the root class or has no ancestors
if (
current_class not in self.classes
or len(self.classes[current_class]["ancestors"]) <= 0
):
return [current_class]
# class appears among its own successors (cyclic dependency to itself)
if current_class in closed_nodes:
return [current_class]
closed_nodes.append(current_class) # append current class to closed
all_paths = [] # all paths to this class
for ancestor in self.classes[current_class]["ancestors"]:
paths_to_ancestor = self.get_path_to_class(
ancestor, closed_nodes
) # get all paths to ancestor
for (
path
) in paths_to_ancestor: # append path to paths and add current class to end
all_paths.append(path + "->" + current_class)
closed_nodes.pop() # pop current class from the list before return
return all_paths
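# Example: if Q5 has the single ancestor Q35120 and Q35120 itself has no parsed
# ancestors, get_path_to_class("Q5", []) returns ["Q35120->Q5"]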
def get_full_paths(self, entity_type):
"""
Returns list of paths from root class to direct types of entity
:param entity_type: type of entity (class that entity is instance of)
:return: list of all paths from root class to direct types of entity sorted by classes in path
"""
# list of paths
paths = []
# get all paths to each type
for t in entity_type:
paths.extend(self.get_path_to_class(t, []))
# sort by path
paths.sort()
# remove duplicates
if len(paths) > 1:
i = 1
while i < len(paths):
if paths[i] == paths[i - 1]:
paths.pop(i)
else:
i += 1
return paths
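# Example: for the same Q5 -> Q35120 relation as above, get_full_paths(["Q5"])
# returns ["Q35120->Q5"] (sorted, with duplicate paths removed)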
def get_parent_classes(self, current_class, depth, closed_nodes):
"""
Returns the current class and all parent classes on the path to the root class.
Includes class depth for sorting.
:param current_class: class for which parent is returned
:param depth: depth of current class
:param closed_nodes: classes that are successors of current class and will not be expanded (detects cycles)
:return: list of all parents of current class and their depth
"""
classes = [
[current_class, depth]
] # classes to return (with current class added)
# class data are not parsed / class is the root class or has no ancestors
if (
current_class not in self.classes
or len(self.classes[current_class]["ancestors"]) <= 0
):
return classes
# class appears among its own successors (cyclic dependency to itself)
if current_class in closed_nodes:
return classes
closed_nodes.append(current_class) # append current class to closed
for ancestor in self.classes[current_class]["ancestors"]: # add all parents
classes.extend(self.get_parent_classes(ancestor, depth + 1, closed_nodes))
closed_nodes.pop() # pop current class from the list before return
return classes
def get_all_parents(self, entity_type, remove_root_class, root_class):
"""
Returns a list of classes sorted according to how specific the type represented by each class
is for the current entity
:param entity_type: type of entity (class that entity is instance of)
:param remove_root_class: if true, root class will be removed from list
:param root_class: root class id
:return: list of all types sorted according to how far the type is from the entity (specificity)
"""
# list of all types
new_types = []
# add all parents of direct types to new types
for t in entity_type:
new_types.extend(self.get_parent_classes(t, 0, []))
# FORMAT:
# [ class_name , depth_number ]
# sort according to the type name
new_types.sort(key=lambda x: x[0])
# NOTE:
# the following constructions can't use for loops
# because the size of the list can change during iteration
# remove root class from list
if remove_root_class and root_class:
i = 0
while i < len(new_types):
if new_types[i][0] == root_class:
new_types.pop(i)
else:
i += 1
# remove duplicates, keep only those with the lowest depth
if len(new_types) >= 2:
i = 1
while i < len(new_types):
if new_types[i][0] == new_types[i - 1][0]:
if new_types[i][1] > new_types[i - 1][1]:
new_types.pop(i)
else:
new_types.pop(i - 1)
else:
i += 1
# sort according to depth, ascending (most specific type first)
new_types.sort(key=lambda x: x[1])
# remove depth numbers before return
return [t[0] for t in new_types]
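# Example: with Q5 -> Q35120, get_all_parents(["Q5"], False, None) returns
# ["Q5", "Q35120"]: the direct type (depth 0) precedes its ancestor (depth 1)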
def replace_types_of_entities(
self, entities, full_path=False, remove_root_class=False, root_class=None
):
"""
Replaces type of each entity on given field number by complete path to entity class.
:param entities: entities in array or file descriptor to file with entities
:param full_path: defines if type will be paths to entity class from root class
or just list of all types and supertypes
:param remove_root_class: if true, root class will be removed from list (used only if full_paths=False)
:param root_class: root class id (used only if full_paths=False)
"""
file = False
for entity in entities:
if (
type(entity) == str
): # if input is tsv file line, split it to list by tabs
entity = entity[:-1] # remove newline from end of the line
entity = entity.split("\t")
file = True
types = entity[1].split(
"|"
) # 1 == type of entity (see parser documentation)
if full_path:
types = self.get_full_paths(types)
else:
types = self.get_all_parents(types, remove_root_class, root_class)
types = "|".join(types)
entity[1] = types
if file:
self.write_entity_to_tsv(entity, self.output_file)
def gen_temp_file(file_path, mode, tag=""):
"""
Generates temporary file in folder given by path
:param file_path: path to the file that will be appended before the name
:param mode: mode to open the file with (read/write/...), compatible with the built-in open() function;
only modes that create the file make sense, because it doesn't exist before opening
:param tag: string that is added to the name before suffix - hash_tag.suffix
:return: opened temp file handle
"""
if not os.path.isdir(file_path):
return None
# generate unique file name
file_name = (
hashlib.sha1(str(time.time()).encode()).hexdigest() + "_" + str(tag) + ".temp"
)
while os.path.exists(os.path.join(file_path, file_name)):
file_name = (
hashlib.sha1(str(time.time()).encode() + file_name.encode()).hexdigest()
+ "_"
+ str(tag)
+ ".temp"
)
# return file handle
return open(os.path.join(file_path, file_name), mode)
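# Usage sketch (hypothetical directory): gen_temp_file("/tmp", "w", tag="dict")
# opens something like /tmp/<sha1-hex>_dict.temp for writing, or returns None
# if the directory does not exist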
def parse_only(args):
"""
Parse wikidata dump to tsv, do not substitute type and entity names
:param args: arguments parsed by argparse
:return: execution status
"""
# open files
class_relations_dump = None
dict_file = None
if args.class_relations_dump:
try:
class_relations_dump = open(args.class_relations_dump, "w")
except Exception:
sys.stderr.write(
SCRIPT_NAME
+ ": Failed to open class relations dump! Handled error: "
+ str(traceback.format_exc())
+ "\n"
)
args.output_file.close()
args.input_file.close()
return 1
if args.dict_file:
try:
dict_file = open(args.dict_file, "w+")
except Exception:
sys.stderr.write(
SCRIPT_NAME
+ ": Failed to open dict file! Handled error: "
+ str(traceback.format_exc())
+ "\n"
)
args.output_file.close()
args.input_file.close()
if args.class_relations_dump:
class_relations_dump.close()
return 1
relations_builder = ClassRelationsBuilder(dump_file=class_relations_dump)
parser = WikidataDumpParser(
input_file=args.input_file,
output_file=args.output_file,
dict_file=dict_file,
class_relations_builder=relations_builder,
)
return_code = 0
try:
start_time = time.time()
parser.parse_wikidump() # parse wikidata dump to tsv and dictionary
relations_builder.dump() # dump entity relations to file
end_time = time.time()
print("Processed entities: " + str(parser.processed_records))
print("Corrupted entities: " + str(parser.corrupted_records))
print("Start: " + time.ctime(start_time))
print("End: " + time.ctime(end_time))
print(
"Total execution time: "
+ time.strftime("%H:%M:%S", time.gmtime(end_time - start_time))
)
except Exception:
sys.stderr.write(
SCRIPT_NAME
+ ": Failed to parse wikidata dump! Handled error:\n"
+ str(traceback.format_exc())
+ "\n"
)
return_code = 1
finally:
args.output_file.close()
args.input_file.close()
if args.dict_file:
dict_file.close()
if args.class_relations_dump:
class_relations_dump.close()
return return_code
def substitute_names_only(args):
"""
Substitutes names in already parsed dump
:param args: arguments parsed by argparse
:return: execution status
"""
if not args.dict_file:
sys.stderr.write("Dict file not set! Cannot substitute names!\n")
return 2
else:
try:
dict_file = open(args.dict_file, "r")
except Exception:
sys.stderr.write(
SCRIPT_NAME
+ ": Failed to open dictionary file! Handled error:\n"
+ str(traceback.format_exc())
+ "\n"
)
args.output_file.close()
args.input_file.close()
return 1
name_changer = WikidataNameInterchanger(
input_file=args.input_file,
dict_file=dict_file,
output_file=args.output_file,
show_missing=args.show_missing,
)
return_code = 0
try:
start_time = time.time()
name_changer.substitute_names()
end_time = time.time()
if args.show_missing:
print(
"Number of entities without name: "
+ str(len(name_changer.ids_without_translation))
)
for entity_id in name_changer.ids_without_translation:
sys.stdout.write(str(entity_id) + " ")
sys.stdout.write("\n")
print("Start: " + time.ctime(start_time))
print("End: " + time.ctime(end_time))
print(
"Total execution time: "
+ time.strftime("%H:%M:%S", time.gmtime(end_time - start_time))
)
except Exception:
sys.stderr.write(
SCRIPT_NAME
+ ": Failed to substitute ids for names! Handled error:\n"
+ str(traceback.format_exc())
+ "\n"
)
return_code = 1
finally:
args.output_file.close()
args.input_file.close()
dict_file.close()
return return_code
def substitute_type_only(args):
"""
Substitutes types of entities in parsed dump without ids translated to names
:param args: arguments parsed by argparse
:return: execution status
"""
if not args.class_relations_dump:
sys.stderr.write("Relations dump not set! Cannot substitute types!\n")
return 2
else:
try:
class_relations_dump = open(args.class_relations_dump, "r")
except Exception:
sys.stderr.write(
SCRIPT_NAME
+ ": Failed to open dictionary file! Handled error:\n"
+ str(traceback.format_exc())
+ "\n"
)
args.output_file.close()
args.input_file.close()
return 1