diff --git a/atlas/scaife_viewer/atlas/importers/treebanks.py b/atlas/scaife_viewer/atlas/importers/treebanks.py deleted file mode 100644 index b2cb458..0000000 --- a/atlas/scaife_viewer/atlas/importers/treebanks.py +++ /dev/null @@ -1,45 +0,0 @@ -""" -@@@ prepare for ingestion; -""" -import json - -from lxml import etree - - -def main(): - with open("tlg0012.tlg001.perseus-grc1.tb.xml") as f: - tree = etree.parse(f) - version = "urn:cts:greekLit:tlg0012.tlg001.perseus-grc2:" - to_create = [] - for sentence in tree.xpath(f"//sentence[@subdoc='1.1-1.7']"): - seen_urns = set() - sentence_obj = { - "urn": f'urn:cite2:exploreHomer:syntaxTree.v1:syntaxTree{sentence.attrib["id"]}', - "treebank_id": int(sentence.attrib["id"]), - "words": [], - } - for word in sentence.xpath(".//word"): - word_obj = { - "id": int(word.attrib["id"]), - "value": word.attrib["form"], - "head_id": int(word.attrib["head"]), - "relation": word.attrib["relation"], - } - cite = word.attrib["cite"] - if cite: - ref = cite.rsplit(":", maxsplit=1)[1] - seen_urns.add(f"{version}{ref}") - sentence_obj["words"].append(word_obj) - sentence_obj["references"] = sorted(list(seen_urns)) - to_create.append(sentence_obj) - - json.dump( - to_create, - open("syntax_trees_tlg0012.tlg001.perseus-grc2.json", "w"), - ensure_ascii=False, - indent=2, - ) - - -if __name__ == "__main__": - main() diff --git a/atlas/scaife_viewer/atlas/importers/treebanks_gorman.py b/atlas/scaife_viewer/atlas/importers/treebanks_gorman.py deleted file mode 100644 index fdec0d4..0000000 --- a/atlas/scaife_viewer/atlas/importers/treebanks_gorman.py +++ /dev/null @@ -1,70 +0,0 @@ -""" -@@@ prepare for ingestion; -""" -import json -import re - -from lxml import etree - -from scaife_viewer.atlas.backports.scaife_viewer.cts.utils import natural_keys -from scaife_viewer.atlas.urn import URN - - -def main(): - with open("gorman-plato.xml") as f: - tree = etree.parse(f) - - version = "urn:cts:greekLit:tlg0059.tlg002.perseus-grc2:" - to_create = [] - counter = 0 - for sentence in tree.xpath(f"//sentence"): - counter += 1 - seen_urns = set() - sentence_obj = { - "urn": f'urn:cite2:exploreHomer:syntaxTree.v1:syntaxTree{sentence.attrib["id"]}', - "treebank_id": int(sentence.attrib["id"]), - "words": [], - } - for word in sentence.xpath(".//word"): - word_obj = { - "id": int(word.attrib["id"]), - "value": word.attrib["form"], - "head_id": int(word.attrib["head"]), - "relation": word.attrib["relation"], - } - sentence_obj["words"].append(word_obj) - - # TODO: Consider constructing URNs from document_id - # cite = word.attrib.get("cite") - # if cite: - # ref = cite.rsplit(":", maxsplit=1)[1] - # seen_urns.add(f"{version}{ref}") - subdoc = sentence.attrib.get("subdoc") - if subdoc: - ref = re.match(r"\d+", subdoc).group() - seen_urns.add(f"{version}{ref}") - - references = sorted(seen_urns, key=lambda x: natural_keys(x)) - sentence_obj["references"] = references - # TODO: Resolve citation; for now, we'll just use the subdoc - # citation = "" - # if references: - # citation = URN(references[0]).passage - # if len(references) > 1: - # citation = f"{citation}-{URN(references[-1]).passage}" - sentence_obj.update( - {"references": references, "citation": subdoc,} - ) - - to_create.append(sentence_obj) - - json.dump( - to_create, - open("syntax_trees_tlg0059.tlg002.perseus-grc2.json", "w"), - ensure_ascii=False, - indent=2, - ) - - -if __name__ == "__main__": - main() diff --git a/atlas/scaife_viewer/atlas/importers/treebanks_ud.py b/atlas/scaife_viewer/atlas/importers/treebanks_ud.py deleted file mode 100644 index bbb2152..0000000 --- a/atlas/scaife_viewer/atlas/importers/treebanks_ud.py +++ /dev/null @@ -1,58 +0,0 @@ -""" -@@@ prepare for ingestion; -""" -import json - -import conllu - - -def main(): - # FIXME: Need to add conllu to deps - path = "grc_perseus-ud-perseus-grc2.conllu" - data = conllu.parse(open(path).read()) - - version = "urn:cts:greekLit:tlg0085.tlg001.perseus-grc2:" - meta = {} - to_create = [] - counter = 0 - for sentence in data: - counter += 1 - meta.update(sentence.metadata) - new_obj = {} - new_obj.update(meta) - - seen_urns = set() - sentence_id = int(new_obj["sent_id"].split("@")[1]) - - sentence_obj = { - "urn": f"urn:cite2:exploreHomer:syntaxTree.v1:syntaxTree{sentence_id}", - "treebank_id": sentence_id, - "words": [], - } - for token in sentence: - word_obj = { - "id": token["id"], - "value": token["form"], - "head_id": token["head"], - "relation": token["deprel"], - } - sentence_obj["words"].append(word_obj) - - # TODO: can't do cite or refs just yet, which will be required - # This is likely something we could do from that sent_id as another - # kind of lookup - sentence_obj.update( - {"references": [], "citation": str(sentence_id),} - ) - to_create.append(sentence_obj) - - json.dump( - to_create, - open("syntax_trees_tlg0085.tlg001.perseus-grc2.json", "w"), - ensure_ascii=False, - indent=2, - ) - - -if __name__ == "__main__": - main()