-
Notifications
You must be signed in to change notification settings - Fork 0
/
scripts.py
79 lines (63 loc) · 2.28 KB
/
scripts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
"""Contains assorted scripts that have been used on the data-dump over time."""
from db import *
from parsers import PiecePage
from scraper import get_dl_path
from sqlalchemy.orm.exc import NoResultFound
import json
import os
import shutil
def isolate_reiner_files(target_folder='/mnt/choral/reiner_files/'):
    """Isolate the pieces and files Reiner was originally interested in.

    Reads the piece URLs stored in ``motets.json`` and ``renaissance.json``
    (opened relative to the current working directory), looks each piece up in
    the database by its ``/wiki/...`` URL and copies every score path of the
    pieces that were found into ``target_folder``. The part of a score path
    that follows ``/mnt/choral/downloads/`` is reused as the path below
    ``target_folder``.

    Args:
        target_folder: Absolute path to directory to put files in. Defaults to
            the location that used to be hard-coded in this function.

    Returns: None
    """
    with open('motets.json') as f:
        sources = json.load(f)
    with open('renaissance.json') as f:
        sources.extend(json.load(f))

    file_paths = []
    # Number of URLs without a matching Piece row; kept as a debugging aid.
    missed = 0
    session = DB_SESSION()
    try:
        for url in sources:
            # The database stores site-relative URLs, so drop scheme and host.
            rel_url = '/wiki/' + url.split('/wiki/')[-1]
            try:
                piece = session.query(Piece).filter(Piece.url == rel_url).one()
            except NoResultFound:
                missed += 1
                continue
            file_paths.extend(score.file_path for score in piece.scores)
    finally:
        # Only plain path values are needed from here on; release the session.
        session.close()

    # Figures observed on the original run:
    # missed == 13
    # len(file_paths) == 26560
    for file_path in file_paths:
        if not file_path:
            # Scores without a recorded path have nothing to copy.
            continue
        relative_path = file_path.split('/mnt/choral/downloads/')[-1]
        new_path = os.path.join(target_folder, relative_path)
        if os.path.isdir(file_path):
            shutil.copytree(file_path, new_path)
        else:
            # shutil.copytree only accepts directories, so a path pointing at
            # a single file is copied on its own after making sure its
            # destination directory exists.
            parent = os.path.dirname(new_path)
            if parent and not os.path.isdir(parent):
                os.makedirs(parent)
            shutil.copy2(file_path, new_path)
def re_parse_metadata():
    """Re-parse the stored HTML of every piece and rewrite its ``meta.json``.

    The first parsing pass lacked the special handling needed to extract
    movement names from pieces that have movements. Those names are not very
    standardised, but the information is required, so each piece is parsed
    again from its saved HTML dump.

    Pieces without stored JSON metadata are skipped. For every other piece the
    freshly parsed metadata takes over the ``scores`` entry of the old
    metadata and the piece URL, and is written as ``meta.json`` into the
    directory that ``get_dl_path`` returns for the old metadata.
    """
    db_session = DB_SESSION()
    for record in db_session.query(Piece):
        # Per-piece inputs for the parser
        page_url = record.url
        page_html = record.html_dump
        if record.json_metadata:
            previous = json.loads(record.json_metadata)

            # Parse the saved page again
            refreshed = PiecePage(page_url, page_html).parse_metadata()
            refreshed['scores'] = previous['scores']
            refreshed['url'] = record.url

            # Write the refreshed metadata to meta.json
            meta_path = os.path.join(get_dl_path(previous), 'meta.json')
            with open(meta_path, 'w') as out:
                json.dump(refreshed, out)
# Script entry point: running this module directly re-parses the metadata of
# every piece; importing it has no side effects.
if __name__ == '__main__':
    re_parse_metadata()