#!/usr/bin/env python3
import requests
import json
import gzip
import subprocess
from os import path, makedirs, listdir
from typing import Any
from datetime import datetime, timezone
from tempfile import TemporaryDirectory
from zipfile import ZipFile
from shutil import move
def _format_url(
    bucket: str = "google-code-archive",
    domain: str = "code.google.com",
    project: str = "earthsurfer",
    file: str = "project.json",
) -> str:
    return f"https://storage.googleapis.com/{bucket}/v2/{domain}/{project}/{file}"
def archive(
    project_name: str,
    domain: str = "code.google.com",
    do_gzip: bool = True,
) -> None:
    '''
    Archive a Google Code Archive project, converting it into a GitHub-like format.

    To archive <https://code.google.com/archive/p/earthsurfer/>, call archive("earthsurfer").
    The "domain" argument can be "code.google.com", "eclipselabs.org", or "apache-extras.org".
    '''
    base_path: str = path.join(".", "google-code", domain, project_name)
    makedirs(base_path, exist_ok=True)
    base_path = path.abspath(base_path)
    response: requests.Response = requests.get(_format_url(domain=domain, project=project_name, file="project.json"))
    response.raise_for_status()
    project_meta = response.json()
    _write_gzipable_json(path.join(base_path, "original_project.json"), project_meta, do_gzip=do_gzip)
    issue_id: int = 1
    makedirs(path.join(base_path, "issues"), exist_ok=True)
    while True:
        response = requests.get(_format_url(domain=domain, project=project_name, file=f"issues/issue-{issue_id}.json"))
        if response.status_code != 200:
            break
        issue = response.json()
        _write_gzipable_json(
            path.join(base_path, "issues", f"{issue['id']}.json"),
            {
                "title": issue["summary"],
                "state": {
                    "new": "open",
                    "accepted": "closed",
                }.get(issue["status"].lower(), "closed"),  # unknown statuses (e.g. "fixed", "wontfix") default to closed
                "labels": issue["labels"],
                "reactions": {"+1": issue["stars"]},
                "comments": [{
                    "user": comment["commenterId"],  # TODO: figure out how to get the name (especially since this id is project-specific)
                    "body": comment["content"],
                    # unclear whether `timestamp` is the creation or the edit date; treated as creation here
                    "created_at": datetime.fromtimestamp(comment["timestamp"], tz=timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ'),
                } for comment in issue["comments"]],
            },
            do_gzip=do_gzip,
        )
        issue_id += 1
    del issue_id
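    # each converted issue file looks roughly like this (values are illustrative):
    #   {"title": "...", "state": "open", "labels": [...], "reactions": {"+1": 2},
    #    "comments": [{"user": "...", "body": "...", "created_at": "2009-01-01T00:00:00Z"}]}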
    # cloning the exported git remote is completely broken and fails in most cases.
    # downloading the source archive (source code, sometimes with history) is the only
    # working method found so far, except for svndump, which only works with SVN repos.
    with TemporaryDirectory() as tmpdir:
        zip_file = path.join(tmpdir, "source.zip")
        _download_file(
            _format_url(bucket="google-code-archive-source", domain=domain, project=project_name, file="source-archive.zip"),
            zip_file,
            gzip_result=False,
        )
        with ZipFile(zip_file, "r") as zf:
            # extractall's "path" argument selects the target directory ("pwd" is the archive password)
            zf.extractall(tmpdir)
        subdir_name: str = [i for i in listdir(tmpdir) if i != "source.zip"][0]
        if path.isdir(gitdir := path.join(tmpdir, subdir_name, ".git")):
            subprocess.run(
                ["git", "clone", "--mirror", gitdir, "git"],
                cwd=base_path,
                check=True,
            )
        else:
            move(zip_file, path.join(base_path, "google-code-archive-source.zip"))
    # wikis don't work even on the website: https://code.google.com/archive/p/earthsurfer/wikis
    # downloads don't work even on the website: https://code.google.com/archive/p/earthsurfer/downloads
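# resulting on-disk layout (a sketch for archive("earthsurfer") with do_gzip=True):
#   ./google-code/code.google.com/earthsurfer/
#       original_project.json.gz
#       issues/1.json.gz, 2.json.gz, ...
#       git/                             # bare mirror, if the source export contained a .git directory
#       google-code-archive-source.zip   # the raw zip otherwise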
def _download_file(url: str, local_file: str, gzip_result: bool = False) -> None:
    """
    gzip_result gzips the downloaded bytes unconditionally;
    the caller must pick a matching filename and avoid compressing twice.
    """
    # streamed download, per https://stackoverflow.com/a/16696317
    with requests.get(url, stream=True) as r:
        r.raise_for_status()  # TODO: handle errors instead of just raising
        opener = gzip.open if gzip_result else open
        with opener(local_file, "wb") as fp:
            for chunk in r.iter_content(chunk_size=8192):
                fp.write(chunk)
def _write_gzipable_json(filepath: str, jsondata: Any, do_gzip: bool = True) -> None:
    if do_gzip:
        with gzip.open(f"{filepath}.gz", "wt") as fp:
            json.dump(jsondata, fp)
    else:
        with open(filepath, "w") as fp:
            json.dump(jsondata, fp)
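# a file written with do_gzip=True can be read back with, for example:
#   with gzip.open("original_project.json.gz", "rt") as fp:
#       data = json.load(fp)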
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(
        prog="google-code-archive-backuper",
        description="Back up a project from the Google Code Archive",
    )
    parser.add_argument("project_name", help="Name of the project. Example: earthsurfer", type=str)
    parser.add_argument(
        "--domain",
        type=str,
        default="code.google.com",
        choices=["code.google.com", "eclipselabs.org", "apache-extras.org"],
        help="Domain the project was published under",
    )
    args = parser.parse_args()
    archive(project_name=args.project_name, domain=args.domain, do_gzip=True)
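# example invocations (script name assumed; project names other than earthsurfer are illustrative):
#   ./google-code-archive-backuper.py earthsurfer
#   ./google-code-archive-backuper.py some-project --domain eclipselabs.org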