cldfbench_ewave.py

import itertools
import pathlib

from pycldf import StructureDataset, Reference
from cldfbench import Dataset as BaseDataset, CLDFSpec, Metadata


class MetadataWithTravis(Metadata):
    def markdown(self):
        """Insert a Travis CI build badge right below the README title."""
lines, title_found = [], False
for line in super().markdown().split('\n'):
lines.append(line)
if line.startswith('# ') and not title_found:
title_found = True
lines.extend([
'',
"[![Build Status](https://travis-ci.org/cldf-datasets/ewave.svg?branch=master)]"
"(https://travis-ci.org/cldf-datasets/ewave)"
])
return '\n'.join(lines)
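
# With the override above, the generated README carries the Travis badge right
# below its title, roughly like this (the title line is illustrative):
#
#   # CLDF dataset derived from eWAVE
#
#   [![Build Status](https://travis-ci.org/cldf-datasets/ewave.svg?branch=master)](https://travis-ci.org/cldf-datasets/ewave)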


class Dataset(BaseDataset):
dir = pathlib.Path(__file__).parent
id = "ewave"
metadata_cls = MetadataWithTravis

    def cldf_specs(self):  # A dataset must declare all CLDF sets it creates.
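        # The spec reuses the metadata of the raw CLDF download as a template and
        # writes the enriched dataset to cldf/ (self.cldf_dir).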
return CLDFSpec(
dir=self.cldf_dir,
module='StructureDataset',
default_metadata_path=self.raw_dir / 'StructureDataset-metadata.json')

    def cmd_download(self, args):
        pass  # nothing to download - the raw data is curated in this repository

    def cmd_makecldf(self, args):
#
# Augment the schema of the rather simplistic CLDF download:
#
ds = args.writer.cldf
# Add tables for controlled vocabularies:
ds.add_table('regions.csv', 'ID', 'Name')
ds.add_table('varietytypes.csv', 'ID', 'Name', 'Description')
ds.add_table('featurecategories.csv', 'ID', 'Name', 'Description')
ds.add_table('contributors.csv', 'ID', 'Name', 'URL', 'Address', 'Email')
# We merge the data from contributions.csv into languages.csv for simplicity:
ds.remove_table('contributions.csv')
# Varieties have a region, a type, an abbreviation and contributors.
ds.add_columns(
'LanguageTable',
'Description',
'Region_ID',
'Type_ID',
'abbr',
{'name': 'Contributor_ID', 'separator': ' '})
ds['LanguageTable'].add_foreign_key('Region_ID', 'regions.csv', 'ID')
ds['LanguageTable'].add_foreign_key('Type_ID', 'varietytypes.csv', 'ID')
ds['LanguageTable'].add_foreign_key('Contributor_ID', 'contributors.csv', 'ID')
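        # Contributor_ID is list-valued (space-separated), since a variety can have
        # several contributors; each listed ID must resolve in contributors.csv.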
# Features have a category and a typical example, with source.
ds.add_columns(
'ParameterTable',
'Category_ID',
'Example_Source',
{
'name': 'Attestation',
'datatype': 'float',
'dc:description':
"Attestation is a relative measure of how widespread a feature is in the set "
"of eWAVE varieties. It is expressed as a percentage and is calculated as the "
"sum of all A-, B- and C-ratings for a feature, divided by the number of "
"varieties in the eWAVE dataset. The closer the value to 100%, the more "
"widespread the feature is.",
},
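            # Illustration with made-up numbers: a feature with 20 A-, 10 B- and
            # 8 C-ratings in a 76-variety dataset has an Attestation of
            # (20 + 10 + 8) / 76 * 100 = 50.0.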
{
'name': 'Pervasiveness',
'datatype': 'float',
'dc:description': """\
Pervasiveness provides a measure of how pervasive a feature is on average in the varieties in
which it is attested. Pervasiveness is calculated as all A-ratings for a feature plus 0.6 times
the B-ratings for the same feature plus 0.3 times the C-ratings, divided by the sum of all
A-, B- and C-ratings for the feature. This value is then multiplied by 100 and expressed as a
percentage. A Pervasiveness value of 100% or close to 100% thus indicates that the feature is
highly pervasive (rated A) in all or most of the varieties for which it is attested, while a
value close to 30% (the lowest possible value) indicates that the feature is extremely rare
(rated C) in most or all of the varieties for which it is attested. Intermediate values are less
easy to interpret; here one has to look more closely at the ratio of A- to B- to C-values.
Two more things should also be noted here:
- The Pervasiveness value does not provide information on how widespread a feature is in the entire
eWAVE dataset, i.e. for how many varieties the feature is actually attested.
- Since the eWAVE contributors did not all use exactly the same strategies in deciding when to
give a feature an A- vs. a B- or a C- vs. a B- rating, it is very difficult to translate the
ratings into numerical values that adequately reflect the differences between A-, B- and
C-ratings. The choice made here (1 for A, 0.6 for B and 0.3 for C) is certainly only one of
many, and further testing is required to see how adequate this model is.
""",
},
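            # Same made-up numbers as above: Pervasiveness =
            # (20 * 1 + 10 * 0.6 + 8 * 0.3) / (20 + 10 + 8) * 100 = 28.4 / 38 * 100 ≈ 74.7.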
)
ds['ParameterTable'].add_foreign_key('Category_ID', 'featurecategories.csv', 'ID')
# Values may have (many) examples:
ds.add_columns(
'ValueTable',
{
'name': 'Example_ID',
'propertyUrl': 'http://cldf.clld.org/v1.0/terms.rdf#exampleReference',
'separator': ' ',
}
)
# ... but no Contribution_ID anymore:
ds.remove_columns('ValueTable', 'Contribution_ID')
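        # Example_ID is list-valued (space-separated) and uses the exampleReference
        # property, so a single value row can link to any number of examples.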
# Examples may have sources:
ds.add_columns(
'ExampleTable',
{
'name': 'Source',
'propertyUrl': 'http://cldf.clld.org/v1.0/terms.rdf#source',
'separator': ';',
}
)
history = ds.add_table('history.csv', 'Version', 'Language_ID', 'Parameter_ID', 'Code_ID')
history.add_foreign_key('Language_ID', 'languages.csv', 'ID')
history.add_foreign_key('Parameter_ID', 'parameters.csv', 'ID')
history.add_foreign_key('Code_ID', 'codes.csv', 'ID')
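        # history.csv preserves codings from earlier eWAVE versions; it is filled
        # from raw/changes.json below.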

        #
# Now add the data:
#
ds.add_sources(self.raw_dir.read('sources.bib'))
args.writer.objects['varietytypes.csv'] = [
{'ID': r[0], 'Name': r[1], 'Description': r[2]}
for r in self.raw_dir.read_csv('varietytype.psv', delimiter='|')]
args.writer.objects['featurecategories.csv'] = [
{'ID': r[0], 'Name': r[1], 'Description': r[2]}
for r in self.raw_dir.read_csv('featurecategory.psv', delimiter='|')]
args.writer.objects['regions.csv'] = [
{'ID': r[0], 'Name': r[1]} for r in self.raw_dir.read_csv('region.psv', delimiter='|')]
for lid, pid, cid, _ in self.raw_dir.read_json('changes.json')['2013']:
args.writer.objects['history.csv'].append({
'Version': '1.0',
'Language_ID': lid,
'Parameter_ID': pid,
'Code_ID': '{0}-{1}'.format(pid, cid.replace('?', 'NA'))
})
for row in self.raw_dir.read_csv('contributors.csv', dicts=True):
            # columns: id, name, url, email, address
args.writer.objects['contributors.csv'].append({
'ID': row['id'],
'Name': row['name'],
'URL': row['url'],
'Email': row['email'],
'Address': row['address'],
})
# We read the bulk of the data from the CLDF export of the website:
raw_ds = StructureDataset.from_metadata(self.raw_dir / 'StructureDataset-metadata.json')
cc = {
cid: [r[1] for r in rows] for cid, rows in itertools.groupby(
sorted(
self.raw_dir.read_csv('cc.csv'),
key=lambda r: (int(r[0]), int(r[2]), int(r[1]))),
lambda r: r[0],
)
}
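        # cc maps a variety ID to the ordered list of its contributor IDs, e.g.
        # {'1': ['5', '21'], ...} (IDs made up for illustration); the third column
        # of cc.csv presumably encodes the sort order of contributors.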
desc = {
r['ID']: r['Description']
for r in self.raw_dir.read_csv('contributions.csv', dicts=True)}
data = {r[0]: r[1:] for r in self.raw_dir.read_csv('variety.csv')}
for row in raw_ds['LanguageTable']:
row['Region_ID'] = data[row['ID']][0]
row['Type_ID'] = data[row['ID']][1]
row['abbr'] = data[row['ID']][2]
row['Description'] = desc[row['ID']]
row['Contributor_ID'] = cc[row['ID']]
args.writer.objects['LanguageTable'].append(row)
data = {r[0]: r[1:] for r in self.raw_dir.read_csv('feature.csv')}
for row in raw_ds['ParameterTable']:
row['Example_Source'] = data[row['ID']][0]
row['Category_ID'] = data[row['ID']][1]
row['Attestation'] = data[row['ID']][2]
row['Pervasiveness'] = data[row['ID']][3]
args.writer.objects['ParameterTable'].append(row)
# Augment examples.csv
def ref(r):
return str(Reference(r['source'], r['description'].replace('[', '(').replace(']', ')')))
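        # str(Reference(source, description)) renders in CLDF source notation as
        # 'source_id[description]', e.g. 'smith2004[(23)]' (key made up here);
        # square brackets inside the description are replaced since they delimit it.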
examplesource = {
eid: [ref(r) for r in rows]
for eid, rows in itertools.groupby(
sorted(
self.raw_dir.read_csv('examplesource.csv', dicts=True),
key=lambda d: (int(d['example']), d['source'])),
lambda d: d['example']
)
}
for row in raw_ds['ExampleTable']:
row['Source'] = examplesource.get(row['ID'], [])
args.writer.objects['ExampleTable'].append(row)
# Renumber codes and values!
for row in raw_ds['CodeTable']:
row['ID'] = '{0}-{1}'.format(row['Parameter_ID'], row['Name'].replace('?', 'NA'))
args.writer.objects['CodeTable'].append(row)
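        # Codes now have IDs of the form '<Parameter_ID>-<Name>', e.g. '7-A', with
        # '?' mapped to 'NA' (the parameter ID here is made up for illustration).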
valuesentence = {
vid: [r['sentence'] for r in rows]
for vid, rows in itertools.groupby(
sorted(
self.raw_dir.read_csv('valueexample.csv', dicts=True),
key=lambda d: (int(d['value']), int(d['sentence']))),
lambda d: d['value']
)
}
for row in raw_ds['ValueTable']:
row['Example_ID'] = valuesentence.get(row['ID'], [])
row['ID'] = '{0}-{1}'.format(row['Language_ID'], row['Parameter_ID'])
row['Code_ID'] = '{0}-{1}'.format(row['Parameter_ID'], row['Value'] or 'NA')
args.writer.objects['ValueTable'].append(row)
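

# Rough sketch of how this module is typically driven (assuming a standard
# cldfbench installation; additional catalog options may be needed, see the
# cldfbench docs):
#
#   cldfbench download cldfbench_ewave.py   # a no-op here: data is curated in raw/
#   cldfbench makecldf cldfbench_ewave.py   # rebuild the CLDF dataset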