-
Notifications
You must be signed in to change notification settings - Fork 0
/
dnisort.py
executable file
·88 lines (67 loc) · 2.54 KB
/
dnisort.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys
import re
import argparse
from argparse import RawTextHelpFormatter
# Command-line interface: one positional argument naming the wordlist file
# and an optional flag that marks the input as Dnifont-encoded.
parser = argparse.ArgumentParser(
    formatter_class=RawTextHelpFormatter,
    description="""
Sorts a csv database of D'ni words (or any wordlist for that matter) in D'ni
alphabetical order, using NTS or Dnifont encoding (OTS is too irregular to
sort properly).
The program prints to the standard output; to save to a file use
$ dnisort.py csvfile.csv > output.csv
"""
)
parser.add_argument(
    "csv",
    action="store",
    help="CSV database wordlist"
)
parser.add_argument(
    "-f", "--dfont",
    action="store_true",
    help="use the Dnifont format"
)
# Parsed once at start-up; the rest of the script reads args.csv and args.dfont.
args = parser.parse_args()
def dfont2nts(in_dfont):
    """Transliterate a string from Dnifont encoding to NTS.

    The substitutions are applied one after another in the order listed.
    The order is significant: some NTS output letters are themselves
    Dnifont input letters (e.g. "x" becomes "c"), so each such letter must
    be rewritten before another substitution produces it.
    """
    substitutions = (
        ("c", "ç"),
        ("x", "c"),
        ("k", "x"),
        ("K", "k"),
        ("d", "ð"),
        ("D", "d"),
        ("S", "š"),
        ("T", "þ"),
        ("I", "á"),
        ("A", "é"),
        ("E", "í"),
        ("O", "ó"),
        ("U", "ú"),
        ("å", "æ"),
    )
    converted = in_dfont
    for dfont_letter, nts_letter in substitutions:
        converted = converted.replace(dfont_letter, nts_letter)
    return converted
# Collation order of the D'ni alphabet in NTS transliteration.  Dnifont
# input is converted to NTS before its sort key is computed, so this single
# alphabet serves both encodings.
sortstring = "vbtsšjgyxkaáfpiíeérmþðdhoóçwuúclæzn"


def _to_text(value):
    """Return *value* as a text string, decoding UTF-8 bytes when needed.

    On Python 2 the literals in this file and the lines read from the input
    are byte strings; ranking them byte by byte would mis-order every
    multi-byte letter, so sort keys are always computed on decoded text.
    Undecodable bytes are replaced rather than raising; the replacement
    character is not in the alphabet and is stripped from the sort key.
    """
    if isinstance(value, bytes):
        return value.decode("utf-8", "replace")
    return value


_alphabet = _to_text(sortstring)
# Rank of each letter in the collation order.
_rank = {letter: position for position, letter in enumerate(_alphabet)}
# Matches every run of characters that are not letters of the alphabet.
_non_alphabetic = re.compile(u"[^" + _alphabet + u"]+")


def _sort_key(first_column):
    """Return the list of alphabet ranks for the letters of *first_column*.

    The text is lower-cased and every character outside the alphabet is
    removed first, so each remaining character is guaranteed to have a rank.
    """
    letters = _non_alphabetic.sub(u"", _to_text(first_column).lower())
    return [_rank[letter] for letter in letters]


if args.csv == "":
    parser.print_usage()
    sys.exit()

# The 'U' mode flag only exists on Python 2 (it was removed in Python 3.11);
# Python 3 text mode already applies universal newlines, and the input is
# read as UTF-8 to match the encoding of the alphabet above.
if sys.version_info[0] < 3:
    csv_file = open(args.csv, "rU")
else:
    csv_file = open(args.csv, "r", encoding="utf-8")

# (sort key, original line) pairs.  Keeping the pairs in a list instead of a
# dict keyed by identifier means that lines whose identifiers coincide are
# all preserved rather than overwriting one another.
entries = []
with csv_file:
    header = csv_file.readline().rstrip("\n")  # keep the header for later
    for line in csv_file:
        record = line.rstrip("\n")
        # each line is identified by the item in its first tab-separated column
        first_column = record.split("\t")[0]
        # skip lines whose first column is empty (this includes blank lines)
        if first_column == "":
            continue
        # if using Dnifont, convert the identifier to NTS
        if args.dfont:
            first_column = dfont2nts(first_column)
        entries.append((_sort_key(first_column), record))

# The sort is stable, so lines with equal keys keep their order in the file.
entries.sort(key=lambda entry: entry[0])

# print the list to the standard output
print(header)
for _, record in entries:
    print(record)