Skip to content

Commit

Permalink
refactor and print statements on taxcat indexing
Browse files Browse the repository at this point in the history
  • Loading branch information
mubaldino committed Jun 5, 2024
1 parent f528581 commit 89e4f8b
Show file tree
Hide file tree
Showing 6 changed files with 22 additions and 27 deletions.
2 changes: 1 addition & 1 deletion dev.env
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
export XPONENTS=$PWD
export PYTHONPATH=$XPONENTS/python:$XPONENTS/piplib:$XPONENTS/solr/script
export PYTHONPATH=$XPONENTS/../piplib:$XPONENTS/solr/script
export LOG4J_FORMAT_MSG_NO_LOOKUPS=true
20 changes: 3 additions & 17 deletions solr/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,15 @@ fi

SOLR_PORT=${SOLR_PORT:-7000}
SERVER=127.0.0.1:$SOLR_PORT
# Proxies can be sensitive, but at least we need NOPROXY
export noproxy=localhost,127.0.0.1
export NO_PROXY=$noproxy

cur=`dirname $0 `
XPONENTS=`cd -P $cur/..; echo $PWD`

export PYTHONPATH=$XPONENTS/python:$XPONENTS/piplib
export PYTHONPATH=$XPONENTS/../piplib
GAZ_CONF=etc/gazetteer
SOLR_CORE_VER=solr7

if [ ! -d $XPONENTS/piplib ] ; then
if [ ! -d $XPONENTS/../piplib ] ; then
echo "install python first"
echo "see README"
exit 1
Expand All @@ -31,7 +28,7 @@ index_taxcat () {
TAXCAT=./etc/taxcat
echo "Populate nationalities taxonomy in XTax"
# you must set your PYTHONPATH to include required libraries built from ../python
python3 ./script/gaz_nationalities.py $TAXCAT/nationalities.csv --solr $SOLR_URL --starting-id 0
python3 ./script/taxcat_nationalities.py $TAXCAT/nationalities.csv --solr $SOLR_URL --starting-id 0
sleep 1

echo "Populate core JRC Names 'entities' data file, c.2014"
Expand Down Expand Up @@ -141,20 +138,9 @@ if [ ! -d $XPONENTS/target ]; then
exit 1
fi

# Oh, shoot. I'm the only one who has a copy of this scraped data.
do_wfb=0

if [ $do_data -eq 1 ] ; then
echo "Acquiring Census data files for names"
ant gaz-resources

if [ $do_wfb -eq 1 ]; then
echo "Harvesting World Factbook 'factoids'"
python3 ./script/assemble_wfb_leaders.py
python3 ./script/assemble_wfb_orgs.py
else
echo "As of 2020, WFB content has changed dramatically"
fi
fi

if [ $do_meta -eq 1 ] ; then
Expand Down
13 changes: 8 additions & 5 deletions solr/script/taxcat_jrcnames.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,8 +226,6 @@ def create_entity(line, scan=False):
"""
global no_id_counter
line = line.strip()
if not line:
return None
parts = jrc_line_split.split(line, maxsplit=3)
_id = parts[0]
if _id == "0":
Expand Down Expand Up @@ -328,7 +326,8 @@ def create_entity(line, scan=False):
builder = TaxCatalogBuilder(server=args.solr, test=args.debug)
if args.purge:
builder.purge(catalog_id)
print("Pause"); sleep(30)
print("Pause")
sleep(10)

if args.max:
row_max = int(args.max)
Expand All @@ -350,7 +349,8 @@ def create_entity(line, scan=False):
row_id = 0
with open(args.taxonomy, "r", encoding="UTF-8") as fh:
for row in fh:
if row.startswith("#") or len(row.strip()) == 0: continue
if row.startswith("#") or not row.strip():
continue

row_id = row_id + 1
create_entity(row, scan=True)
Expand All @@ -370,7 +370,8 @@ def create_entity(line, scan=False):
row_id = 0
with open(args.taxonomy, "r", encoding="UTF-8") as fh:
for row in fh:
if row.startswith("#") or len(row.strip()) == 0: continue
if row.startswith("#") or not row.strip():
continue

row_id = row_id + 1
create_entity(row, scan=True)
Expand Down Expand Up @@ -410,3 +411,5 @@ def create_entity(line, scan=False):

builder.save(flush=True)
builder.optimize()
print("Start row:", start_id)
print("Final row:", start_id + builder.count)
Original file line number Diff line number Diff line change
Expand Up @@ -72,9 +72,9 @@ def create_entities(line):
Create a taxon entry for this nationality, which may have diacritics. All phrases are Unicode.
"""
if not line: return []
parts = line.split(',')
name = get_text(parts[0]).strip()
cc = parts[1].strip().upper()
parts = line.strip().split(',')
name = get_text(parts[0])
cc = parts[1].upper()

#
# done with aliasing.
Expand Down
6 changes: 5 additions & 1 deletion solr/script/taxcat_person_names.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,12 @@ def index_names(taxcat, fpath, cat, tag, rownum):
test = False
row_max = -1
builder = TaxCatalogBuilder(server=args.solr)
row_id = int(args.starting_id)
start_id = int(args.starting_id)
row_id = start_id
builder.server.delete(q=f"catalog:{catalog_id}", commit=True)

for cfg in data_sets:
row_id = index_names(builder, cfg['path'], catalog_id, cfg['source'], row_id)
print("Start row:", start_id)
print("Final row:", start_id + builder.count)

2 changes: 2 additions & 0 deletions solr/script/taxcat_wfb.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,8 @@ def main_loop(url):

builder.save(flush=True)
builder.optimize()
print("Start row:", start_id)
print("Final row:", start_id + builder.count)


if __name__ == '__main__':
Expand Down

0 comments on commit 89e4f8b

Please sign in to comment.