From 89e4f8bc61ae425400577ce569865dbfb227ec88 Mon Sep 17 00:00:00 2001 From: Marc Ubaldino Date: Tue, 4 Jun 2024 20:25:17 -0400 Subject: [PATCH] refactor and print statements on taxcat indexing --- dev.env | 2 +- solr/build.sh | 20 +++---------------- solr/script/taxcat_jrcnames.py | 13 +++++++----- ...tionalities.py => taxcat_nationalities.py} | 6 +++--- solr/script/taxcat_person_names.py | 6 +++++- solr/script/taxcat_wfb.py | 2 ++ 6 files changed, 22 insertions(+), 27 deletions(-) rename solr/script/{gaz_nationalities.py => taxcat_nationalities.py} (97%) diff --git a/dev.env b/dev.env index 4b815564..724df256 100644 --- a/dev.env +++ b/dev.env @@ -1,3 +1,3 @@ export XPONENTS=$PWD -export PYTHONPATH=$XPONENTS/python:$XPONENTS/piplib:$XPONENTS/solr/script +export PYTHONPATH=$XPONENTS/../piplib:$XPONENTS/solr/script export LOG4J_FORMAT_MSG_NO_LOOKUPS=true diff --git a/solr/build.sh b/solr/build.sh index 3b264596..669737b3 100755 --- a/solr/build.sh +++ b/solr/build.sh @@ -5,18 +5,15 @@ fi SOLR_PORT=${SOLR_PORT:-7000} SERVER=127.0.0.1:$SOLR_PORT -# Proxies can be sensitive, but at least we need NOPROXY -export noproxy=localhost,127.0.0.1 -export NO_PROXY=$noproxy cur=`dirname $0 ` XPONENTS=`cd -P $cur/..; echo $PWD` -export PYTHONPATH=$XPONENTS/python:$XPONENTS/piplib +export PYTHONPATH=$XPONENTS/../piplib GAZ_CONF=etc/gazetteer SOLR_CORE_VER=solr7 -if [ ! -d $XPONENTS/piplib ] ; then +if [ ! -d $XPONENTS/../piplib ] ; then echo "install python first" echo "see README" exit 1 @@ -31,7 +28,7 @@ index_taxcat () { TAXCAT=./etc/taxcat echo "Populate nationalities taxonomy in XTax" # you must set your PYTHONPATH to include required libraries built from ../python - python3 ./script/gaz_nationalities.py $TAXCAT/nationalities.csv --solr $SOLR_URL --starting-id 0 + python3 ./script/taxcat_nationalities.py $TAXCAT/nationalities.csv --solr $SOLR_URL --starting-id 0 sleep 1 echo "Populate core JRC Names 'entities' data file, c.2014" @@ -141,20 +138,9 @@ if [ ! -d $XPONENTS/target ]; then exit 1 fi -# Oh, shoot. I'm the only one who has a copy of this scrapped data. -do_wfb=0 - if [ $do_data -eq 1 ] ; then echo "Acquiring Census data files for names" ant gaz-resources - - if [ $do_wfb -eq 1 ]; then - echo "Harvesting World Factbook 'factoids'" - python3 ./script/assemble_wfb_leaders.py - python3 ./script/assemble_wfb_orgs.py - else - echo "As of 2020, WFB content has changed dramatically" - fi fi if [ $do_meta -eq 1 ] ; then diff --git a/solr/script/taxcat_jrcnames.py b/solr/script/taxcat_jrcnames.py index c5f3b761..4547f7e3 100644 --- a/solr/script/taxcat_jrcnames.py +++ b/solr/script/taxcat_jrcnames.py @@ -226,8 +226,6 @@ def create_entity(line, scan=False): """ global no_id_counter line = line.strip() - if not line: - return None parts = jrc_line_split.split(line, maxsplit=3) _id = parts[0] if _id == "0": @@ -328,7 +326,8 @@ def create_entity(line, scan=False): builder = TaxCatalogBuilder(server=args.solr, test=args.debug) if args.purge: builder.purge(catalog_id) - print("Pause"); sleep(30) + print("Pause") + sleep(10) if args.max: row_max = int(args.max) @@ -350,7 +349,8 @@ def create_entity(line, scan=False): row_id = 0 with open(args.taxonomy, "r", encoding="UTF-8") as fh: for row in fh: - if row.startswith("#") or len(row.strip()) == 0: continue + if row.startswith("#") or not row.strip(): + continue row_id = row_id + 1 create_entity(row, scan=True) @@ -370,7 +370,8 @@ def create_entity(line, scan=False): row_id = 0 with open(args.taxonomy, "r", encoding="UTF-8") as fh: for row in fh: - if row.startswith("#") or len(row.strip()) == 0: continue + if row.startswith("#") or not row.strip(): + continue row_id = row_id + 1 create_entity(row, scan=True) @@ -410,3 +411,5 @@ def create_entity(line, scan=False): builder.save(flush=True) builder.optimize() + print("Start row:", start_id) + print("Final row:", start_id + builder.count) diff --git a/solr/script/gaz_nationalities.py b/solr/script/taxcat_nationalities.py similarity index 97% rename from solr/script/gaz_nationalities.py rename to solr/script/taxcat_nationalities.py index 5fbf0c3e..c7a799c7 100644 --- a/solr/script/gaz_nationalities.py +++ b/solr/script/taxcat_nationalities.py @@ -72,9 +72,9 @@ def create_entities(line): Create a taxon entry for this nationality, which may have diacritics. All phrases are unicode. """ if not line: return [] - parts = line.split(',') - name = get_text(parts[0]).strip() - cc = parts[1].strip().upper() + parts = line.strip().split(',') + name = get_text(parts[0]) + cc = parts[1].upper() # # done with aliasing. diff --git a/solr/script/taxcat_person_names.py b/solr/script/taxcat_person_names.py index 78ebc237..a25b819d 100644 --- a/solr/script/taxcat_person_names.py +++ b/solr/script/taxcat_person_names.py @@ -66,8 +66,12 @@ def index_names(taxcat, fpath, cat, tag, rownum): test = False row_max = -1 builder = TaxCatalogBuilder(server=args.solr) - row_id = int(args.starting_id) + start_id = int(args.starting_id) + row_id = start_id builder.server.delete(q=f"catalog:{catalog_id}", commit=True) for cfg in data_sets: row_id = index_names(builder, cfg['path'], catalog_id, cfg['source'], row_id) + print("Start row:", start_id) + print("Final row:", start_id + builder.count) + diff --git a/solr/script/taxcat_wfb.py b/solr/script/taxcat_wfb.py index d05e1b1d..f90f9d45 100644 --- a/solr/script/taxcat_wfb.py +++ b/solr/script/taxcat_wfb.py @@ -210,6 +210,8 @@ def main_loop(url): builder.save(flush=True) builder.optimize() + print("Start row:", start_id) + print("Final row:", start_id + builder.count) if __name__ == '__main__':