CC-11: repair and improve blob counting metrics
jblowe committed Mar 19, 2019
1 parent 5ffb361 commit 52747e2
Showing 4 changed files with 23 additions and 21 deletions.
6 changes: 3 additions & 3 deletions datasources/bampfa/solrETL-internal.sh
@@ -51,9 +51,9 @@ time curl -X POST -S -s "http://localhost:8983/solr/${TENANT}-internal/update/cs
time python evaluate.py 4solr.$TENANT.internal.csv /dev/null > counts.internal.csv &
# get rid of intermediate files
rm d?.csv m?.csv b?.csv media.csv metadata.csv &
-cut -f43 4solr.${TENANT}.public.csv | grep -v 'blob_ss' |perl -pe 's/\r//' | grep . | wc -l > counts.internal.blobs.csv
-cut -f43 4solr.${TENANT}.public.csv | perl -pe 's/\r//;s/,/\n/g;s/\|/\n/g;' | grep -v 'blob_ss' | grep . | wc -l >> counts.internal.blobs.csv
-cp counts.internal.blobs.csv /tmp/$TENANT.counts.internal.csv
+cut -f43 4solr.${TENANT}.internal.csv | grep -v 'blob_ss' |perl -pe 's/\r//' | grep . | wc -l > counts.internal.blobs.csv
+cut -f43 4solr.${TENANT}.internal.csv | perl -pe 's/\r//;s/,/\n/g;s/\|/\n/g;' | grep -v 'blob_ss' | grep . | wc -l >> counts.internal.blobs.csv
+cp counts.internal.blobs.csv /tmp/$TENANT.counts.internal.blobs.csv
cat counts.internal.blobs.csv
wait
# zip up .csvs, save a bit of space on backups
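The fix in this script is twofold: the blob-counting pipelines now read 4solr.${TENANT}.internal.csv instead of the public extract they were mistakenly pointed at, and the /tmp copy gains a .blobs suffix that matches its content. The two pipelines measure different things from the same column: the first counts records whose blob field is non-empty; the second splits multi-valued cells (comma- or pipe-delimited) onto separate lines before counting, giving total blob references. A toy illustration of the difference, using inline data rather than the real 4solr CSVs:

    # a header plus three records: one with two pipe-separated blobs, one empty
    printf 'blob_ss\nabc|def\n\nxyz\n' | grep -v 'blob_ss' | grep . | wc -l
    # -> 2 (records with at least one blob)
    printf 'blob_ss\nabc|def\n\nxyz\n' | perl -pe 's/\|/\n/g' | grep -v 'blob_ss' | grep . | wc -l
    # -> 3 (total blob references)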
2 changes: 1 addition & 1 deletion datasources/bampfa/solrETL-public.sh
@@ -69,7 +69,7 @@ time python evaluate.py 4solr.$TENANT.public.csv /dev/null > counts.public.csv &
rm d?.csv m?.csv b?.csv media.csv metadata.csv &
cut -f43 4solr.${TENANT}.public.csv | grep -v 'blob_ss' |perl -pe 's/\r//' | grep . | wc -l > counts.public.blobs.csv
cut -f43 4solr.${TENANT}.public.csv | perl -pe 's/\r//;s/,/\n/g;s/\|/\n/g;' | grep -v 'blob_ss' | grep . | wc -l >> counts.public.blobs.csv
-cp counts.public.blobs.csv /tmp/$TENANT.counts.public.csv
+cp counts.public.blobs.csv /tmp/$TENANT.counts.public.blobs.csv
cat counts.public.blobs.csv
wait
cp counts.public.csv /tmp/$TENANT.counts.public.csv
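Only the destination of the copy changes here. Before the fix, the blob counts and the overall type/token counts were copied to the same path in /tmp, so the second copy always clobbered the first:

    cp counts.public.blobs.csv /tmp/$TENANT.counts.public.csv   # old destination
    cp counts.public.csv /tmp/$TENANT.counts.public.csv         # ran later and overwrote it

With the .blobs suffix, both reports survive.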
32 changes: 17 additions & 15 deletions datasources/pahma/solrETL-locations.sh
@@ -21,11 +21,13 @@ export NUMCOLS=36
##############################################################################
# extract locations, past and present, from CSpace
##############################################################################
-time psql -F $'\t' -R"@@" -A -U $USERNAME -d "$CONNECTSTRING" -f locations1.sql -o m1.csv
-time psql -F $'\t' -R"@@" -A -U $USERNAME -d "$CONNECTSTRING" -f locations2.sql -o m2.csv
+time psql -F $'\t' -R"@@" -A -U $USERNAME -d "$CONNECTSTRING" -f locations1.sql -o m1.csv &
+time psql -F $'\t' -R"@@" -A -U $USERNAME -d "$CONNECTSTRING" -f locations2.sql -o m2.csv &
+wait
# cleanup newlines and crlf in data, then switch record separator.
-time perl -pe 's/[\r\n]/ /g;s/\@\@/\n/g' m1.csv > m1a.csv
-time perl -pe 's/[\r\n]/ /g;s/\@\@/\n/g' m2.csv > m2a.csv
+time perl -pe 's/[\r\n]/ /g;s/\@\@/\n/g' m1.csv > m1a.csv &
+time perl -pe 's/[\r\n]/ /g;s/\@\@/\n/g' m2.csv > m2a.csv &
+wait
rm m1.csv m2.csv
##############################################################################
# stitch the two files together
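Both changed hunks above apply the same pattern: run a pair of independent commands in the background with &, then wait before touching their outputs. wait blocks until every background job of the shell has exited, so the rm that follows only runs once both files are fully written. The pattern in miniature (slow_job_a and slow_job_b are hypothetical stand-ins for the psql and perl steps):

    slow_job_a > a.out &   # first job, backgrounded
    slow_job_b > b.out &   # second job, runs concurrently
    wait                   # block until both background jobs exit
    wc -l a.out b.out      # safe: both outputs are complete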
@@ -36,15 +38,19 @@ wait
rm m1a.csv m2a.csv
time join -j 1 -t $'\t' m1a.sort.csv m2a.sort.csv > m3.sort.csv
rm m1a.sort.csv m2a.sort.csv
-cut -f1-5,10-14 m3.sort.csv > m4.csv
+cut -f1-5,7- m3.sort.csv > m4.csv
##############################################################################
# we want to recover and use our "special" solr-friendly header, which got buried
##############################################################################
-grep -P "^csid_s\t" m4.csv > header4Solr.csv
-grep -v -P "^csid_s\t" m4.csv > m5.csv
+grep -P "^csid_s\t" m4.csv > header4Solr.csv &
+grep -v -P "^csid_s\t" m4.csv > m5.csv &
+wait
cat header4Solr.csv m5.csv > m4.csv
rm m5.csv m3.sort.csv
time perl -ne " \$x = \$_ ;s/[^\t]//g; if (length eq 8) { print \$x;}" m4.csv > 4solr.${TENANT}.locations.csv
+##############################################################################
+# count the types and tokens in the final file
+##############################################################################
+time python evaluate.py m4.csv 4solr.${TENANT}.locations.csv > counts.locations.csv
##############################################################################
# ok, now let's load this into solr...
# clear out the existing data
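Two details in this hunk deserve a note. The cut now keeps every field from 7 onward (-f1-5,7-) rather than the hard-coded 10-14 range, and the grep pair pulls the solr-friendly header row (the line beginning with csid_s) out of the data so it can be pasted back on top. The final perl one-liner then keeps only well-formed rows: it stashes each line in $x, deletes every character that is not a tab, and prints the stashed line only when exactly 8 tabs remain, i.e. the row has exactly 9 tab-separated fields. A standalone sketch of that filter (the 9-field width is this script's, not a general rule):

    # keep only rows with exactly 9 tab-separated fields
    perl -ne '$x = $_; s/[^\t]//g; print $x if length == 8' m4.csv > filtered.csv

(The script's "length eq 8" compares as strings; a numeric == behaves the same here.)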
@@ -56,14 +62,10 @@ curl -S -s "http://localhost:8983/solr/${TENANT}-locations/update" --data '<comm
# note, among other things, the overriding of the encapsulator with \
##############################################################################
time curl -X POST -s -S 'http://localhost:8983/solr/pahma-locations/update/csv?commit=true&header=true&trim=true&separator=%09&encapsulator=\' -T 4solr.pahma.locations.csv -H 'Content-type:text/plain; charset=utf-8' &
-##############################################################################
-# count the types and tokens in the final file
-##############################################################################
-time python evaluate.py 4solr.$TENANT.locations.csv /dev/null > counts.locations.csv &
# count blobs
-cut -f67 4solr.${TENANT}.public.csv | grep -v 'blob_ss' |perl -pe 's/\r//' | grep . | wc -l > counts.public.blobs.csv
-cut -f67 4solr.${TENANT}.public.csv | perl -pe 's/\r//;s/,/\n/g' | grep -v 'blob_ss' | grep . | wc -l >> counts.public.blobs.csv
-cp counts.public.blobs.csv /tmp/$TENANT.counts.public.csv
+cut -f67 4solr.${TENANT}.locations.csv | grep -v 'blob_ss' |perl -pe 's/\r//' | grep . | wc -l > counts.locations.blobs.csv
+cut -f67 4solr.${TENANT}.locations.csv | perl -pe 's/\r//;s/,/\n/g' | grep -v 'blob_ss' | grep . | wc -l >> counts.locations.blobs.csv
+cp counts.locations.blobs.csv /tmp/$TENANT.counts.locations.csv
# get rid of intermediate files
rm m4.csv
wait
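As in the other scripts, the blob count here previously read the public extract; it now reads the locations CSV the script just built, and the results land in counts.locations.blobs.csv. Note that this variant splits multi-valued cells on commas only (there is no s/\|/\n/g stage as in the bampfa and osteology scripts). A quick way to confirm that field 67 really is the blob column before counting — a hypothetical sanity check, not part of the script:

    # print the field number of the blob_ss column from the header row
    head -1 4solr.pahma.locations.csv | tr '\t' '\n' | grep -n 'blob_ss'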
4 changes: 2 additions & 2 deletions datasources/pahma/solrETL-osteology.sh
@@ -54,8 +54,8 @@ curl -S -s "http://localhost:8983/solr/${TENANT}-osteology/update" --data '<comm
time curl -X POST -S -s "http://localhost:8983/solr/${TENANT}-osteology/update/csv?commit=true&header=true&separator=%09&f.taxon_ss.split=true&f.taxon_ss.separator=%7C&f.objculturedepicted_ss.split=true&f.objculturedepicted_sss.separator=%7C&f.objplacedepicted_ss.split=true&f.objplacedepicted_ss.separator=%7C&f.objpersondepicted_ss.split=true&f.objpersondepicted_ss.separator=%7C&f.status_ss.split=true&f.status_ss.separator=%7C&f.audio_md5_ss.split=true&f.audio_md5_ss.separator=%7C&f.blob_md5_ss.split=true&f.blob_md5_ss.separator=%7C&f.card_md5_ss.split=true&f.card_md5_ss.separator=%7C&f.x3d_md5_ss.split=true&f.x3d_md5_ss.separator=%7C&f.x3d_csid_ss.split=true&f.x3d_csid_ss.separator=%7C&f.video_md5_ss.split=true&f.video_md5_ss.separator=%7C&f.aggregate_ss.split=true&f.aggregate_ss.separator=%2C&f.objpp_ss.split=true&f.objpp_ss.separator=%7C&f.anonymousdonor_ss.split=true&f.anonymousdonor_ss.separator=%7C&f.objaltnum_ss.split=true&f.objaltnum_ss.separator=%7C&f.objfilecode_ss.split=true&f.objfilecode_ss.separator=%7C&f.objdimensions_ss.split=true&f.objdimensions_ss.separator=%7C&f.objmaterials_ss.split=true&f.objmaterials_ss.separator=%7C&f.objinscrtext_ss.split=true&f.objinscrtext_ss.separator=%7C&f.objcollector_ss.split=true&f.objcollector_ss.separator=%7C&f.objaccno_ss.split=true&f.objaccno_ss.separator=%7C&f.objaccdate_ss.split=true&f.objaccdate_ss.separator=%7C&f.objacqdate_ss.split=true&f.objacqdate_ss.separator=%7C&f.objassoccult_ss.split=true&f.objassoccult_ss.separator=%7C&f.objculturetree_ss.split=true&f.objculturetree_ss.separator=%7C&f.objfcptree_ss.split=true&f.objfcptree_ss.separator=%7C&f.grouptitle_ss.split=true&f.grouptitle_ss.separator=%7C&f.objmaker_ss.split=true&f.objmaker_ss.separator=%7C&f.objaccdate_begin_dts.split=true&f.objaccdate_begin_dts.separator=%7C&f.objacqdate_begin_dts.split=true&f.objacqdate_begin_dts.separator=%7C&f.objaccdate_end_dts.split=true&f.objaccdate_end_dts.separator=%7C&f.objacqdate_end_dts.split=true&f.objacqdate_end_dts.separator=%7C&f.blob_ss.split=true&f.blob_ss.separator=%7C&f.card_ss.split=true&f.card_ss.separator=%7C&f.imagetype_ss.split=true&f.imagetype_ss.separator=%7C&encapsulator=\\" -T 4solr.${TENANT}.osteology.csv -H 'Content-type:text/plain; charset=utf-8' &
rm o?.csv header4Solr.csv
# count blobs
-cut -f78 4solr.${TENANT}.public.csv | grep -v 'blob_ss' |perl -pe 's/\r//' | grep . | wc -l > counts.osteology.blobs.csv
-cut -f78 4solr.${TENANT}.public.csv | perl -pe 's/\r//;s/,/\n/g;s/\|/\n/g;' | grep -v 'blob_ss' | grep . | wc -l >> counts.osteology.blobs.csv &
+cut -f78 4solr.${TENANT}.osteology.csv | grep -v 'blob_ss' |perl -pe 's/\r//' | grep . | wc -l > counts.osteology.blobs.csv
+cut -f78 4solr.${TENANT}.osteology.csv | perl -pe 's/\r//;s/,/\n/g;s/\|/\n/g;' | grep -v 'blob_ss' | grep . | wc -l >> counts.osteology.blobs.csv &
wait
cp counts.osteology.blobs.csv /tmp/$TENANT.counts.osteology.blobs.csv
cat counts.osteology.blobs.csv
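The long query string on the osteology load (shown in context above) drives Solr's CSV update handler: separator=%09 makes tab the column delimiter, each f.<field>.split=true with f.<field>.separator=%7C tells Solr to split that column on '|' into a multi-valued field, and encapsulator=\ overrides the default double-quote encapsulator so quotes inside values pass through literally. A trimmed-down sketch of the same call, assuming a hypothetical core named mycore and a small tab-separated file with one pipe-delimited column:

    curl -X POST -S -s 'http://localhost:8983/solr/mycore/update/csv?commit=true&header=true&separator=%09&f.blob_ss.split=true&f.blob_ss.separator=%7C&encapsulator=\' \
      -T small.csv -H 'Content-type:text/plain; charset=utf-8'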
