Skip to content

Commit

Permalink
Show last iteration time rather than average
Browse files Browse the repository at this point in the history
The average isn't very useful now, since the small files at the start
throw it off, and the estimated time remaining is more important.
  • Loading branch information
john-shaffer committed Feb 3, 2024
1 parent 7346f70 commit 67b79e6
Showing 1 changed file with 6 additions and 4 deletions.
10 changes: 6 additions & 4 deletions stages/02_flatten-openalex-jsonl.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,11 +262,12 @@ def flatten_authors():
total_size_done += os.path.getsize(jsonl_file_name)
if FILES_PER_ENTITY and files_done >= FILES_PER_ENTITY:
break
total_time += time.time() - start_time
iter_time = time.time() - start_time
total_time += iter_time
avg_time = total_time / files_done
avg_time_per_byte = total_time / total_size_done
est_time = avg_time_per_byte * (total_size - total_size_done)
print(f"flatten_authors loop average time: {avg_time} seconds")
print(f"flatten_authors time: {iter_time} seconds")
print(f"flatten_authors estimated time remaining: {est_time / 60} minutes")

print(f"Flattening authors done in {total_time} seconds")
Expand Down Expand Up @@ -693,11 +694,12 @@ def flatten_works():
total_size_done += os.path.getsize(jsonl_file_name)
if FILES_PER_ENTITY and files_done >= FILES_PER_ENTITY:
break
total_time += time.time() - start_time
iter_time = time.time() - start_time
total_time += iter_time
avg_time = total_time / files_done
avg_time_per_byte = total_time / total_size_done
est_time = avg_time_per_byte * (total_size - total_size_done)
print(f"flatten_works loop average time: {avg_time} seconds")
print(f"flatten_works time: {iter_time} seconds")
print(f"flatten_works estimated time remaining: {est_time / 60} minutes")

print(f"Flattening works done in {total_time} seconds")
Expand Down

0 comments on commit 67b79e6

Please sign in to comment.