You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
import os
from llmware.library import Library
from llmware.retrieval import Query
def extract_pdf_tables(library_name):
    """Parse the PDFs under ``./pdfs`` into a new library and export tables.

    Creates a library named *library_name*, adds the files found in
    ``./pdfs`` to it, exports all library content to a JSONL file in
    ``./output_csv``, and finally calls ``Query.export_all_tables`` with the
    query ``"Topline"`` and the same output folder.

    Args:
        library_name: Name of the library to create. It is also used to
            build the JSONL export file name (``{library_name}_export``).

    Returns:
        The value returned by ``Query(lib).export_all_tables(...)``.
    """
    print("\nExample: Parsing PDF Documents and Extracting Tables")

    # Step 1 - create library
    lib = Library().create_new_library(library_name)

    # Step 2 - folder holding the PDF files to parse
    sample_files_path = "./pdfs"

    # Step 3 - parse and extract all of the content from the PDF documents
    parsing_output = lib.add_files(input_folder_path=sample_files_path)

    # Review the parsing output summary info (per the original example, the
    # text and table blocks are stored in the library's Mongo collection)
    print("Update: parsing_output - ", parsing_output)

    # Step 4 - export all of the content into .jsonl files with metadata
    output_fp = "./output_csv"
    # Make sure the destination folder exists before handing it to the
    # export calls; a fresh working directory will not have it yet.
    os.makedirs(output_fp, exist_ok=True)
    print(f"Update: Step 4 - exporting all blocks into file path - {output_fp}")
    lib.export_library_to_jsonl_file(output_fp, f"{library_name}_export")

    # Step 5 - export all of the tables as csv with "Topline" in the query
    print(f"Update: Step 5 - exporting all tables with into file path - {output_fp}")
    return Query(lib).export_all_tables(query="Topline", output_fp=output_fp)
if __name__ == "__main__":
    # Script entry point: run the example against a fixed library name.
    extract_pdf_tables(library_name="pdf_table_lib_example")
I have tried different query values ["politicians ", "elections", "Response"].
I have attached the PDF and the exported JSON file.
survey.pdf
pdf_table_lib_example_export.json
Thanks.
The text was updated successfully, but these errors were encountered: