Skip to content

Commit

Permalink
v4
Browse files Browse the repository at this point in the history
  • Loading branch information
JimVincentW committed Sep 5, 2023
1 parent 4138250 commit e65a0f2
Showing 1 changed file with 109 additions and 40 deletions.
149 changes: 109 additions & 40 deletions api_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,15 +37,12 @@ def get_firefox_configuration():

return options



def set_openai_config():
    """Configure the OpenAI client from environment variables.

    Reads OPENAI_API_KEY and OPENAI_ORGANIZATION from the environment,
    applies them to the ``openai`` module, then verifies that the desired
    model is actually available to this account.
    """
    openai.api_key = os.getenv("OPENAI_API_KEY")
    openai.organization = os.getenv("OPENAI_ORGANIZATION")
    check_model_availability()


# Check if the desired model is available
def check_model_availability():
model_list = openai.Model.list()['data']
Expand Down Expand Up @@ -82,6 +79,7 @@ def extract_info(driver):
'plenum': plenum
}

# Download a file from a given URL
def download_file(url, date):
# if folder not empty, delete all files
if os.listdir('Drucksachen'):
Expand Down Expand Up @@ -110,16 +108,103 @@ def download_file(url, date):



# Process each document file
# # Process each document file
# def process_documents():
# with open('fragenkatalog.json', 'r', encoding='utf-8') as file:
# fragenkatalog = json.load(file)

# document_files = [f for f in os.listdir('Drucksachen') if f.endswith('.pdf')]

# handler = StdOutCallbackHandler()
# llm = ChatOpenAI(temperature=0, model='gpt-4', streaming=True)

# template = ChatPromptTemplate.from_messages([
# ("system", "Du bist juristischer Referent des Bundestages."),
# ("human", "Bitte beantworte diesen Fragenkatalog zu dem angehängten Dokument in angemessener Knappheit. Um die Fragen zu beantworten arbeite bitte in Stichpunkten."),
# ("ai", "Alles klar, was sind die Fragen?"),
# ("human", "Die Fragen: {questions}. \n\nSei bitte so konkret wie möglich. Bei der Kritischen Perspektive zu der Rhetorik und benutzten sprachlichen Stilmitteln bitte die Begriffe und die Kritikpunkte daran kurz aufschreiben. "),
# ("ai", "Okay, was ist das Dokument?"),
# ("human", "Das Dokument: {document}")
# ,
# ])

# chain = LLMChain(llm=llm, prompt=template, callbacks=[handler])

# all_results = []
# for document_file in document_files:
# document_type, _ = os.path.splitext(document_file)
# questions = fragenkatalog['DokumentTypen'].get(document_type)
# if questions is None:
# print(f'No questions found for document type: {document_type}')
# continue
# questions_str = '\n'.join(questions)

# document_path = os.path.join('Drucksachen', document_file)
# with open(document_path, 'rb') as file:
# reader = PyPDF2.PdfReader(file)
# document_text = ''
# for page_num in range(len(list(reader.pages))):
# page = reader.pages[page_num]
# document_text += page.extract_text()

# result = chain.run({
# 'document': document_text,
# 'questions': questions_str
# })
# print(result)
# print("**********************")
# os.remove(document_path)
# all_results.append(json_result) # Add the result to the list


# # Make a POST request to the OpenAI API's chat completions endpoint
# messages = [
# {
# 'role': 'system',
# 'content': """
# You are an expert at converting plain text data into a structured JSON format. The text you'll receive contains information about documents, questions about them, and their corresponding answers. Convert them into a structured JSON where each document is a separate entry. The keys for each document should be:
# - "Document": for the document name.
# - "Type": indicating the type of document.
# - "Fragen": which will contain a list of questions.
# - "Antworten": which will contain a list of answers corresponding to each question.
# For example:
# {
# "Document": "Beschlussempfehlung.pdf",
# "Type": "Fragenkatalog für: Beschlussempfehlung",
# "Ergebnis": ["Frage1", "Antwort1", "Frage2", "Antwort2"]
# }
# Convert the following text into such a structured JSON format while keeping the order of the documents and questions intact and without any changes to the answers
# """
# },
# {
# 'role': 'user',
# 'content': result
# }
# ]


# response = openai.ChatCompletion.create(
# model="gpt-3.5-turbo",
# messages=messages
# )

# # Append the result to the list
# all_results.append(response['choices'][0]['message']['content'])


# return all_results

def process_documents():
    """Answer the question catalog for every PDF in 'Drucksachen'.

    For each PDF whose basename matches a document type in
    fragenkatalog.json, runs the question catalog through a GPT-4 chain,
    then asks gpt-3.5-turbo to convert the plain-text answers into a
    structured JSON string. Processed PDFs are deleted from disk.

    Returns:
        list[str]: one structured-JSON result string per processed document.
    """
    # Read the question catalog from a JSON file
    with open('fragenkatalog.json', 'r', encoding='utf-8') as file:
        fragenkatalog = json.load(file)

    # Get a list of all PDF files in the 'Drucksachen' directory
    document_files = [f for f in os.listdir('Drucksachen') if f.endswith('.pdf')]

    # Initialize the ChatOpenAI instance and the prompt template
    handler = StdOutCallbackHandler()
    llm = ChatOpenAI(temperature=0, model='gpt-4', streaming=True)

    template = ChatPromptTemplate.from_messages([
        ("system", "Du bist juristischer Referent des Bundestages."),
        ("human", "Bitte beantworte diesen Fragenkatalog zu dem angehängten Dokument in angemessener Knappheit. Um die Fragen zu beantworten arbeite bitte in Stichpunkten."),
        ("ai", "Alles klar, was sind die Fragen?"),
        ("human", "Die Fragen: {questions}. \n\nSei bitte so konkret wie möglich. Bei der Kritischen Perspektive zu der Rhetorik und benutzten sprachlichen Stilmitteln bitte die Begriffe und die Kritikpunkte daran kurz aufschreiben. "),
        ("ai", "Okay, was ist das Dokument?"),
        ("human", "Das Dokument: {document}"),
    ])
    chain = LLMChain(llm=llm, prompt=template, callbacks=[handler])

    # List to store all results (one structured-JSON string per document)
    all_results = []

    for document_file in document_files:
        # Look up the question set for this document type (filename sans extension)
        document_type, _ = os.path.splitext(document_file)
        questions = fragenkatalog['DokumentTypen'].get(document_type)
        if questions is None:
            print(f'No questions found for document type: {document_type}')
            continue
        questions_str = '\n'.join(questions)

        # Extract text from the PDF file
        document_path = os.path.join('Drucksachen', document_file)
        with open(document_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # Iterate pages directly instead of indexing via range(len(...))
            document_text = ''.join(page.extract_text() for page in reader.pages)

        # Get the results using the ChatOpenAI chain
        result = chain.run({
            'document': document_text,
            'questions': questions_str
        })
        print(result)
        print("**********************")
        os.remove(document_path)
        # BUGFIX: the original appended the undefined name `json_result` here,
        # which raised NameError on the first processed document. The structured
        # response is appended below instead.

        # Convert the plain-text answers into structured JSON via the chat API
        messages = [
            {
                'role': 'system',
                'content': """
You are an expert at converting plain text data into a structured JSON format. The text you'll receive contains information about documents, questions about them, and their corresponding answers. Convert them into a structured JSON where each document is a separate entry. The keys for each document should be:
- "Document": for the document name.
- "Type": indicating the type of document.
- "Fragen": which will contain a list of questions.
- "Antworten": which will contain a list of answers corresponding to each question.
For example:
{
    "Document": "Beschlussempfehlung.pdf",
    "Type": "Fragenkatalog für: Beschlussempfehlung",
    "Ergebnis": ["Frage1", "Antwort1", "Frage2", "Antwort2"]
}
Convert the following text into such a structured JSON format while keeping the order of the documents and questions intact and without any changes to the answers
"""
            },
            {
                'role': 'user',
                'content': result
            }
        ]

        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=messages
        )
        # Append the structured result for this document
        all_results.append(response['choices'][0]['message']['content'])

    return all_results



def process_url(url):
    """Scrape a Bundestag process page, download its documents, and analyze them.

    Drives a headless Firefox to `url`, extracts metadata via extract_info,
    downloads every document listed under 'wichtige_drucksachen', then runs
    process_documents() over the downloaded PDFs.

    Returns:
        dict: per-file download status plus one 'processed_data_<i>' entry
        per analysis result, suitable for returning from the Flask API.
    """
    # Results to hand back to the Flask API caller
    result_data = {}

    options = get_firefox_configuration()
    service = FirefoxService(executable_path=GECKODRIVER_PATH, log_path=GECKODRIVER_LOG_PATH)

    # Context manager guarantees driver.quit() even if extraction fails
    # (replaces the earlier manual try/finally around a bare Firefox()).
    with webdriver.Firefox(service=service, options=options) as driver:
        driver.get(url)
        driver.implicitly_wait(10)
        info = extract_info(driver)

        for doc in info['wichtige_drucksachen']:
            local_filename = download_file(doc['link'], doc['date'])
            result_data[local_filename] = f'Downloaded {local_filename}'

        processed_data = process_documents()
        for idx, data in enumerate(processed_data):
            result_data[f'processed_data_{idx}'] = data

    return result_data


# Main function
Expand Down

0 comments on commit e65a0f2

Please sign in to comment.