Skip to content

Commit

Permalink
v4
Browse files Browse the repository at this point in the history
  • Loading branch information
JimVincentW committed Sep 5, 2023
1 parent 4138250 commit e65a0f2
Showing 1 changed file with 109 additions and 40 deletions.
149 changes: 109 additions & 40 deletions api_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,15 +37,12 @@ def get_firefox_configuration():

return options



def set_openai_config():
    """Configure the OpenAI client from environment variables.

    Reads OPENAI_API_KEY and OPENAI_ORGANIZATION from the environment,
    applies them to the ``openai`` module, then verifies that the desired
    model is actually available to this account.
    """
    openai.api_key = os.getenv("OPENAI_API_KEY")
    openai.organization = os.getenv("OPENAI_ORGANIZATION")
    check_model_availability()


# Check if the desired model is available
def check_model_availability():
model_list = openai.Model.list()['data']
Expand Down Expand Up @@ -82,6 +79,7 @@ def extract_info(driver):
'plenum': plenum
}

# Download a file from a given URL
def download_file(url, date):
# if folder not empty, delete all files
if os.listdir('Drucksachen'):
Expand Down Expand Up @@ -110,16 +108,103 @@ def download_file(url, date):



# Process each document file
# # Process each document file
# def process_documents():
# with open('fragenkatalog.json', 'r', encoding='utf-8') as file:
# fragenkatalog = json.load(file)

# document_files = [f for f in os.listdir('Drucksachen') if f.endswith('.pdf')]

# handler = StdOutCallbackHandler()
# llm = ChatOpenAI(temperature=0, model='gpt-4', streaming=True)

# template = ChatPromptTemplate.from_messages([
# ("system", "Du bist juristischer Referent des Bundestages."),
# ("human", "Bitte beantworte diesen Fragenkatalog zu dem angehängten Dokument in angemessener Knappheit. Um die Fragen zu beantworten arbeite bitte in Stichpunkten."),
# ("ai", "Alles klar, was sind die Fragen?"),
# ("human", "Die Fragen: {questions}. \n\nSei bitte so konkret wie möglich. Bei der Kritischen Perspektive zu der Rhetorik und benutzten sprachlichen Stilmitteln bitte die Begriffe und die Kritikpunkte daran kurz aufschreiben. "),
# ("ai", "Okay, was ist das Dokument?"),
# ("human", "Das Dokument: {document}")
# ,
# ])

# chain = LLMChain(llm=llm, prompt=template, callbacks=[handler])

# all_results = []
# for document_file in document_files:
# document_type, _ = os.path.splitext(document_file)
# questions = fragenkatalog['DokumentTypen'].get(document_type)
# if questions is None:
# print(f'No questions found for document type: {document_type}')
# continue
# questions_str = '\n'.join(questions)

# document_path = os.path.join('Drucksachen', document_file)
# with open(document_path, 'rb') as file:
# reader = PyPDF2.PdfReader(file)
# document_text = ''
# for page_num in range(len(list(reader.pages))):
# page = reader.pages[page_num]
# document_text += page.extract_text()

# result = chain.run({
# 'document': document_text,
# 'questions': questions_str
# })
# print(result)
# print("**********************")
# os.remove(document_path)
# all_results.append(json_result) # Add the result to the list


# # Make a POST request to the OpenAI API's chat completions endpoint
# messages = [
# {
# 'role': 'system',
# 'content': """
# You are an expert at converting plain text data into a structured JSON format. The text you'll receive contains information about documents, questions about them, and their corresponding answers. Convert them into a structured JSON where each document is a separate entry. The keys for each document should be:
# - "Document": for the document name.
# - "Type": indicating the type of document.
# - "Fragen": which will contain a list of questions.
# - "Antworten": which will contain a list of answers corresponding to each question.
# For example:
# {
# "Document": "Beschlussempfehlung.pdf",
# "Type": "Fragenkatalog für: Beschlussempfehlung",
# "Ergebnis": ["Frage1", "Antwort1", "Frage2", "Antwort2"]
# }
# Convert the following text into such a structured JSON format while keeping the order of the documents and questions intact and without any changes to the answers
# """
# },
# {
# 'role': 'user',
# 'content': result
# }
# ]


# response = openai.ChatCompletion.create(
# model="gpt-3.5-turbo",
# messages=messages
# )

# # Append the result to the list
# all_results.append(response['choices'][0]['message']['content'])


# return all_results

def process_documents():
    """Answer the question catalog for every PDF in 'Drucksachen'.

    For each PDF whose basename matches a document type in
    fragenkatalog.json, runs the question catalog through a GPT-4 chain,
    then asks gpt-3.5-turbo to convert the plain-text answers into a
    structured JSON string. Processed PDFs are deleted from disk.

    Returns:
        list[str]: one structured-JSON result string per processed document.
    """
    # Read the question catalog from a JSON file
    with open('fragenkatalog.json', 'r', encoding='utf-8') as file:
        fragenkatalog = json.load(file)

    # Get a list of all PDF files in the 'Drucksachen' directory
    document_files = [f for f in os.listdir('Drucksachen') if f.endswith('.pdf')]

    # Initialize the ChatOpenAI instance and the prompt template
    handler = StdOutCallbackHandler()
    llm = ChatOpenAI(temperature=0, model='gpt-4', streaming=True)

    template = ChatPromptTemplate.from_messages([
        ("system", "Du bist juristischer Referent des Bundestages."),
        ("human", "Bitte beantworte diesen Fragenkatalog zu dem angehängten Dokument in angemessener Knappheit. Um die Fragen zu beantworten arbeite bitte in Stichpunkten."),
        ("ai", "Alles klar, was sind die Fragen?"),
        ("human", "Die Fragen: {questions}. \n\nSei bitte so konkret wie möglich. Bei der Kritischen Perspektive zu der Rhetorik und benutzten sprachlichen Stilmitteln bitte die Begriffe und die Kritikpunkte daran kurz aufschreiben. "),
        ("ai", "Okay, was ist das Dokument?"),
        ("human", "Das Dokument: {document}"),
    ])
    chain = LLMChain(llm=llm, prompt=template, callbacks=[handler])

    # List to store all results (one structured-JSON string per document)
    all_results = []

    for document_file in document_files:
        # Look up the question set for this document type (filename sans extension)
        document_type, _ = os.path.splitext(document_file)
        questions = fragenkatalog['DokumentTypen'].get(document_type)
        if questions is None:
            print(f'No questions found for document type: {document_type}')
            continue
        questions_str = '\n'.join(questions)

        # Extract text from the PDF file
        document_path = os.path.join('Drucksachen', document_file)
        with open(document_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # Iterate pages directly instead of indexing via range(len(...))
            document_text = ''.join(page.extract_text() for page in reader.pages)

        # Get the results using the ChatOpenAI chain
        result = chain.run({
            'document': document_text,
            'questions': questions_str
        })
        print(result)
        print("**********************")
        os.remove(document_path)
        # BUGFIX: the original appended the undefined name `json_result` here,
        # which raised NameError on the first processed document. The structured
        # response is appended below instead.

        # Convert the plain-text answers into structured JSON via the chat API
        messages = [
            {
                'role': 'system',
                'content': """
You are an expert at converting plain text data into a structured JSON format. The text you'll receive contains information about documents, questions about them, and their corresponding answers. Convert them into a structured JSON where each document is a separate entry. The keys for each document should be:
- "Document": for the document name.
- "Type": indicating the type of document.
- "Fragen": which will contain a list of questions.
- "Antworten": which will contain a list of answers corresponding to each question.
For example:
{
    "Document": "Beschlussempfehlung.pdf",
    "Type": "Fragenkatalog für: Beschlussempfehlung",
    "Ergebnis": ["Frage1", "Antwort1", "Frage2", "Antwort2"]
}
Convert the following text into such a structured JSON format while keeping the order of the documents and questions intact and without any changes to the answers
"""
            },
            {
                'role': 'user',
                'content': result
            }
        ]

        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=messages
        )
        # Append the structured result for this document
        all_results.append(response['choices'][0]['message']['content'])

    return all_results



def process_url(url):
    """Scrape a Bundestag process page, download its documents, and analyze them.

    Drives a headless Firefox to `url`, extracts metadata via extract_info,
    downloads every document listed under 'wichtige_drucksachen', then runs
    process_documents() over the downloaded PDFs.

    Returns:
        dict: per-file download status plus one 'processed_data_<i>' entry
        per analysis result, suitable for returning from the Flask API.
    """
    # Results to hand back to the Flask API caller
    result_data = {}

    options = get_firefox_configuration()
    service = FirefoxService(executable_path=GECKODRIVER_PATH, log_path=GECKODRIVER_LOG_PATH)

    # Context manager guarantees driver.quit() even if extraction fails
    # (replaces the earlier manual try/finally around a bare Firefox()).
    with webdriver.Firefox(service=service, options=options) as driver:
        driver.get(url)
        driver.implicitly_wait(10)
        info = extract_info(driver)

        for doc in info['wichtige_drucksachen']:
            local_filename = download_file(doc['link'], doc['date'])
            result_data[local_filename] = f'Downloaded {local_filename}'

        processed_data = process_documents()
        for idx, data in enumerate(processed_data):
            result_data[f'processed_data_{idx}'] = data

    return result_data


# Main function
Expand Down

0 comments on commit e65a0f2

Please sign in to comment.