Skip to content

Commit

Permalink
Merge pull request #60 from RobsOnWaves/adding_some_stats
Browse files Browse the repository at this point in the history
Adding some stats
  • Loading branch information
RobsOnWaves authored Jan 28, 2024
2 parents 78c03d9 + 1757ab6 commit ba3d27f
Show file tree
Hide file tree
Showing 4 changed files with 186 additions and 5 deletions.
85 changes: 84 additions & 1 deletion Code/libs/meps_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,11 @@
from unidecode import unidecode
import shutil
import tempfile
import nltk
from nltk.corpus import stopwords
from collections import Counter
import random
import string


class MepsHandler:
Expand All @@ -26,7 +31,7 @@ def __init__(self):
self.__collection_name__ = "meps_meetings"

class TimeoutException(Exception):
    """Exception type raised by ``timeout_handler``."""

def timeout_handler(self):
    """Raise a fresh instance of this object's ``TimeoutException``.

    Raises:
        TimeoutException: always.
    """
    # NOTE(review): if this is registered as a signal handler it would be
    # called with (signum, frame) -- confirm against the call site.
    timeout_error = self.TimeoutException()
    raise timeout_error
Expand Down Expand Up @@ -64,3 +69,81 @@ def get_mep_db_name(self):

def get_mep_collection_name(self):
    """Return the collection name stored on this handler."""
    collection_name = self.__collection_name__
    return collection_name

def get_stats(self, df: pd.DataFrame):
    """Build frequency counters for a DataFrame of meetings.

    The frame must contain the columns read below ("MEP Name",
    "MEP nationalPoliticalGroup", "MEP politicalGroup", "Place",
    "Meeting Related to Procedure", "Title" and "Meeting With"); a missing
    column raises ``KeyError``.

    Args:
        df: Meetings to analyse. The caller's frame is left untouched:
            ``DataFrame.replace`` returns a new object that is used from
            then on.

    Returns:
        dict mapping a statistic name to a ``Counter`` whose items are
        ordered by descending count:

        * ``"Title"`` / ``"Meeting With"``: per-word counts (lower-cased,
          punctuation-trimmed, stop words removed) over the rows that do
          not match the "exchange of views" pattern.
        * ``"MEP Name"``, ``"MEP nationalPoliticalGroup"``,
          ``"MEP politicalGroup"``, ``"Place"``,
          ``"Meeting Related to Procedure"``: whole-value counts over the
          rows that do not match the pattern.
        * ``"Title_no_stopwords"`` / ``"Meeting_With_no_stopwords"``:
          whole-value counts over the rows that do not match the pattern.
        * ``"Title_unfiltered"`` / ``"Meeting_With_unfiltered"``:
          whole-value counts over every row.
    """
    stats = {}

    # Only reach out to the network when the corpus is not installed yet;
    # an unconditional download would contact the NLTK index on every call.
    try:
        nltk.data.find('corpora/stopwords')
    except LookupError:
        nltk.download('stopwords')

    stop_words = set()
    for language in ('english', 'french', 'spanish', 'italian', 'portuguese',
                     'german', 'dutch', 'russian', 'finnish', 'danish'):
        stop_words.update(stopwords.words(language))
    # Extra words excluded on top of the NLTK stop-word lists.
    stop_words.update(["meeting", "staff", "directive"])

    # Turn separators into spaces so the words they join are split apart.
    df = df.replace(to_replace='[-/&()]', value=' ', regex=True)
    df["Meeting Related to Procedure"] = df["Meeting Related to Procedure"].replace(
        to_replace=pd.NA,
        value='Not related to a procedure'
    )

    def convert_numbers_to_string(x):
        # The ``.str`` accessor and ``str.lower`` below need string cells.
        if isinstance(x, (float, int)):
            return str(x)
        return x

    def count_words(text_series):
        # Trim punctuation BEFORE the stop-word test so that "meeting," is
        # recognised as the stop word "meeting", and skip tokens that were
        # nothing but punctuation (they would be counted as '').
        word_counts = Counter()
        for line in text_series:
            tokens = (token.strip('.,;!') for token in line.lower().split())
            word_counts.update(
                token for token in tokens if token and token not in stop_words)
        return word_counts

    def by_descending_count(counter):
        # most_common() sorts by count, ties keep first-seen order.
        return Counter(dict(counter.most_common()))

    df = df.map(convert_numbers_to_string)
    regex_pattern = r'(?i)exchange of views|general exchange of views'
    word_columns = ("Title", "Meeting With")

    for column in word_columns:
        kept = df[~df[column].str.contains(regex_pattern, regex=True)][column]
        stats[column] = by_descending_count(count_words(kept))

    # NOTE(review): the "*_no_stopwords" entries hold whole-value counts while
    # the plain "Title" / "Meeting With" entries hold the stop-word-filtered
    # word counts; the key names are kept as-is for existing consumers.
    for column in ["MEP Name",
                   "MEP nationalPoliticalGroup",
                   "MEP politicalGroup",
                   "Place",
                   "Meeting Related to Procedure",
                   "Title",
                   "Meeting With"]:
        kept = df[~df[column].str.contains(regex_pattern, regex=True)][column]
        if column not in word_columns:
            stats[column] = by_descending_count(Counter(kept))
        else:
            prefix = "Title" if column == "Title" else "Meeting_With"
            stats[prefix + "_no_stopwords"] = by_descending_count(Counter(kept))
            stats[prefix + "_unfiltered"] = by_descending_count(Counter(df[column]))

    return stats

def get_stats_file(self, data: dict):
    """Write every counter in ``data`` to its own sheet of a new .xlsx file.

    Args:
        data: Mapping of sheet name to a mapping of item -> count (for
            example the dict returned by ``get_stats``).

    Returns:
        str: Path of the workbook that was written. The file is created in
        the system temporary directory and is NOT deleted here; the caller
        owns it and should remove it once it has been consumed.
    """
    # Let tempfile reserve a unique name instead of hand-rolling a random one
    # in the working directory. delete=False because the workbook has to
    # outlive this function so the caller can serve it.
    with tempfile.NamedTemporaryFile(prefix='filename', suffix='.xlsx',
                                     delete=False) as reserved:
        filename = reserved.name

    with pd.ExcelWriter(filename) as writer:
        for key, counter in data.items():
            # One two-column sheet per counter.
            df = pd.DataFrame(list(counter.items()), columns=['Item', 'Count'])
            df.to_excel(writer, sheet_name=key, index=False)
    return filename
21 changes: 19 additions & 2 deletions Code/libs/mongo_db_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,13 +320,30 @@ def get_unique_values(self, db_name: str, collection_name: str, fields: list):
if not isinstance(val, str):
val = str(val)
# Tronquer la chaîne si elle dépasse 50 caractères et ajouter "..."
if len(val) > 50:
val = val[:50] + "..."
valeurs_conformes.append(val)

valeurs_dedupliquees[field] = valeurs_conformes


return valeurs_dedupliquees

def get_df(self, db_name: str, collection_name: str, query: dict):
    """Load the documents matching ``query`` into a DataFrame.

    Args:
        db_name: Name of the database to read from.
        collection_name: Name of the collection inside that database.
        query: Filter passed to ``find``; the ``_id`` field is excluded
            from the results.

    Returns:
        The resulting ``DataFrame``, with its ``Date`` column (when there is
        one) formatted as ``dd/mm/YYYY`` strings. If anything raises while
        fetching or formatting, the error is printed and a dict with a
        single ``"ged_insert_status"`` message is returned instead.
    """
    target = self.__mongo_client__[db_name][collection_name]

    try:
        documents = list(target.find(query, {'_id': False}))
        frame = pd.DataFrame(documents)
        if 'Date' not in frame.columns:
            print('No Date column')
        else:
            frame['Date'] = frame['Date'].dt.strftime('%d/%m/%Y')
        return frame

    except Exception as e:
        failure = "Exception in getting meps documents in Mongo" + str(e)
        print(failure)
        return {"ged_insert_status": failure}

78 changes: 78 additions & 0 deletions Code/public_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -579,6 +579,84 @@ async def get_meps_file_selected_fields(current_user: User = Depends(get_current
raise HTTPException(status_code=403, detail=messages.denied_entry)


@app.get("/meps_stats",
         description="get meps stats")
async def get_meps_stats(mep_name: Optional[str] = None,
                         national_political_group: Optional[str] = None,
                         political_group: Optional[str] = None,
                         title: Optional[str] = None,
                         place: Optional[str] = None,
                         meeting_with: Optional[str] = None,
                         start_date: Optional[datetime] = None,
                         end_date: Optional[datetime] = None,
                         current_user: User = Depends(get_current_active_user)):
    """Return meeting statistics for the documents matching the filters.

    Every text filter becomes a case-insensitive "contains" regex on its
    field; a filter left at ``None`` is searched as the empty string. The
    optional dates bound the ``Date`` field (inclusive on both sides).

    Raises:
        HTTPException: 403 when the user's role is neither ``admin`` nor
            ``meps``; 404 when fetching the data or computing the
            statistics raises.
    """
    if current_user.role not in ['admin', 'meps']:
        raise HTTPException(status_code=403, detail=messages.denied_entry)

    db_name = meps_handler.get_mep_db_name()
    collection_name = meps_handler.get_mep_collection_name()

    def wild_card(word_to_search: str):
        # Escape the user's text so it is matched literally inside the regex.
        escaped = re.escape(word_to_search)
        return {"$regex": ".*" + escaped + ".*", "$options": "i"}

    text_filters = (
        ('MEP Name', mep_name),
        ('MEP nationalPoliticalGroup', national_political_group),
        ('MEP politicalGroup', political_group),
        ('Title', title),
        ('Place', place),
        ('Meeting With', meeting_with),
    )
    query = {field: wild_card('' if wanted is None else wanted)
             for field, wanted in text_filters}

    date_bounds = {}
    if start_date:
        date_bounds["$gte"] = start_date
    if end_date:
        date_bounds["$lte"] = end_date
    if date_bounds:
        query['Date'] = date_bounds

    try:
        df = mongo_handler.get_df(db_name=db_name, collection_name=collection_name, query=query)
        return meps_handler.get_stats(df)
    except Exception as e:
        print("get_meps_stats : " + str(e), flush=True)
        raise HTTPException(status_code=404, detail=messages.nok_string_raw)


@app.get("/meps_stats_file",
         description="get meps stats file")
async def get_meps_stats_file(mep_name: Optional[str] = None,
                              national_political_group: Optional[str] = None,
                              political_group: Optional[str] = None,
                              title: Optional[str] = None,
                              place: Optional[str] = None,
                              meeting_with: Optional[str] = None,
                              start_date: Optional[datetime] = None,
                              end_date: Optional[datetime] = None,
                              current_user: User = Depends(get_current_active_user)):
    """Return the statistics of ``get_meps_stats`` as a downloadable workbook.

    The filters are forwarded unchanged to ``get_meps_stats``; its result is
    written to a file by ``meps_handler.get_stats_file`` and that file is
    sent back. The file is deleted once the response has been sent, so
    generated workbooks no longer accumulate on disk.

    Raises:
        HTTPException: 403 when the user's role is neither ``admin`` nor
            ``meps``; 404 when computing the statistics or writing the
            file raises.
    """
    # Imported locally so this endpoint does not rely on module-level imports
    # that may not exist in the file.
    import os
    from starlette.background import BackgroundTask

    if current_user.role in ['admin', 'meps']:
        try:
            data = await get_meps_stats(mep_name,
                                        national_political_group,
                                        political_group,
                                        title,
                                        place,
                                        meeting_with,
                                        start_date,
                                        end_date,
                                        current_user)

            stats_path = meps_handler.get_stats_file(data)
            # The background task runs after the response body is sent.
            return FileResponse(stats_path,
                                background=BackgroundTask(os.remove, stats_path))

        except Exception as e:
            print("get_meps_stats_file : " + str(e), flush=True)
            raise HTTPException(status_code=404, detail=messages.nok_string_raw)
    else:
        raise HTTPException(status_code=403, detail=messages.denied_entry)

@app.post("/logout")
async def logout():
    """Return a fixed message telling the client it has been disconnected."""
    payload = {"message": "Disconnected, please log in again"}
    return payload
Expand Down
7 changes: 5 additions & 2 deletions Code/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ bs4==0.0.1
certifi==2023.11.17
charset-normalizer==3.3.2
click==8.1.3
colorama==0.4.6
dnspython==2.4.2
docx==0.2.4
ecdsa==0.18.0
Expand All @@ -14,7 +13,9 @@ et-xmlfile==1.1.0
fastapi==0.88.0
h11==0.14.0
idna==3.4
lxml==4.9.4
joblib==1.3.2
lxml==4.9.3
nltk==3.8.1
numpy==1.26.2
openpyxl==3.1.2
outcome==1.3.0.post0
Expand All @@ -32,13 +33,15 @@ python-jose==3.3.0
python-json-logger==2.0.7
python-multipart==0.0.5
pytz==2023.3.post1
regex==2023.12.25
requests==2.31.0
rsa==4.9
six==1.16.0
sniffio==1.3.0
sortedcontainers==2.4.0
soupsieve==2.5
starlette==0.22.0
tqdm==4.66.1
trio==0.23.2
trio-websocket==0.11.1
typing_extensions==4.4.0
Expand Down

0 comments on commit ba3d27f

Please sign in to comment.