Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix Whatsapp parsing, hopefully #50

Merged
merged 2 commits into from
Jan 22, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion export.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def get_f_name(compressed):
elif args.format == 'csv':
df.to_csv(f_name, index=False, compression=compression)
elif args.format == 'pkl':
with open(f_name, 'wb') as f:
with open(f_name, 'wb') as f:
pickle.dump(df, f)
else:
raise Exception(f'Format {args.format} is not supported.')
Expand Down
7 changes: 4 additions & 3 deletions parse.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import argparse
import sys
import logging.config
import sys

from parsers.config import config
from utils import ArgParseDefault

Expand All @@ -24,7 +24,8 @@ def add_common_parse_arguments(parser):


def str2bool(v):
return v != 'false';
return v != 'false'


class ArgParse():
def __init__(self):
Expand Down
2 changes: 1 addition & 1 deletion parsers/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
def get_config():
log = logging.getLogger(__name__)
# basic config
with open('config.yml', 'r') as config_file:
with open('config.yml', 'r', encoding="utf8") as config_file:
config = yaml.safe_load(config_file)
# secrets
for env_var in ['TELEGRAM_API_ID', 'TELEGRAM_API_HASH', 'TELEGRAM_PHONE']:
Expand Down
4 changes: 2 additions & 2 deletions parsers/messenger.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def parse_messages(file_path, own_name):
conversation_id = root.split('/')[-1]
conversation_with_name = None
document = os.path.join(root, filename)
with open(document) as f:
with open(document, encoding="utf8") as f:
json_data = json.load(f)
if "messages" not in json_data or "participants" not in json_data:
log.warning(f"Missing messages or participant list in conversation {conversation_id}")
Expand Down Expand Up @@ -89,7 +89,7 @@ def infer_own_name(file_path, min_conversations=2):
if not filename.endswith('.json'):
continue
document_path = os.path.join(root, filename)
with open(document_path, 'r') as f:
with open(document_path, 'r', encoding="utf8") as f:
json_data = json.load(f)
if "participants" not in json_data:
continue
Expand Down
28 changes: 16 additions & 12 deletions parsers/whatsapp.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,28 @@
import pandas as pd
from parsers.utils import export_dataframe, detect_language
from parsers.config import config
import logging
import glob
import logging
import os
from collections import defaultdict

import pandas as pd
import re
import math
from datetime import datetime
import uuid
from collections import defaultdict
from tqdm import tqdm

from parsers.config import config
from parsers.utils import export_dataframe, detect_language

log = logging.getLogger(__name__)
regex_left = r'[\u0000-\u001F\u0100-\uFFFF]?'
regex_datetime = r'[^\w]?([0-9./\-]{6,10},?[\sT][0-9:]{5,8})[^\w]?\s[\-]?\s?'
regex_right = r'(([^:]+):\s)?(.*)'
regex_message = re.compile(f'^{regex_left}{regex_datetime}{regex_right}$')
MAX_EXPORTED_MESSAGES = 1000000


def infer_datetime_regex(f_path, max_messages=100):
regex_message = re.compile(f'^{regex_left}({regex_datetime}){regex_right}$')
patterns = defaultdict(int)
with open(f_path, 'r') as f:
with open(f_path, 'r', encoding="utf8") as f:
for c, line in enumerate(f):
if c == max_messages:
break
Expand All @@ -31,7 +32,7 @@ def infer_datetime_regex(f_path, max_messages=100):
first = True
last = 0
nums = 0
for i,l in enumerate(matches.group(1)):
for i, l in enumerate(matches.group(1)):
if l in '0123456789':
if first:
pattern += '('
Expand All @@ -58,6 +59,7 @@ def infer_datetime_regex(f_path, max_messages=100):
regex_dt = regex_datetime
return re.compile(f'^{regex_left}{regex_dt}{regex_right}$')


def main(own_name, file_path, max_exported_messages, infer_datetime):
global MAX_EXPORTED_MESSAGES
MAX_EXPORTED_MESSAGES = max_exported_messages
Expand All @@ -81,6 +83,7 @@ def main(own_name, file_path, max_exported_messages, infer_datetime):
export_dataframe(df, config['whatsapp']['OUTPUT_PICKLE_NAME'])
log.info('Done.')


def parse_messages(files, own_name, infer_datetime):
data = []
for f_path in files:
Expand All @@ -92,8 +95,8 @@ def parse_messages(files, own_name, infer_datetime):
text = None
if infer_datetime:
regex_message = infer_datetime_regex(f_path)
num_lines = sum(1 for _ in open(f_path, 'r'))
with open(f_path, 'r') as f:
num_lines = sum(1 for _ in open(f_path, 'r', encoding="utf8"))
with open(f_path, 'r', encoding="utf8") as f:
for line in tqdm(f, total=num_lines):
# try to extract meta data from line
matches = regex_message.search(line)
Expand Down Expand Up @@ -147,6 +150,7 @@ def parse_messages(files, own_name, infer_datetime):
data.extend(conversation_data)
return data


def infer_own_name(files, min_conversations=2):
"""Infers own name from multiple conversations by finding the person who participated most in the conversations"""
if len(files) < min_conversations:
Expand All @@ -157,7 +161,7 @@ def infer_own_name(files, min_conversations=2):
log.info('Trying to infer own_name from data...')
for f_path in files:
participants = set()
with open(f_path, 'r') as f:
with open(f_path, 'r', encoding="utf8") as f:
for line in f:
matches = regex_message.search(line)
if not matches or not matches.group(3):
Expand Down
2 changes: 1 addition & 1 deletion visualizers/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def get_stopwords(stopword_paths):
stopwords = []
for stopword_path in stopword_paths:
log.info(f'Loading stopwords from {stopword_path}...')
with open(stopword_path, 'r') as f:
with open(stopword_path, 'r', encoding="utf8") as f:
stopword_data = json.load(f)
stopwords.extend(stopword_data)
stopwords = list(set(stopwords))
Expand Down