Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix Whatsapp parsing, hopefully #50

Merged
merged 2 commits into from
Jan 22, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion export.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def get_f_name(compressed):
elif args.format == 'csv':
df.to_csv(f_name, index=False, compression=compression)
elif args.format == 'pkl':
with open(f_name, 'wb') as f:
with open(f_name, 'wb') as f:
pickle.dump(df, f)
else:
raise Exception(f'Format {args.format} is not supported.')
Expand Down
7 changes: 4 additions & 3 deletions parse.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import argparse
import sys
import logging.config
import sys

from parsers.config import config
from utils import ArgParseDefault

Expand All @@ -24,7 +24,8 @@ def add_common_parse_arguments(parser):


def str2bool(v):
return v != 'false';
return v != 'false'


class ArgParse():
def __init__(self):
Expand Down
2 changes: 1 addition & 1 deletion parsers/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
def get_config():
log = logging.getLogger(__name__)
# basic config
with open('config.yml', 'r') as config_file:
with open('config.yml', 'r', encoding="utf8") as config_file:
config = yaml.safe_load(config_file)
# secrets
for env_var in ['TELEGRAM_API_ID', 'TELEGRAM_API_HASH', 'TELEGRAM_PHONE']:
Expand Down
4 changes: 2 additions & 2 deletions parsers/messenger.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def parse_messages(file_path, own_name):
conversation_id = root.split('/')[-1]
conversation_with_name = None
document = os.path.join(root, filename)
with open(document) as f:
with open(document, encoding="utf8") as f:
json_data = json.load(f)
if "messages" not in json_data or "participants" not in json_data:
log.warning(f"Missing messages or participant list in conversation {conversation_id}")
Expand Down Expand Up @@ -89,7 +89,7 @@ def infer_own_name(file_path, min_conversations=2):
if not filename.endswith('.json'):
continue
document_path = os.path.join(root, filename)
with open(document_path, 'r') as f:
with open(document_path, 'r', encoding="utf8") as f:
json_data = json.load(f)
if "participants" not in json_data:
continue
Expand Down
28 changes: 16 additions & 12 deletions parsers/whatsapp.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,28 @@
import pandas as pd
from parsers.utils import export_dataframe, detect_language
from parsers.config import config
import logging
import glob
import logging
import os
from collections import defaultdict

import pandas as pd
import re
import math
from datetime import datetime
import uuid
from collections import defaultdict
from tqdm import tqdm

from parsers.config import config
from parsers.utils import export_dataframe, detect_language

log = logging.getLogger(__name__)
regex_left = r'[\u0000-\u001F\u0100-\uFFFF]?'
regex_datetime = r'[^\w]?([0-9./\-]{6,10},?[\sT][0-9:]{5,8})[^\w]?\s[\-]?\s?'
regex_right = r'(([^:]+):\s)?(.*)'
regex_message = re.compile(f'^{regex_left}{regex_datetime}{regex_right}$')
MAX_EXPORTED_MESSAGES = 1000000


def infer_datetime_regex(f_path, max_messages=100):
regex_message = re.compile(f'^{regex_left}({regex_datetime}){regex_right}$')
patterns = defaultdict(int)
with open(f_path, 'r') as f:
with open(f_path, 'r', encoding="utf8") as f:
for c, line in enumerate(f):
if c == max_messages:
break
Expand All @@ -31,7 +32,7 @@ def infer_datetime_regex(f_path, max_messages=100):
first = True
last = 0
nums = 0
for i,l in enumerate(matches.group(1)):
for i, l in enumerate(matches.group(1)):
if l in '0123456789':
if first:
pattern += '('
Expand All @@ -58,6 +59,7 @@ def infer_datetime_regex(f_path, max_messages=100):
regex_dt = regex_datetime
return re.compile(f'^{regex_left}{regex_dt}{regex_right}$')


def main(own_name, file_path, max_exported_messages, infer_datetime):
global MAX_EXPORTED_MESSAGES
MAX_EXPORTED_MESSAGES = max_exported_messages
Expand All @@ -81,6 +83,7 @@ def main(own_name, file_path, max_exported_messages, infer_datetime):
export_dataframe(df, config['whatsapp']['OUTPUT_PICKLE_NAME'])
log.info('Done.')


def parse_messages(files, own_name, infer_datetime):
data = []
for f_path in files:
Expand All @@ -92,8 +95,8 @@ def parse_messages(files, own_name, infer_datetime):
text = None
if infer_datetime:
regex_message = infer_datetime_regex(f_path)
num_lines = sum(1 for _ in open(f_path, 'r'))
with open(f_path, 'r') as f:
num_lines = sum(1 for _ in open(f_path, 'r', encoding="utf8"))
with open(f_path, 'r', encoding="utf8") as f:
for line in tqdm(f, total=num_lines):
# try to extract meta data from line
matches = regex_message.search(line)
Expand Down Expand Up @@ -147,6 +150,7 @@ def parse_messages(files, own_name, infer_datetime):
data.extend(conversation_data)
return data


def infer_own_name(files, min_conversations=2):
"""Infers own name from multiple conversations by finding the person who participated most in the conversations"""
if len(files) < min_conversations:
Expand All @@ -157,7 +161,7 @@ def infer_own_name(files, min_conversations=2):
log.info('Trying to infer own_name from data...')
for f_path in files:
participants = set()
with open(f_path, 'r') as f:
with open(f_path, 'r', encoding="utf8") as f:
for line in f:
matches = regex_message.search(line)
if not matches or not matches.group(3):
Expand Down
2 changes: 1 addition & 1 deletion visualizers/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def get_stopwords(stopword_paths):
stopwords = []
for stopword_path in stopword_paths:
log.info(f'Loading stopwords from {stopword_path}...')
with open(stopword_path, 'r') as f:
with open(stopword_path, 'r', encoding="utf8") as f:
stopword_data = json.load(f)
stopwords.extend(stopword_data)
stopwords = list(set(stopwords))
Expand Down