-
Notifications
You must be signed in to change notification settings - Fork 0
/
analysis.py
123 lines (83 loc) · 3.09 KB
/
analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# %%
## Importing packages
import numpy as np
import pandas as pd
import re
import os
import sys
import time
import json
import pickle
from pathlib import Path
import matplotlib.pyplot as plt
# %%
## Reading pickles
input_dir = 'data/processed/'
# %%
## Reading filtered movies_df from pickle
# Context manager closes the handle; the original `open()` here leaked it
# (and was inconsistent with the with-block used for bechdel below).
with open('pickles/fmovies_df.pkl', 'rb') as f:
    fmovies_df = pickle.load(f)
# %%
## Reading bechdel_df
with open('pickles/bechdel.pkl', 'rb') as f:
    bechdel_df = pickle.load(f)
# %%
## Adding paragraph and dialogue count to primary dataframe
# Counting pgs and dgs: the input is a JSON-lines file, one movie per line.
# For each movie we record: id, paragraph count, total dialogue count, and
# the count of dialogues attributed to a named character ('NA' = unattributed).
pg_dg_count = []
with open('data/movie_dialogues.txt', 'r') as f:
    for row in f:
        movie_json = json.loads(row)
        paragraphs = movie_json['paragraphs']
        # Total dialogues across all paragraphs.
        dg_len = sum(len(pg['dialogues']) for pg in paragraphs)
        # Dialogues whose speaker is identified.
        char_dg_len = sum(
            1
            for pg in paragraphs
            for dg in pg['dialogues']
            if dg['character'] != 'NA'
        )
        pg_dg_count.append([movie_json['movie_id'], len(paragraphs), dg_len, char_dg_len])
# %%
# Creating df, indexed by movie_id so the index-based merges below line up.
pg_dg_count_df = pd.DataFrame(pg_dg_count, columns=['movie_id', 'pg_count', 'dg_count', 'char_dg_count'])
pg_dg_count_df = pg_dg_count_df.set_index('movie_id', drop=True)
# %%
## Merging with primary movie df and bechdel_df
# Inner joins on the shared index (movie_id): only movies present in all
# three frames survive. Instance-method form instead of the unbound
# `pd.DataFrame.merge(...)` call.
analysis_df = fmovies_df.merge(pg_dg_count_df, how='inner', left_index=True, right_index=True)
analysis_df = analysis_df.merge(bechdel_df, how='inner', left_index=True, right_index=True)
# %%
## Dropping duplicates: only keeping the script for a movie that had the most dialogues and paragraphs
# NOTE(review): if two scripts for the same imdb_id tie on both counts, both
# rows are kept — confirm whether a hard tiebreak is needed.
analysis_df = analysis_df[analysis_df['dg_count'] == analysis_df.groupby('imdb_id')['dg_count'].transform('max')]
analysis_df = analysis_df[analysis_df['pg_count'] == analysis_df.groupby('imdb_id')['pg_count'].transform('max')]
# %%
## Looking at the number of dialogues per movie, ascending (heading previously
## said "paragraphs" but this prints dg_count)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(analysis_df['dg_count'].sort_values(ascending=True))
# %%
## Drop the lowest 1% quantile by dialogue count (likely broken/near-empty scripts)
analysis_df = analysis_df[analysis_df['dg_count'] > analysis_df['dg_count'].quantile(0.01)]
# %%
## Saving df to pickle
with open('pickles/analysis.pkl', 'wb') as pkl_out:
    pickle.dump(analysis_df, pkl_out)
# %%
## Saving to csv for further analysis
analysis_df.to_csv('data/analysis.csv')
# %%
## Loading back pickle — lets the cells below run without re-executing the
## expensive counting/merging cells above.
with open('pickles/analysis.pkl', 'rb') as pkl_in:
    analysis_df = pickle.load(pkl_in)
# %%
## Looking at movies per year
# Compute the per-year counts once (the original computed value_counts()
# twice); plt.hist bins the raw values itself, so no pre-sort is needed.
year_counts = analysis_df['imdb_year'].value_counts().sort_index()
plt.hist(analysis_df['imdb_year'], bins=100)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(year_counts)
# %%
## Save to csv for further analysis
year_counts.to_csv("data/hist.csv")
# %%
# Select the two columns first, then copy — avoids duplicating the full frame.
analysis_df_test = analysis_df[['imdb_year', 'not_count']].copy()
# %%
# Boolean flag: movie has at least one "not" occurrence
# (presumably a Bechdel-related count — TODO confirm upstream meaning).
analysis_df_test['not_count'] = analysis_df_test['not_count'] > 0
# %%
# Movies per (year, flag) pair, exported for time-series analysis elsewhere.
analysis_df_test.groupby(['imdb_year', 'not_count']).size().to_csv('data/timeseries.csv')
# %%