dunnings_coefficients.py

# Reads all volumes meeting a given set of criteria,
# and uses a leave-one-out strategy to distinguish
# reviewed volumes (class 1) from random
# class 0. In cases where an author occurs more
# than once in the dataset, it leaves out all
# volumes by that author whenever making a prediction
# about one of them.

import numpy as np
import pandas as pd
import csv, os, random, sys
from collections import Counter
from multiprocessing import Pool
from sklearn.linear_model import LogisticRegression
import modelingprocess
import metafilter
from scipy.stats import norm
import matplotlib.pyplot as plt
import math
from numpy import array, asarray, ma, zeros, sum
import scipy.special as special
import scipy.linalg as linalg
from scipy.stats import rankdata, tiecorrect, distributions

usedate = False
# Leave this flag false unless you plan major
# surgery to reactivate the currently-deprecated
# option to use "date" as a predictive feature.

# There are three different date types we can use.
# Choose which here.

# FUNCTIONS GET DEFINED BELOW.

def infer_date(metadictentry, datetype):
    if datetype == 'pubdate':
        return metadictentry[datetype]
    elif datetype == 'firstpub':
        firstpub = metadictentry['firstpub']
        if firstpub > 1700 and firstpub < 1950:
            return firstpub
        else:
            return metadictentry['pubdate']
    else:
        sys.exit(0)

def appendif(key, value, dictionary):
    if key in dictionary:
        dictionary[key].append(value)
    else:
        dictionary[key] = [value]

# Clean this up and make it unnecessary.

def dirty_pairtree(htid):
    ''' Changes a 'clean' HathiTrust ID (with only chars that are
    legal in filenames) into a 'clean' version of the same name
    (which may contain illegal chars.)
    '''
    period = htid.find('.')
    prefix = htid[0:period]
    postfix = htid[(period+1): ]
    if '=' in postfix:
        postfix = postfix.replace('+',':')
        postfix = postfix.replace('=','/')
    dirtyname = prefix + "." + postfix
    return dirtyname

def forceint(astring):
    try:
        intval = int(astring)
    except:
        intval = 0

    return intval

def get_features(wordcounts, wordlist):
    numwords = len(wordlist)
    wordvec = np.zeros(numwords)
    for idx, word in enumerate(wordlist):
        if word in wordcounts:
            wordvec[idx] = wordcounts[word]

    return wordvec

# In an earlier version of this script, we sometimes used
# "publication date" as a feature, to see what would happen.
# In the current version, we don't. Some of the functions
# and features remain, but they are deprecated. E.g.:

def get_features_with_date(wordcounts, wordlist, date, totalcount):
    numwords = len(wordlist)
    wordvec = np.zeros(numwords + 1)
    for idx, word in enumerate(wordlist):
        if word in wordcounts:
            wordvec[idx] = wordcounts[word]

    wordvec = wordvec / (totalcount + 0.0001)
    wordvec[numwords] = date
    return wordvec

def sliceframe(dataframe, yvals, excludedrows, testrow):
    numrows = len(dataframe)
    newyvals = list(yvals)
    for i in excludedrows:
        del newyvals[i]
        # NB: This only works if we assume that excluded rows
        # has already been sorted in descending order !!!!!!!

    trainingset = dataframe.drop(dataframe.index[excludedrows])

    newyvals = np.array(newyvals)
    testset = dataframe.iloc[testrow]

    return trainingset, newyvals, testset

def normalizearray(featurearray, usedate):
    '''Normalizes an array by centering on means and
    scaling by standard deviations. Also returns the
    means and standard deviations for features, so that
    they can be pickled.
    '''

    numinstances, numfeatures = featurearray.shape
    means = list()
    stdevs = list()
    lastcolumn = numfeatures - 1
    for featureidx in range(numfeatures):

        thiscolumn = featurearray.iloc[ : , featureidx]
        thismean = np.mean(thiscolumn)

        thisstdev = np.std(thiscolumn)

        if (not usedate) or featureidx != lastcolumn:
            # If we're using date we don't normalize the last column.
            means.append(thismean)
            stdevs.append(thisstdev)
            featurearray.iloc[ : , featureidx] = (thiscolumn - thismean) / thisstdev
        else:
            print('FLAG')
            means.append(thismean)
            thisstdev = 0.1
            stdevs.append(thisstdev)
            featurearray.iloc[ : , featureidx] = (thiscolumn - thismean) / thisstdev
            # We set a small stdev for date.

    return featurearray, means, stdevs

def binormal_select(vocablist, positivecounts, negativecounts, totalpos, totalneg, k):
    ''' A feature-selection option, not currently in use.
    '''
    all_scores = np.zeros(len(vocablist))

    for idx, word in enumerate(vocablist):
        # For each word we create a vector the length of vols in each class
        # that contains real counts, plus zeroes for all those vols not
        # represented.

        positives = np.zeros(totalpos, dtype = 'int64')
        if word in positivecounts:
            positives[0: len(positivecounts[word])] = positivecounts[word]

        negatives = np.zeros(totalneg, dtype = 'int64')
        if word in negativecounts:
            negatives[0: len(negativecounts[word])] = negativecounts[word]

        featuremean = np.mean(np.append(positives, negatives))

        tp = sum(positives > featuremean)
        fp = sum(positives <= featuremean)
        tn = sum(negatives > featuremean)
        fn = sum(negatives <= featuremean)
        tpr = tp/(tp+fn) # true positive ratio
        fpr = fp/(fp+tn) # false positive ratio

        bns_score = abs(norm.ppf(tpr) - norm.ppf(fpr))
        # See Forman

        if np.isinf(bns_score) or np.isnan(bns_score):
            bns_score = 0

        all_scores[idx] = bns_score

    zipped = [x for x in zip(all_scores, vocablist)]
    zipped.sort(reverse = True)
    with open('bnsscores.tsv', mode='w', encoding = 'utf-8') as f:
        for score, word in zipped:
            f.write(word + '\t' + str(score) + '\n')

    return [x[1] for x in zipped[0:k]]

def mannwhitneyu(x, y, use_continuity=True):
    """
    Computes the Mann-Whitney rank test on samples x and y.

    Parameters
    ----------
    x, y : array_like
        Array of samples, should be one-dimensional.
    use_continuity : bool, optional
            Whether a continuity correction (1/2.) should be taken into
            account. Default is True.

    Returns
    -------
    u : float
        The Mann-Whitney statistics.
    prob : float
        One-sided p-value assuming a asymptotic normal distribution.

    Notes
    -----
    Use only when the number of observation in each sample is > 20 and
    you have 2 independent samples of ranks. Mann-Whitney U is
    significant if the u-obtained is LESS THAN or equal to the critical
    value of U.

    This test corrects for ties and by default uses a continuity correction.
    The reported p-value is for a one-sided hypothesis, to get the two-sided
    p-value multiply the returned p-value by 2.

    """
    x = asarray(x)
    y = asarray(y)
    n1 = len(x)
    n2 = len(y)
    ranked = rankdata(np.concatenate((x,y)))
    rankx = ranked[0:n1]       # get the x-ranks
    u1 = n1*n2 + (n1*(n1+1))/2.0 - np.sum(rankx,axis=0)  # calc U for x
    u2 = n1*n2 - u1                            # remainder is U for y
    bigu = max(u1,u2)
    smallu = min(u1,u2)
    T = tiecorrect(ranked)
    if T == 0:
        raise ValueError('All numbers are identical in amannwhitneyu')
    sd = np.sqrt(T*n1*n2*(n1+n2+1)/12.0)

    if use_continuity:
        # normal approximation for prob calc with continuity correction
        z = abs((bigu-0.5-n1*n2/2.0) / sd)
    else:
        z = abs((bigu-n1*n2/2.0) / sd)  # normal approximation for prob calc
    return smallu, distributions.norm.sf(z)  # (1.0 - zprob(z))

def dunnings(random, randsum, reviewed, revsum, colidx):
    randword = sum(random.iloc[ : , colidx])
    revword = sum(reviewed.iloc[ : , colidx])
    wordsum = randword + revword
    expected_rand = randsum * ( wordsum / (randsum + revsum) )
    expected_rev = revsum * ( wordsum / (randsum + revsum) )
    G2 = 0

    if randword > 0:
        G2 += randword * math.log(randword / expected_rand)
    if revword > 0:
        G2 += revword * math.log(revword / expected_rev)

    positives = reviewed.iloc[ : , colidx]
    negatives = random.iloc[ : , colidx]
    featuremean = np.mean(np.append(positives, negatives))

    tp = sum(positives > featuremean)
    fp = sum(positives <= featuremean)
    tn = sum(negatives > featuremean)
    fn = sum(negatives <= featuremean)
    tpr = tp/(tp+fn) # true positive ratio
    fpr = fp/(fp+tn) # false positive ratio

    bns_score = norm.ppf(tpr) - norm.ppf(fpr)
    # See Forman

    if np.isinf(bns_score) or np.isnan(bns_score):
        bns_score = 0

    u, p = mannwhitneyu(positives, negatives)
    p = 0.05 / p
    if revword / expected_rev < 1:
        G2 = -G2
        p = -p
        # we reverse the sign of dunnings if this word is
        # UNDERrepresented in the reviewed set.

    ratio = (randword / randsum) /  (revword / revsum)

    return G2, bns_score, ratio, u, p

def make_dunnings(paths, exclusions, thresholds, classifyconditions):
    ''' This is the main function in the module.
    It can be called externally; it's also called
    if the module is run directly.
    '''

    sourcefolder, extension, classpath, outputpath = paths
    excludeif, excludeifnot, excludebelow, excludeabove, sizecap = exclusions
    pastthreshold, futurethreshold = thresholds
    category2sorton, positive_class, datetype, numfeatures, regularization = classifyconditions

    verbose = False

    if not sourcefolder.endswith('/'):
        sourcefolder = sourcefolder + '/'

    # This just makes things easier.

    # Get a list of files.
    allthefiles = os.listdir(sourcefolder)
    # random.shuffle(allthefiles)

    volumeIDs = list()
    volumepaths = list()

    for filename in allthefiles:

        if filename.endswith(extension):
            volID = filename.replace(extension, "")
            # The volume ID is basically the filename minus its extension.
            # Extensions are likely to be long enough that there is little
            # danger of accidental occurrence inside a filename. E.g.
            # '.fic.tsv'
            path = sourcefolder + filename
            volumeIDs.append(volID)
            volumepaths.append(path)

    metadict = metafilter.get_metadata(classpath, volumeIDs, excludeif, excludeifnot, excludebelow, excludeabove)

    # Now that we have a list of volumes with metadata, we can select the groups of IDs
    # that we actually intend to contrast. If we want to us more or less everything,
    # this may not be necessary. But in some cases we want to use randomly sampled subsets.

    # The default condition here is

    # category2sorton = 'reviewed'
    # positive_class = 'rev'
    # sizecap = 350
    # A sizecap less than one means, no sizecap.

    IDsToUse, classdictionary = metafilter.label_classes(metadict, category2sorton, positive_class, sizecap)

    # make a vocabulary list and a volsize dict
    wordcounts = Counter()

    volspresent = list()
    orderedIDs = list()

    positivecounts = dict()
    negativecounts = dict()

    for volid, volpath in zip(volumeIDs, volumepaths):
        if volid not in IDsToUse:
            continue
        else:
            volspresent.append((volid, volpath))
            orderedIDs.append(volid)

        date = infer_date(metadict[volid], datetype)
        if date < pastthreshold or date > futurethreshold:
            continue
        else:
            with open(volpath, encoding = 'utf-8') as f:
                for line in f:
                    fields = line.strip().split('\t')
                    if len(fields) > 2 or len(fields) < 2:
                        # print(line)
                        continue
                    word = fields[0]
                    if len(word) > 0 and word[0].isalpha():
                        count = int(fields[1])
                        wordcounts[word] += 1
                        # for initial feature selection we use the number of
                        # *documents* that contain a given word,
                        # so it's just +=1.

    vocablist = [x[0] for x in wordcounts.most_common(numfeatures)]

    # vocablist = binormal_select(vocablist, positivecounts, negativecounts, totalposvols, totalnegvols, 3000)
    # Feature selection is deprecated. There are cool things
    # we could do with feature selection,
    # but they'd improve accuracy by 1% at the cost of complicating our explanatory task.
    # The tradeoff isn't worth it. Explanation is more important.
    # So we just take the most common words (by number of documents containing them)
    # in the whole corpus. Technically, I suppose, we could crossvalidate that as well,
    # but *eyeroll*.

    donttrainon = list()

    # Here we create a list of volumed IDs not to be used for training.
    # For instance, we have supplemented the dataset with volumes that
    # are in the Norton but that did not actually occur in random
    # sampling. We want to make predictions for these, but never use
    # them for training.

    for idx1, anid in enumerate(orderedIDs):
        reviewedstatus = metadict[anid]['reviewed']
        date = infer_date(metadict[anid], datetype)
        if reviewedstatus == 'addedbecausecanon':
            donttrainon.append(idx1)
        elif date < pastthreshold or date > futurethreshold:
            donttrainon.append(idx1)

    authormatches = [list(donttrainon) for x in range(len(orderedIDs))]
    # For every index in authormatches, identify a set of indexes that have
    # the same author. Obvs, there will always be at least one.

    # Since we are going to use these indexes to exclude rows, we also add
    # all the ids in donttrainon to every volume

    for idx1, anid in enumerate(orderedIDs):
        thisauthor = metadict[anid]['author']
        for idx2, anotherid in enumerate(orderedIDs):
            otherauthor = metadict[anotherid]['author']
            if thisauthor == otherauthor and not idx2 in authormatches[idx1]:
                authormatches[idx1].append(idx2)

    for alist in authormatches:
        alist.sort(reverse = True)

    # I am reversing the order of indexes so that I can delete them from
    # back to front, without changing indexes yet to be deleted.
    # This will become important in the modelingprocess module.

    randomdata = list()
    revieweddata = list()

    for volid, volpath in volspresent:

        with open(volpath, encoding = 'utf-8') as f:
            voldict = dict()
            totalcount = 0
            for line in f:
                fields = line.strip().split('\t')
                if len(fields) > 2 or len(fields) < 2:
                    continue

                word = fields[0]
                count = int(fields[1])
                voldict[word] = count
                totalcount += count

        date = infer_date(metadict[volid], datetype)
        date = date - 1700
        if date < 0:
            date = 0

        classflag = classdictionary[volid]
        features = get_features(voldict, vocablist)
        if classflag == 0:
            randomdata.append(features)
        else:
            revieweddata.append(features)

    randomdata = pd.DataFrame(randomdata)
    revieweddata = pd.DataFrame(revieweddata)

    randomrows = randomdata.shape[0]
    randsum = 0
    for i in range(randomrows):
        randsum += sum(randomdata.iloc[i, : ])

    reviewedrows = revieweddata.shape[0]
    revsum = 0
    for i in range(reviewedrows):
        revsum += sum(revieweddata.iloc[i, : ])

    dunningdict = dict()
    for idx, word in enumerate(vocablist):
        signed_dunnings, bns, ratio, mwu, mwp = dunnings(randomdata, randsum, revieweddata, revsum, idx)
        dunningdict[word] = (signed_dunnings, bns, ratio, mwu, mwp)

    with open('dunnings.csv', mode = 'w', encoding = 'utf-8') as f:
        writer = csv.DictWriter(f, fieldnames = ['word', 'dunnings', 'bns', 'ratio', 'mwu', 'mwp'])
        writer.writeheader()
        for word, value in dunningdict.items():
            row = dict()
            row['word'] = word
            row['dunnings'] = value[0]
            row['bns'] = value[1]
            row['ratio'] = value[2]
            row['mwu'] = value[3]
            row['mwp'] = value[4]
            writer.writerow(row)

def diachronic_tilt(allvolumes, modeltype, datelimits):
    ''' Takes a set of predictions produced by a model that knows nothing about date,
    and divides it along a line with a diachronic tilt. We need to do this in a way
    that doesn't violate crossvalidation. I.e., we shouldn't "know" anything
    that the model didn't know. We tried a couple of different ways to do this, but
    the simplest and actually most reliable is to divide the whole dataset along a
    linear central trend line for the data!
    '''

    listofrows = list()
    classvector = list()

    # DEPRECATED
    # if modeltype == 'logistic' and len(datelimits) == 2:
    #     # In this case we construct a subset of data to model on.
    #     tomodeldata = list()
    #     tomodelclasses = list()
    #     pastthreshold, futurethreshold = datelimits

    for volume in allvolumes:
        date = volume[3]
        logistic = volume[8]
        realclass = volume[13]
        listofrows.append([logistic, date])
        classvector.append(realclass)

        # DEPRECATED
        # if modeltype == 'logistic' and len(datelimits) == 2:
        #     if date >= pastthreshold and date <= futurethreshold:
        #         tomodeldata.append([logistic, date])
        #         tomodelclasses.append(realclass)

    y, x = [a for a in zip(*listofrows)]
    plt.axis([min(x) - 2, max(x) + 2, min(y) - 0.02, max(y) + 0.02])
    reviewedx = list()
    reviewedy = list()
    randomx = list()
    randomy = list()

    for idx, reviewcode in enumerate(classvector):
        if reviewcode == 1:
            reviewedx.append(x[idx])
            reviewedy.append(y[idx])
        else:
            randomx.append(x[idx])
            randomy.append(y[idx])

    plt.plot(reviewedx, reviewedy, 'ro')
    plt.plot(randomx, randomy, 'k+')

    if modeltype == 'logistic':
        # all this is DEPRECATED
        print("Hey, you're attempting to use the logistic-tilt option")
        print("that we deactivated. Go in and uncomment the code.")

        # if len(datelimits) == 2:
        #     data = pd.DataFrame(tomodeldata)
        #     responsevariable = tomodelclasses
        # else:
        #     data = pd.DataFrame(listofrows)
        #     responsevariable = classvector

        # newmodel = LogisticRegression(C = 100000)
        # newmodel.fit(data, responsevariable)
        # coefficients = newmodel.coef_[0]

        # intercept = newmodel.intercept_[0] / (-coefficients[0])
        # slope = coefficients[1] / (-coefficients[0])

        # p = np.poly1d([slope, intercept])

    elif modeltype == 'linear':
        # what we actually do

        z = np.polyfit(x, y, 1)
        p = np.poly1d(z)
        slope = z[0]
        intercept = z[1]

    plt.plot(x,p(x),"b-")
    plt.show(block = False)

    x = np.array(x, dtype='float64')
    y = np.array(y, dtype='float64')
    classvector = np.array(classvector)
    dividingline = intercept + (x * slope)
    predicted_as_reviewed = (y > dividingline)
    really_reviewed = (classvector == 1)

    accuracy = sum(predicted_as_reviewed == really_reviewed) / len(classvector)

    return accuracy

if __name__ == '__main__':

    # If this class is called directly, it creates a single model using the default
    # settings set below.

    ## PATHS.

    # sourcefolder = '/Users/tunder/Dropbox/GenreProject/python/reception/fiction/texts/'
    # extension = '.fic.tsv'
    # classpath = '/Users/tunder/Dropbox/GenreProject/python/reception/fiction/masterficmeta.csv'
    # outputpath = '/Users/tunder/Dropbox/GenreProject/python/reception/fiction/predictions.csv'

    sourcefolder = 'poems/'
    extension = '.poe.tsv'
    classpath = 'poemeta.csv'
    outputpath = 'logisticpredictions.csv'

    # We can simply exclude volumes from consideration on the basis on any
    # metadata category we want, using the dictionaries defined below.

    ## EXCLUSIONS.

    excludeif = dict()
    excludeif['pubname'] = 'TEM'
    # We're not using reviews from Tait's.

    excludeif['recept'] = 'addcanon'
    # We don't ordinarily include canonical volumes that were not in either sample.
    # These are included only if we're testing the canon specifically.

    excludeifnot = dict()
    excludeabove = dict()
    excludebelow = dict()

    excludebelow['inferreddate'] = 1800
    excludeabove['inferreddate'] = 1950
    sizecap = 360

    # For more historically-interesting kinds of questions, we can limit the part
    # of the dataset that gets TRAINED on, while permitting the whole dataset to
    # be PREDICTED. (Note that we always exclude authors from their own training
    # set; this is in addition to that.) The variables futurethreshold and
    # pastthreshold set the chronological limits of the training set, inclusive
    # of the threshold itself.

    ## THRESHOLDS

    pastthreshold = -1
    futurethreshold = 2000

    # CLASSIFY CONDITIONS

    positive_class = 'rev'
    category2sorton = 'reviewed'
    datetype = 'firstpub'
    numfeatures = 3200
    regularization = .00007


    paths = (sourcefolder, extension, classpath, outputpath)
    exclusions = (excludeif, excludeifnot, excludebelow, excludeabove, sizecap)
    thresholds = (pastthreshold, futurethreshold)
    classifyconditions = (category2sorton, positive_class, datetype, numfeatures, regularization)

    make_dunnings(paths, exclusions, thresholds, classifyconditions)