Module sentspace.utils.misc

Source code
import concurrent.futures
import hashlib
from functools import partial
from itertools import chain
from time import time

import nltk
import numpy as np
import scipy.io as sio
from tqdm import tqdm


_START_TIME = time()


def START_TIME():
    """Return the wall-clock time at which this module was imported."""
    return _START_TIME


# download NLTK data if not already downloaded
def download_nltk_resources():
    """Download required NLTK data packages if they are not already present."""
    for category, nltk_resource in [
        ("taggers", "averaged_perceptron_tagger"),
        ("corpora", "wordnet"),
        ("tokenizers", "punkt"),
    ]:
        try:
            nltk.data.find(category + "/" + nltk_resource)
        except LookupError:
            try:
                nltk.download(nltk_resource)
            except FileExistsError:
                # another process may have created the download directory already
                pass


def md5(fname_or_raw, raw=False) -> str:
    """Return the md5 checksum (hex digest) of a file's contents.

    fname_or_raw (str): path to the file to hash, or, if raw=True,
        the raw string to hash directly.
    """
    hash_md5 = hashlib.md5()
    if raw:
        hash_md5.update(fname_or_raw.encode("utf-8"))
        return hash_md5.hexdigest()
    # read the file in 4 KiB chunks to keep memory use constant
    with open(fname_or_raw, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()


def sha1(ob):
    """Return the sha1 hex digest of repr(ob)."""
    hash_object = hashlib.sha1()
    hash_object.update(repr(ob).encode("utf-8"))
    return hash_object.hexdigest()


def parallelize(
    function, *iterables, wrap_tqdm=True, desc="", max_workers=None, **kwargs
):
    """parallelizes a function by calling it on the supplied iterables and (static) kwargs
       using a process pool. optionally wraps in tqdm for progress visualization

    Args:
        function (callable): function to apply to each tuple of items drawn from the iterables.
        wrap_tqdm (bool, optional): whether to display a tqdm progress bar. Defaults to True.
        desc (str, optional): description shown in the progress bar. Defaults to "".
        max_workers (int, optional): maximum number of worker processes. Defaults to None
            (one per CPU).

    Returns:
        a list (or, if wrap_tqdm is False, a lazy iterator) of results, in input order.
    """
    partialfn = partial(function, **kwargs)
    with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
        if wrap_tqdm:
            return [
                *tqdm(
                    executor.map(partialfn, *iterables),
                    total=len(iterables[0]),
                    desc="[parallelized] " + desc,
                )
            ]
        return executor.map(partialfn, *iterables)


# this might be data-dependent
def load_passage_labels(filename):
    """
    Given a .mat file, return an array with the passage number for each sentence
    """
    labelsPassages = sio.loadmat(filename)
    lP = labelsPassages["labelsPassageForEachSentence"]
    return lP.flatten()


# this might be data-dependent
def load_passage_categories(filename):
    """
    Given a .mat file, return the list of passage category labels
    """
    labelsPassages = sio.loadmat(filename)
    lP = labelsPassages["keyPassageCategory"]
    return list(np.hstack(lP[0]))


def get_passage_labels(f1g, lplst):
    """
    Given the passage number for each sentence, return the passage number for each word
    """
    lplst_word = []
    for i, sentence in enumerate(f1g):
        # repeat the sentence's passage number once per word
        lplst_word.extend([lplst[i]] * len(sentence))
    return lplst_word


# this might be data-dependent
def load_passage_category(filename):
    """
    Given a .mat file, return the category number for each passage
    """
    labelsPassageCategory = sio.loadmat(filename)
    lPC = labelsPassageCategory["labelsPassageCategory"]
    # flatten the nested row structure into a flat list of category numbers
    return list(chain.from_iterable(lPC.tolist()))


def merge_lists(list_a, list_b, feature=""):
    """Input: Two lists with potentially missing values.
    Return: If list 1 contains NA vals, the NA val is replaced by the value in list 2 (either numerical val or np.nan again)
    """
    merged = []

    for val1, val2 in zip(list_a, list_b):
        merged += [val2 if np.isnan(val1) else val1]

    return merged


def sizeof_fmt(num, suffix="B"):
    """
    This function can be used to print out how big a file is
    """
    """ by Fred Cirera,  https://stackoverflow.com/a/1094933/1870254, modified"""
    for unit in ["", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi"]:
        if abs(num) < 1024.0:
            return "%3.1f %s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f %s%s" % (num, "Yi", suffix)

Functions

def START_TIME()

Return the wall-clock time at which this module was imported.
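
A quick usage sketch (the elapsed-time computation is illustrative, not part of the module):

from time import time
from sentspace.utils.misc import START_TIME

elapsed = time() - START_TIME()  # seconds since sentspace.utils.misc was imported
print(f"running for {elapsed:.1f}s")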
def download_nltk_resources()

Download required NLTK data packages (tagger, wordnet, punkt) if they are not already present.
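
A typical call site, e.g. once at startup before any tokenization (the sentence below is illustrative):

import nltk
from sentspace.utils.misc import download_nltk_resources

download_nltk_resources()  # no-op if the tagger, wordnet, and punkt data already exist
tokens = nltk.word_tokenize("SentSpace computes sentence features.")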
def get_passage_labels(f1g, lplst)

Given the passage number for each sentence, return the passage number for each word.
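
A minimal sketch with made-up tokenized sentences (f1g) and per-sentence passage numbers (lplst):

from sentspace.utils.misc import get_passage_labels

f1g = [["the", "dog", "barked"], ["it", "rained"]]  # two tokenized sentences
lplst = [7, 8]                                      # passage number of each sentence
get_passage_labels(f1g, lplst)                      # -> [7, 7, 7, 8, 8]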
def load_passage_categories(filename)

Given a .mat file, return the list of passage category labels.
def load_passage_category(filename)

Given a .mat file, return the category number for each passage.
def load_passage_labels(filename)

Given a .mat file, return an array with the passage number for each sentence.
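
The three loaders above expect specific keys in the .mat file ("labelsPassageForEachSentence", "keyPassageCategory", "labelsPassageCategory"). A round-trip sketch with a synthetic file; the filename and data are hypothetical:

import numpy as np
import scipy.io as sio
from sentspace.utils.misc import (
    load_passage_labels, load_passage_categories, load_passage_category,
)

sio.savemat("labels.mat", {
    "labelsPassageForEachSentence": np.array([[1], [1], [2]]),   # 3 sentences in 2 passages
    "keyPassageCategory": np.array([["news", "fiction"]], dtype=object),
    "labelsPassageCategory": np.array([[1, 2]]),                 # category no. of each passage
})

load_passage_labels("labels.mat")      # -> array([1, 1, 2])
load_passage_categories("labels.mat")  # -> category labels, e.g. ['news', 'fiction']
load_passage_category("labels.mat")    # -> [1, 2]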
def md5(fname_or_raw, raw=False) -> str

Return the md5 checksum (hex digest) of a file's contents, or of a raw string if raw=True.
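
Usage sketch (the file path is hypothetical):

from sentspace.utils.misc import md5

md5("hello world", raw=True)  # '5eb63bbbe01eeed093cb22bb8f5acdc3'
md5("/path/to/corpus.txt")    # checksum of the file's contents, read in chunks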
def merge_lists(list_a, list_b, feature='')

Merge two equal-length lists element-wise: wherever list_a has a NaN, take the corresponding value from list_b instead (which may itself be NaN). The feature argument is currently unused.
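
For example, filling gaps in one feature column from a fallback source:

import numpy as np
from sentspace.utils.misc import merge_lists

a = [1.0, np.nan, 3.0]
b = [9.0, 2.0, np.nan]
merge_lists(a, b)  # -> [1.0, 2.0, 3.0]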
def parallelize(function, *iterables, wrap_tqdm=True, desc='', max_workers=None, **kwargs)

parallelizes a function by calling it on the supplied iterables and (static) kwargs using a process pool. optionally wraps in tqdm for progress visualization

Args

function : callable
function to apply to each tuple of items drawn from the iterables.
wrap_tqdm : bool, optional
whether to display a tqdm progress bar. Defaults to True.
desc : str, optional
description shown in the progress bar. Defaults to "".
max_workers : int, optional
maximum number of worker processes. Defaults to None (one per CPU).

Returns

A list (or, if wrap_tqdm is False, a lazy iterator) of results, in input order.
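
A minimal sketch; scale is a stand-in for any picklable, module-level function (process pools cannot run lambdas or locally defined functions), and the __main__ guard matters on platforms that spawn worker processes:

from sentspace.utils.misc import parallelize

def scale(x, factor=1):
    return x * factor

if __name__ == "__main__":
    results = parallelize(scale, range(10), desc="scaling", factor=3)
    # results == [0, 3, 6, ..., 27]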
def sha1(ob)

Return the sha1 hex digest of repr(ob).
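
Because it hashes repr(ob), sha1 accepts any object with a stable repr, which makes it handy as a cache key (the dict below is illustrative):

from sentspace.utils.misc import sha1

key = sha1({"feature": "lexical", "version": 2})  # deterministic hex digest of the dict's repr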
def sizeof_fmt(num, suffix='B')

Format a size in bytes as a human-readable string, e.g. for printing how big a file is. By Fred Cirera, https://stackoverflow.com/a/1094933/1870254, modified.