Module sentspace.lexical.utils
Expand source code
import os
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
from scipy.stats import percentileofscore

import sentspace.utils
from sentspace.utils import io, text
from sentspace.utils.caching import cache_to_mem  # , cache_to_disk
from sentspace.utils.misc import merge_lists
# --------- Lexical features
# list of feature names accepted by load_databases(...)
# @cache_to_mem
def get_feature_list():
return [
"NRC_Arousal", # (Mohammad 2018)
"NRC_Valence", # (Mohammad 2018)
"NRC_Dominance", # (Mohammad 2018)
"OSC", # Orthographic-semantics consistency (Marelli & Amenta, 2018)
"aoa", # Age of Acquisition (Kuperman et al., 2012)
"concreteness", # (Brysbaert et al., 2014)
"lexical_decision_RT", # (Balota et al., 2007)
"log_contextual_diversity",
"log_lexical_frequency",
"n_orthographic_neighbors",
"num_morpheme",
"prevalence",
"surprisal-1",
"surprisal-2",
"surprisal-3",
"surprisal-4",
"total_degree_centrality",
"imageability", # Glasgow norms (Scott et al, 2019) https://link.springer.com/article/10.3758/s13428-018-1099-3#Sec1
"body-object-interaction", # (Pexman et al, 2019) https://link.springer.com/article/10.3758/s13428-018-1171-z#Sec9
"zipf", # SUBTLEXus
"socialness", # (Diveica et al., 2022) https://link.springer.com/article/10.3758/s13428-022-01810-x#Sec15
# Lancaster norms (Lynott et al, 2020) https://link.springer.com/article/10.3758/s13428-019-01316-z#Bib1
"Auditory",
"Gustatory",
"Interoceptive",
"Haptic",
"Olfactory",
"Visual",
"Foot_leg",
"Hand_arm",
"Head",
"Mouth",
"Torso",
]
def get_feature_list_using_third_party_libraries():
return ["polysemy", "num_morpheme_poly"]
def get_feature_list_requiring_calculation():
return ["PMI"]
def get_user_contributed_features():
return []
def get_all_features(sentence: "sentspace.Sentence.Sentence", databases):
"""
    Given a sentence, return a dict mapping each feature name to a list of per-token feature values
"""
result = {}
for feature in (
get_feature_list()
+ get_feature_list_using_third_party_libraries()
+ get_user_contributed_features()
):
# we don't want to compute num_morpheme using the dictionary DB by default.
# we want to do it only if the polyglot library is unavailable.
if feature == "num_morpheme":
continue
computed_feature = get_feature(sentence, feature, databases)
        # even though we compute "num_morpheme_poly", we want to record it under the
        # plain name "num_morpheme" when the polyglot library (the source of the
        # _poly suffix) is unavailable and get_feature fell back to the dictionary DB
        if feature == "num_morpheme_poly":
try:
import polyglot
except ImportError:
feature = "num_morpheme"
result[feature] = computed_feature
return result
def get_feature(sentence: "sentspace.Sentence.Sentence", feature, databases={}):
"""
get specific `feature` for the tokens in `sentence`; fall back to using `lemmas` if necessary
"""
def get_feature_(token, lemma, feature):
"""given a `word` and a feature to extract, returns the value of that
feature for the `word` using available databases
Args:
word (str): the token (word) to extract a feature for
feature (str): name identifier of the feature acc to predefined convention
databases (dict, in-scope): dictionary of feature --> (word --> feature_value) dictionaries.
Defaults to {}.
Returns:
Any: feature value
"""
# if the feature is from a database we have on disk
# database can be a dictionary or an object that implements
# get(key, default)
if feature in get_feature_list():
feature_dict = databases[feature]
try:
return feature_dict[token]
except KeyError as e:
try:
return feature_dict[lemma]
except KeyError as e_:
return np.nan
# Other databases we use from libraries we load such as NLTK-Wordnet and Polyglot
elif feature in get_feature_list_using_third_party_libraries():
if feature == "polysemy":
from nltk.corpus import wordnet
# first try it with the token itself
if synsets := wordnet.synsets(token):
return len(synsets) # TODO does a word's synset include itself?
# if token is OOV, try again with the lemma
elif synsets := wordnet.synsets(lemma):
return len(synsets)
# otherwise the len of its synset is 1 (itself)
return 1
elif feature == "num_morpheme_poly":
try:
from polyglot.text import Word
# try first to obtain # morphemes of the token
if morphed := Word(token, language="en").morphemes:
return len(morphed)
# otherwise, try using the lemmatized form
elif morphed := Word(lemma, language="en").morphemes:
return len(morphed)
                    # if both token and lemma are OOV, return 1 (i.e., treat the full word
                    # as a single morpheme). note: this only means we failed to analyze its
                    # morphology, not necessarily that it *is* a single morpheme
                    return 1  # alternatively: np.nan
except ImportError as e:
# fall back to simply using a dictionary-based feature
# TODO make a note of this somewhere
io.log(e.msg, type="WARN")
return get_feature_(token, lemma, "num_morpheme")
else:
raise ValueError(f"unable to compute unknown feature `{feature}`")
features_list = []
for token, lemma in zip(sentence.tokens, sentence.lemmas):
features_list += [get_feature_(token, lemma, feature)]
return features_list
def return_percentile_df(bench_df, usr_df):
    """For each user sentence, compute the percentile of every feature value
    relative to the benchmark distribution in `bench_df`."""
    rows = []
    for _, row in usr_df.iterrows():
        temp_row = {}
        # iterate through the features
        for col in usr_df.columns:
            if col == "Sentence no.":
                temp_row[col] = row[col]
                continue
            temp_row[col] = percentileofscore(bench_df[col], row[col])
        rows.append(temp_row)
    # build the percentile dataframe in one pass (DataFrame.append is deprecated)
    perc_df = pd.DataFrame(rows, columns=usr_df.columns)
    perc_df = perc_df.drop(columns=["Sentence no."])
    return perc_df
@cache_to_mem
def load_databases(
features="all",
path="~/.cache/sentspace/",
ignore_case=True,
):
"""
    Load dicts mapping word to feature value.
    To load only some features, pass them as a list of feature names (default: "all").
"""
path = str(Path(path).expanduser().resolve()) + "/"
io.log("loading databases with all features")
databases = {}
if features == "all":
features = get_feature_list()
for feature in features:
if not os.path.exists(path + feature + ".pkl"):
sentspace.utils.s3.load_feature(key=feature + ".pkl", root_dir=path)
with open(path + feature + ".pkl", "rb") as f:
d = pickle.load(f)
# if ignore_case: # add lowercase version to feature database
# for key, val in d.copy().items():
# d[str(key).lower()] = val
databases[feature] = d
sanity_check_databases(databases)
io.log("---done--- loading databases with all features")
return databases
def sanity_check_databases(databases):
"""
perform sanity checks upon loading various datasets
to ensure validity of the loaded data
"""
assert databases["NRC_Arousal"]["happy"] == 0.735
assert databases["NRC_Valence"]["happy"] == 1
assert databases["OSC"]["happy"] == 0.951549893181384
assert abs(databases["aoa"]["a"] - 2.893384) < 1e-4
assert databases["concreteness"]["roadsweeper"] == 4.85
# assert abs(databases['imag']['abbey'] - 5.344) < 1e-4
assert databases["total_degree_centrality"]["a"] == 30
assert databases["lexical_decision_RT"]["a"] == 798.917
assert abs(databases["log_contextual_diversity"]["a"] - 3.9234) < 1e-4
assert abs(databases["log_lexical_frequency"]["a"] - 6.0175) < 1e-4
assert databases["n_orthographic_neighbors"]["a"] == 950.59
assert databases["num_morpheme"]["abbreviated"] == 4
assert abs(databases["prevalence"]["a"] - 1.917) < 1e-3
assert databases["surprisal-3"]["beekeeping"] == 10.258
Functions
def get_all_features(sentence: Sentence, databases)
-
Given a sentence, return a dict mapping each feature name to a list of per-token feature values
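A minimal usage sketch (not part of the module): get_all_features only reads sentence.tokens and sentence.lemmas, so any object exposing those attributes works for illustration, though in real use this would be a sentspace.Sentence.Sentence. It also assumes the feature pickles are cached under ~/.cache/sentspace/ (or fetchable via the package's S3 helper) and that the NLTK WordNet corpus is available for the polysemy feature.

from types import SimpleNamespace
from sentspace.lexical.utils import load_databases, get_all_features

# stand-in for a parsed sentence; the tokens/lemmas here are hypothetical
sentence = SimpleNamespace(tokens=["the", "dogs", "ran"], lemmas=["the", "dog", "run"])

databases = load_databases(features="all")
features = get_all_features(sentence, databases)
# `features` maps each feature name to a list of per-token values,
# e.g. features["NRC_Valence"] has one entry per token (np.nan for OOV tokens)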
def get_feature(sentence: Sentence, feature, databases={})
-
get specific `feature` for the tokens in `sentence`; fall back to using `lemmas` if necessary
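A hedged example of the lookup order (token first, then lemma, then np.nan), under the same assumptions as the get_all_features example above:

from types import SimpleNamespace
from sentspace.lexical.utils import load_databases, get_feature

sentence = SimpleNamespace(tokens=["happy", "dogs"], lemmas=["happy", "dog"])
databases = load_databases(features="all")

get_feature(sentence, "NRC_Arousal", databases)
# -> [0.735, ...]; the second value comes from "dogs" or its lemma "dog",
#    and would be np.nan if both were out of vocabulary

get_feature(sentence, "polysemy")
# third-party feature: WordNet synset count per token, no database dict needed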
def get_feature_list()
-
def get_feature_list_requiring_calculation()
-
def get_feature_list_using_third_party_libraries()
-
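For orientation, a rough sketch of what these two features are built on; it assumes the NLTK WordNet corpus and polyglot's English morphology model have already been downloaded.

# "polysemy": number of WordNet synsets for a word
from nltk.corpus import wordnet
len(wordnet.synsets("bank"))

# "num_morpheme_poly": number of morpheme segments found by polyglot's analyzer
from polyglot.text import Word
len(Word("unhappiness", language="en").morphemes)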
def get_user_contributed_features()
-
def return_percentile_df(bench_df, usr_df)
-
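A small illustrative call with made-up data (the feature columns below are hypothetical); both frames need matching columns and a "Sentence no." column:

import numpy as np
import pandas as pd
from sentspace.lexical.utils import return_percentile_df

bench_df = pd.DataFrame({"Sentence no.": range(100),
                         "aoa": np.random.uniform(2, 12, size=100),
                         "zipf": np.random.uniform(1, 7, size=100)})
usr_df = pd.DataFrame({"Sentence no.": [1, 2],
                       "aoa": [5.1, 9.7],
                       "zipf": [4.2, 2.8]})

perc_df = return_percentile_df(bench_df, usr_df)
# each cell is the percentile of the user value within the benchmark
# distribution of that feature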
def sanity_check_databases(databases)
-
perform sanity checks upon loading various datasets to ensure validity of the loaded data
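Finally, a hedged end-to-end sketch of loading the databases directly: load_databases caches the pickles under ~/.cache/sentspace/ by default, fetching any missing ones via the package's S3 helper, and runs sanity_check_databases on the result.

from sentspace.lexical.utils import load_databases

databases = load_databases(features="all")        # memoized via @cache_to_mem
databases["concreteness"].get("roadsweeper")      # 4.85 (see the sanity checks above)
databases["aoa"].get("notaword", float("nan"))    # each database acts as a word -> value mapping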