Module sentspace.syntax
Expand source code
from collections import defaultdict
import os
import sentspace
from pathlib import Path
import pandas as pd
from nltk.tree import ParentedTree
from sentspace.syntax import utils
from sentspace.syntax.features import DLT, Feature, LeftCorner, Tree
from sentspace.utils import io, text
from sentspace.utils.caching import cache_to_disk
__pdoc__ = {
"compute_tree_dlt_left_corner": False,
"utils.calcEmbd": False,
"utils.calcDLT": False,
"utils.printlemmas": False,
"utils.tree": False,
}
os.environ["PERL_BADLANG"] = "0"
def get_features(
sentence: sentspace.Sentence.Sentence,
# identifier: str = None,
dlt: bool = True,
left_corner: bool = True,
syntax_port: int = 8000,
) -> dict:
"""Obtains contextual/syntactic features for `sentence`
Args:
sentence (`sentspace.Sentence.Sentence`): a single instance of Sentence to compute features for.
dlt (bool, optional): whether to calculate Syntactic Integration related Dependency
Lexicality Theory (DLT) features [False].
left_corner (bool, optional): whether to calculate embedding depth and similar related
Left Corner features [False].
Returns:
sentspace.syntax.features.Feature: a Feature instance with appropriate attributes
"""
# if the "sentence" actually consists of multiple sentences (as determined by the
# NLTK Punkt Sentence Tokenizer), we want to repeat the below block per sentence and
# then pool across sentences
from nltk.tokenize import sent_tokenize
stripped = "".join([i if ord(i) < 128 else "" for i in str(sentence)])
sentences = sent_tokenize(stripped, language="english")
features_to_pool = defaultdict(list)
features = None
dlt_concat, left_corner_concat = None, None
for i, sub_sentence in enumerate(sentences):
features = Feature()
if dlt or left_corner:
# io.log(f'parsing into syntax tree: `{sentence}`')
# parsed = parse_input(sentence)
            try:
                server_url = f"http://localhost:{syntax_port}/fullberk"
                parsed = utils.compute_trees(sub_sentence, server_url=server_url)
                # compute_trees signals failure by returning a RuntimeError instance
                # rather than raising it, so surface it here before wrapping in Tree
                if isinstance(parsed, RuntimeError):
                    raise parsed
                features.tree = Tree(parsed)
                # raises AttributeError early if the parse produced no usable tree
                getattr(features.tree, "raw")
# print(parse_input(sentence), features.tree)
if dlt and features.tree.raw is not None:
# io.log(f'computing DLT feature')
dlt_stdout = utils.compute_feature("dlt.sh", features.tree.raw)
                    if isinstance(dlt_stdout, RuntimeError):
raise dlt_stdout
else:
features.dlt = DLT(dlt_stdout, sub_sentence, sentence.uid)
# io.log(f'--- done: DLT')
if left_corner and features.tree.raw is not None:
# io.log(f'computing left corner feature')
left_corner_stdout = utils.compute_feature(
"leftcorner.sh", features.tree.raw
)
                    if isinstance(left_corner_stdout, RuntimeError):
raise left_corner_stdout
else:
features.left_corner = LeftCorner(
left_corner_stdout, sub_sentence, sentence.uid
)
# io.log(f'--- done: left corner')
features_to_pool["dlt"] += [features.dlt]
features_to_pool["left_corner"] += [features.left_corner]
except AttributeError as ae:
import traceback
io.log(
f"FAILED: AttributeError while processing "
f"Tree [{features.tree}] features for chunk [{sub_sentence}] of sentence [{sentence}] "
f"traceback: {traceback.format_exc()}",
type="ERR",
)
# for attr in ['dlt', 'left_corner', 'tree']:
# io.log(f'hasattr(features, {attr}): {hasattr(features, attr)}', type='ERR')
# io.log(f'hasattr(features.tree, raw): {hasattr(features.tree, "raw")}', type='ERR')
pass
except RuntimeError:
io.log(
f"FAILED: RuntimeError to process Tree features for chunk [{sub_sentence}] of sentence [{sentence}]",
type="ERR",
)
pass
# do groupby index and mean() here to merge all features for the same sentence into one
# row and then carry on (because we first split them into sub-parts based on punctuation)
if dlt:
try:
dlt_concat = pd.concat(features_to_pool["dlt"], axis="index")
dlt_concat = dlt_concat.groupby("index").mean()
dlt_concat["sentence"] = str(sentence)
except ValueError:
dlt_concat = pd.DataFrame()
else:
dlt_concat = None
if left_corner:
try:
left_corner_concat = pd.concat(
features_to_pool["left_corner"], axis="index"
)
left_corner_concat = left_corner_concat.groupby("index").mean()
left_corner_concat["sentence"] = str(sentence)
except ValueError:
left_corner_concat = pd.DataFrame()
else:
left_corner_concat = None
# tokenized = utils.tokenize(sub_sentence).split()
# tagged_sentence = text.get_pos_tags(tokenized)
is_content_word = utils.get_is_content(
sentence.pos_tags, content_pos=text.pos_for_content
) # content or function word
pronoun_ratio = utils.get_pronoun_ratio(sentence.pos_tags)
content_ratio = utils.get_content_ratio(is_content_word)
return {
"index": sentence.uid,
"sentence": str(sentence),
"pronoun_ratio": pronoun_ratio,
"content_ratio": content_ratio,
# 'tree': features.tree
"dlt": dlt_concat,
"leftcorner": left_corner_concat,
}
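The per-sub-sentence feature tables are pooled by concatenating them and averaging rows that share the same token index, as the groupby/mean step above does. Below is a minimal standalone sketch of that pandas pattern, using toy data and an illustrative column name rather than the library's actual feature columns:

import pandas as pd

# toy per-sub-sentence frames; "dlt_cost" is an illustrative column name
sub1 = pd.DataFrame({"index": [0, 0, 1], "dlt_cost": [1.0, 3.0, 2.0]})
sub2 = pd.DataFrame({"index": [0, 1, 1], "dlt_cost": [2.0, 4.0, 6.0]})

pooled = pd.concat([sub1, sub2], axis="index")  # stack rows from every sub-sentence
pooled = pooled.groupby("index").mean()         # average rows that share an index
pooled["sentence"] = "the original full sentence"
print(pooled)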
Sub-modules
sentspace.syntax.features
sentspace.syntax.utils
Functions
def get_features(sentence: Sentence, dlt: bool = True, left_corner: bool = True, syntax_port: int = 8000) ‑> dict
Obtains contextual/syntactic features for `sentence`.

Args
    sentence (Sentence): a single instance of Sentence to compute features for.
    dlt (bool, optional): whether to calculate Syntactic-Integration-related Dependency Locality Theory (DLT) features [True].
    left_corner (bool, optional): whether to calculate embedding depth and related Left Corner features [True].

Returns
    dict: a dictionary with keys 'index', 'sentence', 'pronoun_ratio', 'content_ratio', 'dlt', and 'leftcorner'.
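A hypothetical usage sketch. The Sentence constructor shown here is an assumption (check sentspace.Sentence for the actual signature), and the call presumes the parse server backing the /fullberk endpoint is already running locally on the default port 8000:

import sentspace
from sentspace.syntax import get_features

# assumed constructor arguments; the uid indexes the returned feature tables
sent = sentspace.Sentence.Sentence("The editor fixed the draft quickly.", uid="ex-001")

features = get_features(sent, dlt=True, left_corner=True, syntax_port=8000)
print(features["pronoun_ratio"], features["content_ratio"])
print(features["dlt"])         # pooled DLT features (a DataFrame, or None/empty on failure)
print(features["leftcorner"])  # pooled left-corner features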