Module sentspace.syntax.utils
Expand source code
import json
import os
import sentspace
import subprocess
import time
import urllib
from pathlib import Path
from urllib import request
import http
from nltk.corpus import wordnet
from sentspace.utils import caching, io, text
def path_decorator(func):
    """Decorator that changes to and from the directory containing scripts
    for running DLT and Left Corner metrics before and after a function
    call respectively.

    The decorated function is expected to return `bytes` (e.g. the captured
    output of a subprocess); the wrapper decodes them as UTF-8 and strips
    surrounding whitespace. If the decorated function instead returns an
    `Exception` instance (a deferred error to be raised by the caller later),
    that instance is handed back unchanged.
    """
    # local import so this block does not depend on the file-level imports
    import functools

    @functools.wraps(func)  # keep the decorated function's name and docstring
    def wrapped(*args, **kwargs):
        ''' function that changes the directory to an expected directory,
        executes original function with supplied args, and changes
        back to the same directory we started from
        '''
        previous_pwd = os.getcwd()
        os.chdir(str(Path(__file__).parent))
        try:
            result = func(*args, **kwargs)
        finally:
            # restore the caller's working directory even if `func` raises
            os.chdir(previous_pwd)
        if isinstance(result, Exception):
            # deferred error object: it has no `.decode`, so pass it through
            return result
        return result.decode('utf-8').strip()
    return wrapped
def get_content_ratio(is_content_pos_tag: tuple) -> float:
    """
    given boolean list corresponding to a token being a content word,
    calculate the content ratio

    Returns the sum of the flags divided by their count. An empty input
    yields 0.0 instead of raising ZeroDivisionError.
    """
    n_tokens = len(is_content_pos_tag)
    if n_tokens == 0:
        # no tokens at all: define the ratio as 0.0 rather than dividing by zero
        return 0.0
    return sum(is_content_pos_tag) / n_tokens
def get_pronoun_ratio(pos_tags: tuple) -> float:
    """
    Given the POS tags of a sentence, calculate the pronoun ratio

    A tag counts as a pronoun when it is one of 'PRP', 'PRP$', 'WP', 'WP$'.
    Returns the number of pronoun tags divided by the number of tags. An
    empty input yields 0.0 instead of raising ZeroDivisionError.
    """
    n_tags = len(pos_tags)
    if n_tags == 0:
        # no tags at all: define the ratio as 0.0 rather than dividing by zero
        return 0.0
    pronoun_tags = {'PRP', 'PRP$', 'WP', 'WP$'}
    return sum(tag in pronoun_tags for tag in pos_tags) / n_tags
# @cache_to_mem
def get_is_content(taglst: tuple, content_pos=(wordnet.ADJ, wordnet.VERB, wordnet.NOUN, wordnet.ADV)):
    """
    Flag each POS tag as content word (1) or not (0).

    Every tag in `taglst` is mapped through `text.get_wordnet_pos`; the flag
    is 1 when the mapped value is a member of `content_pos`, otherwise 0.
    The flags are returned as a tuple in the same order as `taglst`.
    """
    flags = []
    for pos_tag in taglst:
        mapped = text.get_wordnet_pos(pos_tag)
        flags.append(1 if mapped in content_pos else 0)
    return tuple(flags)
@path_decorator
def tokenize(raw):
    """Run `tokenize.sh` through bash on the stripped input string.

    Returns whatever `subprocess.check_output` captures from the script;
    a non-zero exit status makes `check_output` raise CalledProcessError.
    """
    stripped = raw.strip()
    return subprocess.check_output(['bash', 'tokenize.sh', stripped])
@path_decorator
# @caching.cache_to_disk
def compute_trees(sentence, server_url='http://localhost:8000/fullberk', retries=3, retry_delay=6):
    """Request a parse for `sentence` from the parser server and post-process it.

    The sentence is sent as a JSON body (`{"sentence": ...}`) to `server_url`.
    On HTTPError, URLError or RemoteDisconnected the request is retried, up
    to `retries` attempts in total, waiting `retry_delay` seconds between
    attempts. The raw response is then passed as an argument to
    `postprocess_trees.sh`.

    Returns:
        The output captured from `postprocess_trees.sh`, or, when every
        attempt failed, a RuntimeError *instance* (returned, not raised).
    """
    payload = bytes(json.dumps({"sentence": sentence}), 'utf-8')
    r = request.Request(server_url, data=payload, method='GET',
                        headers={'Content-Type': 'application/json'})

    for attempt in range(1, retries + 1):
        try:
            with request.urlopen(r) as rq:
                response = rq.read()
        # HTTPError subclasses URLError, so it has to be matched first
        except urllib.error.HTTPError as e:
            kind, detail = 'HTTPError', e
            hint = '; are you sure the server is running? https://github.com/aalok-sathe/berkeley-interact for instructions'
        except urllib.error.URLError as e:
            kind, detail, hint = 'URLError', e, ''
        except http.client.RemoteDisconnected as e:
            kind, detail, hint = 'http.client.RemoteDisconnected', e, ''
        else:
            break  # got a response; stop retrying

        io.log(f'encountered {kind} on sentence [{sentence}]; attempt {attempt}/{retries}{hint}',
               type='ERR')
        print(detail)
        if attempt < retries:
            # nothing follows the final attempt, so only wait between attempts
            time.sleep(retry_delay)
    else:
        # loop finished without `break`: no attempt succeeded
        return RuntimeError(f'failed to process [{sentence}] after {retries} retries')

    cmd = ['bash', 'postprocess_trees.sh', response]
    # fallback to manually initializing parser
    # cmd = ['bash', 'parse_trees.sh', tokens]
    return subprocess.check_output(cmd)
@path_decorator
def compute_feature(feature, trees):
    """Run the script `feature` through bash with `trees` as its argument.

    Returns:
        The script's captured stdout on success. If the script exits with a
        non-zero status, the failure is printed and a RuntimeError *instance*
        wrapping the captured stdout is returned (not raised).
    """
    cmd = ['bash', feature, trees]
    try:
        completed = subprocess.run(cmd, check=True, capture_output=True)
    except subprocess.CalledProcessError as e:
        # capture_output=True also swallows stderr; print it too so the
        # script's diagnostics are not lost
        print('ERROR', e.output, e.stderr, e.returncode, sep='\n')
        # NOTE we are NOT raising an error here as that messes up the completion of @path_decorator
        # instead we will *return* the Exception instance to be raised at another time
        return RuntimeError(e.output)
    return completed.stdout
Functions
def compute_feature(*args, **kwargs)
-
function that changes the directory to an expected directory, executes original function with supplied args, and changes back to the same directory we started from
Expand source code
def wrapped(*args, **kwargs):
    ''' function that changes the directory to an expected directory,
    executes original function with supplied args, and changes
    back to the same directory we started from
    '''
    previous_pwd = os.getcwd()
    os.chdir(str(Path(__file__).parent))
    try:
        result = func(*args, **kwargs)
    finally:
        # restore the caller's working directory even if `func` raises
        os.chdir(previous_pwd)
    if isinstance(result, Exception):
        # deferred error object: it has no `.decode`, so pass it through
        return result
    return result.decode('utf-8').strip()
def compute_trees(*args, **kwargs)
-
function that changes the directory to an expected directory, executes original function with supplied args, and changes back to the same directory we started from
Expand source code
def wrapped(*args, **kwargs):
    ''' function that changes the directory to an expected directory,
    executes original function with supplied args, and changes
    back to the same directory we started from
    '''
    previous_pwd = os.getcwd()
    os.chdir(str(Path(__file__).parent))
    try:
        result = func(*args, **kwargs)
    finally:
        # restore the caller's working directory even if `func` raises
        os.chdir(previous_pwd)
    if isinstance(result, Exception):
        # deferred error object: it has no `.decode`, so pass it through
        return result
    return result.decode('utf-8').strip()
def get_content_ratio(is_content_pos_tag: tuple)
-
given boolean list corresponding to a token being a content word, calculate the content ratio
Expand source code
def get_content_ratio(is_content_pos_tag: tuple) -> float:
    """
    given boolean list corresponding to a token being a content word,
    calculate the content ratio

    Returns the sum of the flags divided by their count. An empty input
    yields 0.0 instead of raising ZeroDivisionError.
    """
    n_tokens = len(is_content_pos_tag)
    if n_tokens == 0:
        # no tokens at all: define the ratio as 0.0 rather than dividing by zero
        return 0.0
    return sum(is_content_pos_tag) / n_tokens
def get_is_content(taglst: tuple, content_pos=('a', 'v', 'n', 'r'))
-
Given list of POS tags, return list of 1 - content word, 0 - not content word
Expand source code
def get_is_content(taglst: tuple, content_pos=(wordnet.ADJ, wordnet.VERB, wordnet.NOUN, wordnet.ADV)):
    """
    Flag each POS tag as content word (1) or not (0).

    Every tag in `taglst` is mapped through `text.get_wordnet_pos`; the flag
    is 1 when the mapped value is a member of `content_pos`, otherwise 0.
    The flags are returned as a tuple in the same order as `taglst`.
    """
    flags = []
    for pos_tag in taglst:
        mapped = text.get_wordnet_pos(pos_tag)
        flags.append(1 if mapped in content_pos else 0)
    return tuple(flags)
def get_pronoun_ratio(pos_tags: tuple)
-
Given sentence calculate the pronoun ratio
Expand source code
def get_pronoun_ratio(pos_tags: tuple) -> float:
    """
    Given the POS tags of a sentence, calculate the pronoun ratio

    A tag counts as a pronoun when it is one of 'PRP', 'PRP$', 'WP', 'WP$'.
    Returns the number of pronoun tags divided by the number of tags. An
    empty input yields 0.0 instead of raising ZeroDivisionError.
    """
    n_tags = len(pos_tags)
    if n_tags == 0:
        # no tags at all: define the ratio as 0.0 rather than dividing by zero
        return 0.0
    pronoun_tags = {'PRP', 'PRP$', 'WP', 'WP$'}
    return sum(tag in pronoun_tags for tag in pos_tags) / n_tags
def path_decorator(func)
-
Decorator that changes to and from the directory containing scripts for running DLT and Left Corner metrics before and after a function call respectively.
Expand source code
def path_decorator(func):
    """Decorator that changes to and from the directory containing scripts
    for running DLT and Left Corner metrics before and after a function
    call respectively.

    The decorated function is expected to return `bytes` (e.g. the captured
    output of a subprocess); the wrapper decodes them as UTF-8 and strips
    surrounding whitespace. If the decorated function instead returns an
    `Exception` instance (a deferred error to be raised by the caller later),
    that instance is handed back unchanged.
    """
    # local import so this block does not depend on the file-level imports
    import functools

    @functools.wraps(func)  # keep the decorated function's name and docstring
    def wrapped(*args, **kwargs):
        ''' function that changes the directory to an expected directory,
        executes original function with supplied args, and changes
        back to the same directory we started from
        '''
        previous_pwd = os.getcwd()
        os.chdir(str(Path(__file__).parent))
        try:
            result = func(*args, **kwargs)
        finally:
            # restore the caller's working directory even if `func` raises
            os.chdir(previous_pwd)
        if isinstance(result, Exception):
            # deferred error object: it has no `.decode`, so pass it through
            return result
        return result.decode('utf-8').strip()
    return wrapped
def tokenize(*args, **kwargs)
-
function that changes the directory to an expected directory, executes original function with supplied args, and changes back to the same directory we started from
Expand source code
def wrapped(*args, **kwargs):
    ''' function that changes the directory to an expected directory,
    executes original function with supplied args, and changes
    back to the same directory we started from
    '''
    previous_pwd = os.getcwd()
    os.chdir(str(Path(__file__).parent))
    try:
        result = func(*args, **kwargs)
    finally:
        # restore the caller's working directory even if `func` raises
        os.chdir(previous_pwd)
    if isinstance(result, Exception):
        # deferred error object: it has no `.decode`, so pass it through
        return result
    return result.decode('utf-8').strip()