import re
from lxml import etree
def strip_punct(text: str) -> str:
    """Strip punctuation from a given string.

    Adapted from the nltk Penn Treebank tokenizer.

    :param text: The raw string
    :return: The stripped string
    """
    # starting quotes
    text = re.sub(r"^[\"\']", r"", text)
    text = re.sub(r"(``)", r"", text)
    # keep the captured leading char (space/bracket); only drop the quote
    text = re.sub(r'([ (\[{<])"', r"\1", text)
    # punctuation
    text = re.sub(r"\.\.\.", r"", text)
    text = re.sub(r"[,;:@#$%&]", r"", text)
    # sentence-final period (plus trailing closers): keep the char before it
    text = re.sub(r'([^\.])(\.)([\]\)}>"\']*)\s*$', r"\1", text)
    text = re.sub(r"[?!]", r"", text)
    # keep the captured char and the space; only drop the apostrophe
    text = re.sub(r"([^'])' ", r"\1 ", text)
    # parens, brackets, etc.
    text = re.sub(r"[\]\[\(\)\{\}\<\>]", r"", text)
    text = re.sub(r"--", r"", text)
    # ending quotes
    text = re.sub(r'"', "", text)
    text = re.sub(r"(\S)(\'\'?)", r"\1", text)
    return text.strip()
def is_balanced_html(text: str) -> bool:
    """Return False if text contains un-balanced HTML, otherwise True."""
    # fast check for strings without angle brackets
    if not ("<" in text or ">" in text):
        return True
    # lxml will throw an error while parsing if the string is unbalanced.
    # Wrap in a synthetic root element so that plain text and fragments
    # with multiple top-level tags still form a single valid XML document.
    try:
        etree.fromstring(f"<div>{text}</div>")
        return True
    except etree.XMLSyntaxError:
        return False
def wrap_html_tags(text: str, before: str, after: str):
    """Surround every HTML tag found in text with the given strings.

    :param text: The string to scan for tags
    :param before: Text inserted immediately before each tag
    :param after: Text inserted immediately after each tag
    :return: The transformed string
    """
    tag_pattern = re.compile(r"(<[^>]+>)")
    replacement = before + r"\1" + after
    return tag_pattern.sub(replacement, text)
def hyperscan_match(regexes, text):
    """Run regexes on text using hyperscan, for debugging.

    :param regexes: Iterable of regex pattern strings
    :param text: The string to scan
    :return: List of (index, start, end, flags, context) match tuples
    """
    # import here so the dependency is optional
    import hyperscan  # pylint: disable=import-outside-toplevel

    encoded_patterns = [pattern.encode("utf8") for pattern in regexes]
    # request start-of-match reporting for every pattern
    pattern_flags = [hyperscan.HS_FLAG_SOM_LEFTMOST for _ in encoded_patterns]
    db = hyperscan.Database()
    db.compile(expressions=encoded_patterns, flags=pattern_flags)

    found = []

    def on_match(index, start, end, flags, context):
        found.append((index, start, end, flags, context))

    db.scan(text.encode("utf8"), on_match)
    return found
class HashableDict(dict):
    """Dict that works as an attribute of a hashable dataclass."""

    def __hash__(self):
        # dict is unhashable by default; hash an order-independent
        # snapshot of the items instead
        items_snapshot = frozenset(self.items())
        return hash(items_snapshot)
def dump_citations(citations, text, context_chars=30):
    """Render citations extracted from text as a colorized debug string.

    Each citation is shown on one line with surrounding context and the
    matched span highlighted, followed by its non-empty dump() fields.

    :param citations: Iterable of citation objects exposing span() and dump()
    :param text: The source text the citations were extracted from
    :param context_chars: How many characters of context to show on each side
    :return: The formatted multi-line string

    Example:
    >>> text = "blah. Foo v. Bar, 1 U.S. 1, 2 (1999). blah"
    >>> dump_citations(get_citations(text), text)
    blah. Foo v. Bar, 1 U.S. 1, 2 (1999). blah
    * FullCaseCitation
    * reporter_found='U.S.'
    * pin_cite='2'
    * groups={'volume': '1', 'reporter': 'U.S.', 'page': '1'}
    * ...
    """
    # ANSI escapes for terminal highlighting
    GREEN, BLUE, BOLD, RESET = "\x1B[32m", "\x1B[94m", "\x1B[1m", "\x1B[0m"
    lines = []
    for citation in citations:
        start, end = citation.span()
        # context is clipped to the citation's own line on each side
        before = text[max(0, start - context_chars) : start]
        before = before.split("\n")[-1].lstrip()
        after = text[end : end + context_chars]
        after = after.split("\n")[0].rstrip()
        matched = text[start:end]
        lines.append(
            f"{GREEN}{type(citation).__name__}:{RESET} "
            f"{before}{BLUE}{BOLD}{matched}{RESET}{after}"
        )
        for key, value in citation.dump().items():
            if not value:
                continue
            if isinstance(value, dict):
                lines.append(f"  * {key}")
                lines.extend(
                    f"    * {sub_key}={sub_value!r}"
                    for sub_key, sub_value in value.items()
                )
            else:
                lines.append(f"  * {key}={value!r}")
    return "\n".join(lines)