# The third-party ``regex`` package is preferred: several patterns assembled
# below repeat a group name (e.g. ``pin_cite`` and ``year`` appear twice in
# POST_FULL_CITATION_REGEX), which the standard-library ``re`` refuses to
# compile.  This module itself only calls ``sub`` with ``VERBOSE``, which
# behaves the same in both libraries, so fall back to ``re`` when ``regex``
# is not installed.
try:
    import regex as re
except ImportError:  # pragma: no cover - only reached without ``regex``
    import re

# *** Helpers for building regexes: ***


def space_boundaries_re(regex: str) -> str:
    """Wrap regex with space or end of string.

    The wrapped pattern is placed in a capturing group; the surrounding
    start-of-string / whitespace / end-of-string boundaries are not captured.
    """
    return rf"(?:^|\s)({regex})(?:\s|$)"


def strip_punctuation_re(regex: str) -> str:
    """Wrap regex with punctuation pattern.

    PUNCTUATION_REGEX is looked up when the function is called, so it may be
    defined later in this module than the function itself.
    """
    return rf"{PUNCTUATION_REGEX}{regex}{PUNCTUATION_REGEX}"


def nonalphanum_boundaries_re(regex: str) -> str:
    """Wrap regex to require non-alphanumeric characters on left and right.

    Start and end of string also count as boundaries. The wrapped pattern is
    placed in a capturing group.
    """
    return rf"(?:^|[^a-zA-Z0-9])({regex})(?:[^a-zA-Z0-9]|$)"


def short_cite_re(regex: str) -> str:
    """Convert a full citation regex into a short citation regex.

    Currently this just means we turn '(?P<reporter>...),? (?P<page>...'
    to '(?P<reporter>...),? at (?P<page>...'

    The rewrite is anchored on the ``reporter`` and ``page`` group names so
    that other adjacent named groups (such as a leading volume group) are
    left untouched. A regex without that reporter/page sequence is returned
    unchanged.
    """
    return re.sub(
        r"""
            # reporter group:
            (
                \(\?P<reporter>[^)]+\)
            )
            (?:,\?)?\   # comma and space
            # page group:
            (
                \(\?P<page>
            )
        """,
        r"\1,? at \2",
        regex,
        flags=re.VERBOSE,
    )


# *** Tokenizer regexes: ***
# Regexes used from tokenizers.py

# We need a regex that matches roman numerals but not the empty string,
# without using lookahead assertions that aren't supported by hyperscan.
# We *don't* want to match roman numerals 'v', 'l', or 'c', or numerals over
# 200, or uppercase, as these are usually false positives
# (see https://github.com/freelawproject/eyecite/issues/56 ).
# Match roman numerals 1 to 199 except for 5, 50, 100:
ROMAN_NUMERAL_REGEX = "|".join(
    [
        # 10-199, but not 50-59 or 100-109 or 150-159:
        r"c?(?:xc|xl|l?x{1,3})(?:ix|iv|v?i{0,3})",
        # 1-9, 51-59, 101-109, 151-159, but not 5, 55, 105, 155:
        r"(?:c?l?)(?:ix|iv|v?i{1,3})",
        # 55, 105, 150, 155:
        r"(?:lv|cv|cl|clv)",
    ]
)

# Page number regex to match one of the following:
# (ordered in descending order of likelihood)
# 1) A plain digit. E.g. "123"
# 2) A roman numeral.
PAGE_NUMBER_REGEX = rf"(?:\d+|{ROMAN_NUMERAL_REGEX})"

# Regex to match punctuation around volume numbers and stopwords.
# This could potentially be more precise.
PUNCTUATION_REGEX = r"[^\sa-zA-Z0-9]*"

# Regex for IdToken
ID_REGEX = space_boundaries_re(r"id\.,?|ibid\.")

# Regex for SupraToken
SUPRA_REGEX = space_boundaries_re(strip_punctuation_re("supra"))

# Regex for StopWordToken
STOP_WORDS = (
    "v",
    "re",
    "parte",
    "denied",
    "citing",
    "aff'd",
    "affirmed",
    "remanded",
    "see",
    "granted",
    "dismissed",
)
STOP_WORD_REGEX = space_boundaries_re(
    strip_punctuation_re(rf'(?P<stop_word>{"|".join(STOP_WORDS)})')
)

# Regex for SectionToken
SECTION_REGEX = r"(\S*§\S*)"

# Regex for ParagraphToken
PARAGRAPH_REGEX = r"(\n)"


# *** Metadata regexes: ***
# Regexes used to scan forward or backward from a citation token. NOTE:
# * Regexes are written in verbose mode. Intentional spaces must be escaped.
# * In many regexes order matters: options separated by "|" are
#   tested left to right, so more specific (typically longer) have to come
#   before less specific.

# Parenthetical regex:
# Capture a parenthetical after a cite, like " (overruling Foo)"
PARENTHETICAL_REGEX = r"""
    (?:
        # optional space, opening paren
        \ ?\(
            # capture until last end paren, we'll trim off extra afterwards
            (?P<parenthetical>.*)
        \)
    )?
"""

# Month regex:
# Abbreviated month names as they appear in dated law citations.
MONTH_REGEX = r"""
    (?P<month>
        Jan\.|Feb\.|Mar\.|Apr\.|May|June|
        July|Aug\.|Sept\.|Oct\.|Nov\.|Dec\.
    )
"""

# Year regex:
# A four-digit year, optionally followed by a two-digit range end.
YEAR_REGEX = r"""
    (?:
        (?P<year>
            \d{4}
        )
        # Year is occasionally a range, like "1993-94" or "2005-06".
        # For now we ignore the end of the range:
        (?:-\d{2})?
    )
"""

# Pin cite regex:
# A pin cite is the part of a citation used to specify a particular section of
# the referenced document. These may have prefixes, may include paragraph,
# page, or line references, and may have multiple ranges specified.
# For some examples see
# https://github.com/freelawproject/courtlistener/issues/1344#issuecomment-662994948
PIN_CITE_TOKEN_REGEX = r"""
    # optional label (longest to shortest):
    (?:
        (?:
            (?:&\ )?note|    # note, & note
            (?:&\ )?nn?\.?|  # n., nn., & nn.
            (?:&\ )?fn?\.?|  # fn., & fn.
            ¶{1,2}|          # ¶
            §{1,2}|          # §
            \*{1,4}|         # *
            pg\.?|           # pg.
            pp?\.?           # p., pp.
        )\ ?                 # optional space after label
    )?
    (?:
        # page:paragraph cite, like 123:24-25 or 123:24-124:25:
        \d+:\d+(?:-\d+(?::\d+)?)?|
        # page range, like 12 or 12-13:
        \d+(?:-\d+)?
    )
"""
PIN_CITE_REGEX = rf"""
    (?P<pin_cite>
        # optional comma, space, "at" before pin cite
        ,?\ ?(?:at\ )?
        # first mandatory page number
        {PIN_CITE_TOKEN_REGEX}
        # optional additional page numbers
        (?:,\ ?{PIN_CITE_TOKEN_REGEX})*
        # pin cite must be followed by one of these so it doesn't capture
        # start of next citation
        (?=
            [,.;)\]\\]|  # ending punctuation
            \ ?[(\[]|    # space and start of parens
            $            # end of text
        )
    )
"""

# Law subsection regex:
# Capture a single subsection like "(a)", "(1)", or "(viii)":
LAW_SUBSECTION = r"""
    (?:
        \([0-9a-zA-Z]{1,4}\)
    )
"""

# Law pin cite regex:
# Capture pin cite immediately after a law section number.
# Examples:
#   ...(a)(2)
#   ...(a)(2) and (d)
#   ... et seq.
# We should also capture ranges like "123-124" here, but those are ambiguous
# and are already captured as section numbers the same as "12-34-5".
LAW_PIN_CITE_REGEX = rf"""
    (?P<pin_cite>
        # subsection like (a)(1)(xiii):
        {LAW_SUBSECTION}*
        (?:\ and\ {LAW_SUBSECTION}+)?
        (?:\ et\ seq\.)?
    )
"""

# Short cite antecedent regex:
# What case does a short cite refer to? For now, we just capture the previous
# word optionally followed by a comma. Example: Adarand, 515 U.S. at 241.
SHORT_CITE_ANTECEDENT_REGEX = r"""
    (?P<antecedent>[\w\-.]+),?
    \   # final space
"""

# Supra cite antecedent regex:
# What case does a supra cite refer to? For now, we just capture the previous
# word optionally followed by a comma. Example: Adarand, supra.
# If the previous word is a digit, we capture both that (to store as a volume)
# and the word before it (to store as antecedent).
# NOTE: group names are repeated across the alternatives, which requires the
# ``regex`` package to compile.
SUPRA_ANTECEDENT_REGEX = r"""
    (?:
        (?P<antecedent>[\w\-.]+),?\ (?P<volume>\d+)|
        (?P<volume>\d+)|
        (?P<antecedent>[\w\-.]+),?
    )
    \   # final space
"""

# Post full citation regex:
# Capture metadata after a full cite. For example given the citation "1 U.S. 1"
# with the following text:
#    1 U.S. 1, 4-5, 2 S. Ct. 2, 6-7 (4th Cir. 2012) (overruling foo)
# we want to capture:
#   pin_cite = 4-5
#   extra = 2 S. Ct. 2, 6-7
#   court = 4th Cir.
#   year = 2012
#   parenthetical = overruling foo
# NOTE: the pin cite and year sub-patterns are each embedded twice, so their
# group names repeat; this requires the ``regex`` package to compile.
POST_FULL_CITATION_REGEX = rf"""
    (?:  # handle a full cite with a valid year paren:
        # content before year paren:
        (?:
            # pin cite with comma and extra:
            {PIN_CITE_REGEX}?
            ,?\ ?
            (?P<extra>[^(]*)
        )
        # content within year paren:
        \((?:
            # court and year:
            (?P<court>[^)]+)\ {YEAR_REGEX}|
            # just year:
            {YEAR_REGEX}
        )\)
        # optional parenthetical comment:
        {PARENTHETICAL_REGEX}
    |  # handle a pin cite with no valid year paren:
        {PIN_CITE_REGEX}
    )
"""

# Post short-form citation regex:
# Capture pin cite and parenthetical after a short, id, or supra citation.
# For example, given the citation 'asdf, 1 U.S., at 3 (overruling xyz)',
# this will capture:
#   pin_cite = 3
#   parenthetical = overruling xyz
POST_SHORT_CITATION_REGEX = rf"""
    # optional pin cite
    {PIN_CITE_REGEX}?
    \ ?
    # optional parenthetical comment:
    {PARENTHETICAL_REGEX}
"""

# Post law citation regex:
# statutory and regulatory cites may have publishers and dates after them, like
# (West), (West 1999), (Lexis Jun. 2018), (1999), or (May 2, 1999),
# and then may be followed by a parenthetical:
POST_LAW_CITATION_REGEX = rf"""
    {LAW_PIN_CITE_REGEX}?
    \ ?
    (?:\(
        # Consol., McKinney, Deering, West, LexisNexis, etc.
        (?P<publisher>
            [A-Z][a-z]+\.?
            (?:\ Supp\.)?
        )?
        \ ?
        # month
        (?:{MONTH_REGEX}\ )?
        # day
        (?P<day>\d{{1,2}})?,?\ ?
        # four-digit year
        {YEAR_REGEX}?
    \))?
    \ ?
    # parenthetical
    {PARENTHETICAL_REGEX}
"""

# Post journal cite regex:
# Journal cites may have a pin cite, then year, then parenthetical.
POST_JOURNAL_CITATION_REGEX = rf"""
    {PIN_CITE_REGEX}?
    \ ?
    (?:\({YEAR_REGEX}\))?
    \ ?
    {PARENTHETICAL_REGEX}
"""