import re
from collections import UserString
from dataclasses import dataclass, field
from datetime import datetime
from typing import (
    Any,
    Callable,
    Dict,
    Hashable,
    List,
    Optional,
    Sequence,
    Union,
    cast,
)

from eyecite.utils import HashableDict

ResourceType = Hashable


@dataclass(eq=True, frozen=True)
class Reporter:
    """Class for top-level reporters in `reporters_db`, like "S.W." """

    short_name: str
    name: str
    cite_type: str
    source: str  # one of "reporters", "laws", "journals"
    is_scotus: bool = False

    def __post_init__(self):
        """Flag the reporter as a SCOTUS reporter based on its type and name."""
        if (
            self.cite_type == "federal" and "supreme" in self.name.lower()
        ) or "scotus" in self.cite_type.lower():
            # use setattr because this class is frozen
            object.__setattr__(self, "is_scotus", True)


@dataclass(eq=True, frozen=True)
class Edition:
    """Class for individual editions in `reporters_db`,
    like "S.W." and "S.W.2d"."""

    reporter: Reporter
    short_name: str
    start: Optional[datetime]
    end: Optional[datetime]

    def includes_year(
        self,
        year: int,
    ) -> bool:
        """Return True if edition contains cases for the given year.

        A missing `start` or `end` is treated as an open bound. Years after
        the current calendar year are never included.
        """
        return (
            year <= datetime.now().year
            and (self.start is None or self.start.year <= year)
            and (self.end is None or self.end.year >= year)
        )


@dataclass(eq=True, unsafe_hash=True)
class CitationBase:
    """Base class for objects returned by `eyecite.find.get_citations`.

    We define several subclasses of this class below, representing the
    various types of citations that might exist."""

    token: "Token"  # token this citation came from
    index: int  # index of _token in the token list
    # span() overrides
    span_start: Optional[int] = None
    span_end: Optional[int] = None
    groups: dict = field(default_factory=dict)
    metadata: Any = None

    def __post_init__(self):
        """Set up groups and metadata."""
        # Allow groups to be used in comparisons:
        self.groups = HashableDict(self.token.groups)
        # Make metadata a self.Metadata object: a dict is expanded into
        # keyword arguments; anything else is replaced by an empty Metadata.
        self.metadata = (
            self.Metadata(**self.metadata)
            if isinstance(self.metadata, dict)
            else self.Metadata()
        )

    def __repr__(self):
        """Simplified repr() to be more readable than full dataclass repr().
        Just shows 'FullCaseCitation("matched text", groups=...)'."""
        return (
            f"{self.__class__.__name__}("
            + f"{repr(self.matched_text())}"
            + (f", groups={repr(self.groups)}" if self.groups else "")
            + f", metadata={repr(self.metadata)}"
            + ")"
        )

    @dataclass(eq=True, unsafe_hash=True)
    class Metadata:
        """Define fields on self.metadata."""

        parenthetical: Optional[str] = None

    def comparison_hash(self) -> int:
        """Return hash that will be the same if two cites are semantically
        equivalent."""
        return hash((type(self), tuple(self.groups.items())))

    def corrected_citation(self):
        """Return citation with any variations normalized."""
        return self.matched_text()

    def corrected_citation_full(self):
        """Return citation with any variations normalized, including
        extracted metadata if any."""
        return self.matched_text()

    def dump(self) -> dict:
        """Return citation data for printing by dump_citations.

        Metadata fields whose value is None are omitted.
        """
        return {
            "groups": self.groups,
            "metadata": {
                k: v
                for k, v in self.metadata.__dict__.items()
                if v is not None
            },
        }

    def matched_text(self):
        """Text that identified this citation, such as '1 U.S. 1' or 'Id.'"""
        return str(self.token)

    def span(self):
        """Start and stop offsets in source text for matched_text().

        `span_start` / `span_end` take precedence when set; otherwise the
        offsets of the underlying token are used.
        """
        return (
            self.span_start
            if self.span_start is not None
            else self.token.start,
            self.span_end if self.span_end is not None else self.token.end,
        )


@dataclass(eq=True, unsafe_hash=True, repr=False)
class ResourceCitation(CitationBase):
    """Base class for a case, law, or journal citation. Could be short or
    long."""

    # Editions that might match this reporter string
    exact_editions: Sequence[Edition] = field(default_factory=tuple)
    variation_editions: Sequence[Edition] = field(default_factory=tuple)
    all_editions: Sequence[Edition] = field(default_factory=tuple)
    edition_guess: Optional[Edition] = None

    # year extracted from metadata["year"] and converted to int,
    # if in a valid range
    year: Optional[int] = None

    def __post_init__(self):
        """Make iterables into tuples to make sure we're hashable."""
        self.exact_editions = tuple(self.exact_editions)
        self.variation_editions = tuple(self.variation_editions)
        # all_editions is always recomputed: exact matches first, then
        # variations.
        self.all_editions = tuple(self.exact_editions) + tuple(
            self.variation_editions
        )
        super().__post_init__()

    @dataclass(eq=True, unsafe_hash=True)
    class Metadata(CitationBase.Metadata):
        """Define fields on self.metadata."""

        pin_cite: Optional[str] = None
        year: Optional[str] = None

    def comparison_hash(self) -> int:
        """Return hash that will be the same if two cites are semantically
        equivalent."""
        return hash((super().comparison_hash(), self.all_editions))

    def add_metadata(self, words: "Tokens"):
        """Extract metadata from text before and after citation."""
        self.guess_edition()

    def dump(self) -> dict:
        """Return citation data for printing by dump_citations."""
        return {
            **super().dump(),
            "year": self.year,
        }

    def corrected_reporter(self):
        """Get official reporter string from edition_guess, if possible."""
        return (
            self.edition_guess.short_name
            if self.edition_guess
            else self.groups["reporter"]
        )

    def corrected_citation(self):
        """Return citation with corrected reporter."""
        if self.edition_guess:
            return self.matched_text().replace(
                self.groups["reporter"], self.edition_guess.short_name
            )
        return self.matched_text()

    def guess_edition(self):
        """Set edition_guess.

        The guess is only set when exactly one candidate edition remains.
        """
        # Use exact matches if possible, otherwise try variations
        editions = self.exact_editions or self.variation_editions
        if not editions:
            return

        # Attempt resolution by date
        if len(editions) > 1 and self.year:
            editions = [e for e in editions if e.includes_year(self.year)]

        if len(editions) == 1:
            self.edition_guess = editions[0]


@dataclass(eq=True, unsafe_hash=True, repr=False)
class FullCitation(ResourceCitation):
    """Abstract base class indicating that a citation fully identifies a
    resource."""


@dataclass(eq=True, unsafe_hash=True, repr=False)
class FullLawCitation(FullCitation):
    """Citation to a source from `reporters_db/laws.json`."""

    @dataclass(eq=True, unsafe_hash=True)
    class Metadata(FullCitation.Metadata):
        """Define fields on self.metadata."""

        publisher: Optional[str] = None
        day: Optional[str] = None
        month: Optional[str] = None

    def add_metadata(self, words: "Tokens"):
        """Extract metadata from text before and after citation."""
        # pylint: disable=import-outside-toplevel
        from eyecite.helpers import add_law_metadata

        add_law_metadata(self, words)
        super().add_metadata(words)

    def corrected_citation_full(self):
        """Return citation with any variations normalized, including
        extracted metadata if any."""
        parts = [self.corrected_citation()]
        m = self.metadata
        if m.pin_cite:
            # The pin cite is appended as-is, with no separator added here.
            parts.append(f"{m.pin_cite}")
        publisher_date = " ".join(
            i for i in (m.publisher, m.month, m.day, m.year) if i
        )
        if publisher_date:
            # Close the parenthesis so the output stays balanced.
            parts.append(f" ({publisher_date})")
        if m.parenthetical:
            parts.append(f" ({m.parenthetical})")
        return "".join(parts)


@dataclass(eq=True, unsafe_hash=True, repr=False)
class FullJournalCitation(FullCitation):
    """Citation to a source from `reporters_db/journals.json`."""

    def add_metadata(self, words: "Tokens"):
        """Extract metadata from text before and after citation."""
        # pylint: disable=import-outside-toplevel
        from eyecite.helpers import add_journal_metadata

        add_journal_metadata(self, words)
        super().add_metadata(words)

    def corrected_citation_full(self):
        """Return citation with any variations normalized, including
        extracted metadata if any."""
        parts = [self.corrected_citation()]
        m = self.metadata
        if m.pin_cite:
            parts.append(f", {m.pin_cite}")
        if m.year:
            # Close the parenthesis so the output stays balanced.
            parts.append(f" ({m.year})")
        if m.parenthetical:
            parts.append(f" ({m.parenthetical})")
        return "".join(parts)


@dataclass(eq=True, unsafe_hash=True, repr=False)
class CaseCitation(ResourceCitation):
    """Convenience class which represents a single citation found in a
    document.
    """

    @dataclass(eq=True, unsafe_hash=True)
    class Metadata(FullCitation.Metadata):
        """Define fields on self.metadata."""

        # court is included for ShortCaseCitation as well. It won't appear in
        # the cite itself but can also be guessed from the reporter
        court: Optional[str] = None

    def guess_court(self):
        """Set court based on reporter.

        Only fills in "scotus", and only when no court is already recorded.
        """
        if not self.metadata.court and any(
            e.reporter.is_scotus for e in self.all_editions
        ):
            self.metadata.court = "scotus"


@dataclass(eq=True, unsafe_hash=True, repr=False)
class FullCaseCitation(CaseCitation, FullCitation):
    """Convenience class which represents a standard, fully named citation,
    i.e., the kind of citation that marks the first time a document is cited.

    Example:
    ```
    Adarand Constructors, Inc. v. Peña, 515 U.S. 200, 240
    ```
    """

    @dataclass(eq=True, unsafe_hash=True)
    class Metadata(CaseCitation.Metadata):
        """Define fields on self.metadata."""

        plaintiff: Optional[str] = None
        defendant: Optional[str] = None
        extra: Optional[str] = None

    def add_metadata(self, words: "Tokens"):
        """Extract metadata from text before and after citation."""
        # pylint: disable=import-outside-toplevel
        from eyecite.helpers import add_defendant, add_post_citation

        add_post_citation(self, words)
        add_defendant(self, words)
        self.guess_court()
        super().add_metadata(words)

    def corrected_citation_full(self):
        """Return formatted version of extracted cite."""
        parts = []
        m = self.metadata
        if m.plaintiff:
            parts.append(f"{m.plaintiff} v. ")
        if m.defendant:
            parts.append(f"{m.defendant}, ")
        parts.append(self.corrected_citation())
        if m.pin_cite:
            parts.append(f", {m.pin_cite}")
        if m.extra:
            parts.append(m.extra)
        # Join the court/year values themselves; the Metadata dataclass is
        # not subscriptable, so it cannot be indexed by those values.
        publisher_date = " ".join(i for i in (m.court, m.year) if i)
        if publisher_date:
            # Close the parenthesis so the output stays balanced.
            parts.append(f" ({publisher_date})")
        if m.parenthetical:
            parts.append(f" ({m.parenthetical})")
        return "".join(parts)


@dataclass(eq=True, unsafe_hash=True, repr=False)
class ShortCaseCitation(CaseCitation):
    """Convenience class which represents a short form citation, i.e., the
    kind of citation made after a full citation has already appeared. This
    kind of citation lacks a full case name and usually has a different page
    number than the canonical citation.

    Examples:
    ```
    Adarand, 515 U.S., at 241
    Adarand, 515 U.S. at 241
    515 U.S., at 241
    ```
    """

    @dataclass(eq=True, unsafe_hash=True)
    class Metadata(CaseCitation.Metadata):
        """Define fields on self.metadata."""

        antecedent_guess: Optional[str] = None

    def corrected_citation_full(self):
        """Return formatted version of extracted cite."""
        parts = []
        if self.metadata.antecedent_guess:
            parts.append(f"{self.metadata.antecedent_guess}, ")
        parts.append(self.corrected_citation())
        return "".join(parts)


@dataclass(eq=True, unsafe_hash=True, repr=False)
class SupraCitation(CitationBase):
    """Convenience class which represents a 'supra' citation, i.e., a
    citation to something that is above in the document. Like a short form
    citation, this kind of citation lacks a full case name and usually has a
    different page number than the canonical citation.

    Examples:
    ```
    Adarand, supra, at 240
    Adarand, 515 supra, at 240
    Adarand, supra, somethingelse
    Adarand, supra. somethingelse
    ```
    """

    @dataclass(eq=True, unsafe_hash=True)
    class Metadata(CitationBase.Metadata):
        """Define fields on self.metadata."""

        antecedent_guess: Optional[str] = None
        pin_cite: Optional[str] = None
        volume: Optional[str] = None

    def formatted(self):
        """Return formatted version of extracted cite."""
        parts = []
        m = self.metadata
        if m.antecedent_guess:
            parts.append(f"{m.antecedent_guess}, ")
        if m.volume:
            parts.append(f"{m.volume} ")
        parts.append("supra")
        if m.pin_cite:
            parts.append(f", {m.pin_cite}")
        return "".join(parts)


@dataclass(eq=True, unsafe_hash=True, repr=False)
class IdCitation(CitationBase):
    """Convenience class which represents an 'id' or 'ibid' citation, i.e., a
    citation to the document referenced immediately prior. An 'id' citation is
    unlike a regular citation object since it has no knowledge of its reporter,
    volume, or page. Instead, the only helpful information that this reference
    possesses is a record of the pin cite after the 'id' token.

    Example: "... foo bar," id., at 240
    """

    @dataclass(eq=True, unsafe_hash=True)
    class Metadata(CitationBase.Metadata):
        """Define fields on self.metadata."""

        pin_cite: Optional[str] = None

    def formatted(self):
        """Return formatted version of extracted cite."""
        parts = ["id."]
        if self.metadata.pin_cite:
            parts.append(f", {self.metadata.pin_cite}")
        return "".join(parts)


@dataclass(eq=True, unsafe_hash=True, repr=False)
class UnknownCitation(CitationBase):
    """Convenience class which represents an unknown citation. A recognized
    citation should theoretically be parsed as a CaseCitation, FullLawCitation,
    or a FullJournalCitation. If it's something else, this class serves as
    a naive catch-all.
    """


def NonopinionCitation(*args, **kwargs):
    """Deprecated alias: warn, then build and return an UnknownCitation."""
    from warnings import warn

    warn(
        "NonopinionCitation will be deprecated in eyecite 2.5.0. "
        "Please use UnknownCitation instead.",
        DeprecationWarning,
        # Attribute the warning to the caller rather than to this shim.
        stacklevel=2,
    )
    return UnknownCitation(*args, **kwargs)


@dataclass(eq=True, unsafe_hash=True)
class Token(UserString):
    """Base class for special tokens. For performance, this isn't used
    for generic words."""

    data: str
    start: int
    end: int
    groups: dict = field(default_factory=dict, compare=False)

    @classmethod
    def from_match(cls, m, extra, offset=0) -> "Token":
        """Return a token object based on a regular expression match.
        This gets called by TokenExtractor. By default, just use the
        entire matched string."""
        start, end = m.span(1)
        # ignore "too many arguments" type error -- this is called
        # by subclasses with additional attributes
        return cls(  # type: ignore[call-arg]
            m[1], start + offset, end + offset, groups=m.groupdict(), **extra
        )

    def merge(self, other: "Token") -> Optional["Token"]:
        """Merge two tokens, by returning self if other is identical to
        self."""
        if (
            self.start == other.start
            and self.end == other.end
            and type(self) is type(other)
            and self.groups == other.groups
        ):
            return self
        return None


# For performance, lists of tokens can include either Token subclasses
# or bare strings (the typical case of words that aren't
# related to citations)
TokenOrStr = Union[Token, str]
Tokens = List[TokenOrStr]


@dataclass(eq=True, unsafe_hash=True)
class CitationToken(Token):
    """String matching a citation regex from `reporters_db/reporters.json`."""

    exact_editions: Sequence[Edition] = field(default_factory=tuple)
    variation_editions: Sequence[Edition] = field(default_factory=tuple)
    short: bool = False

    def __post_init__(self):
        """Make iterables into tuples to make sure we're hashable."""
        self.exact_editions = tuple(self.exact_editions)
        self.variation_editions = tuple(self.variation_editions)

    def merge(self, other: "Token") -> Optional["Token"]:
        """To merge citation tokens, also make sure `short` matches,
        and combine their editions."""
        merged = super().merge(other)
        if merged:
            other = cast(CitationToken, other)
            if self.short == other.short:
                # Mutates self: the other token's editions are appended.
                self.exact_editions = cast(tuple, self.exact_editions) + cast(
                    tuple, other.exact_editions
                )
                self.variation_editions = cast(
                    tuple, self.variation_editions
                ) + cast(tuple, other.variation_editions)
                return self
        return None


@dataclass(eq=True, unsafe_hash=True)
class SectionToken(Token):
    """Word containing a section symbol."""


@dataclass(eq=True, unsafe_hash=True)
class SupraToken(Token):
    """Word matching "supra" with or without punctuation."""


@dataclass(eq=True, unsafe_hash=True)
class IdToken(Token):
    """Word matching "id" or "ibid"."""


@dataclass(eq=True, unsafe_hash=True)
class ParagraphToken(Token):
    """Word matching a break between paragraphs."""


@dataclass(eq=True, unsafe_hash=True)
class StopWordToken(Token):
    """Word matching one of the STOP_TOKENS."""


@dataclass
class TokenExtractor:
    """Class for extracting all matches from a given string for the given
    regex, and then for returning Token objects for all matches."""

    regex: str
    # constructor should be Callable[[re.Match, dict, int], Token]
    # but this issue makes it inconvenient to specify the input types:
    # https://github.com/python/mypy/issues/5485
    constructor: Callable[..., Token]
    extra: Dict = field(default_factory=dict)
    flags: int = 0
    strings: List = field(default_factory=list)

    def get_matches(self, text):
        """Return match objects for all matches in text."""
        return self.compiled_regex.finditer(text)

    def get_token(self, m, offset=0) -> Token:
        """For a given match object, return a Token."""
        return self.constructor(m, self.extra, offset)

    def __hash__(self):
        """This needs to be hashable so we can remove redundant
        extractors returned by the pyahocorasick filter."""
        return hash(repr(self))

    @property
    def compiled_regex(self):
        """Cache compiled regex as a property."""
        if not hasattr(self, "_compiled_regex"):
            self._compiled_regex = re.compile(self.regex, flags=self.flags)
        return self._compiled_regex


@dataclass(frozen=True)
class Resource(ResourceType):
    """Thin resource class representing an object to which a citation can be
    resolved. See `eyecite.resolve` for more details."""

    citation: FullCitation

    def __hash__(self):
        """Resources are the same if their citations are semantically
        equivalent."""
        return self.citation.comparison_hash()

    def __eq__(self, other):
        """Compare resources by hash, i.e. by citation comparison_hash()."""
        return self.__hash__() == other.__hash__()