import re
from collections import defaultdict
from typing import Callable, Dict, List, Optional, Tuple, cast

from eyecite.models import (
    CitationBase,
    FullCaseCitation,
    FullCitation,
    IdCitation,
    Resource,
    ResourceType,
    ShortCaseCitation,
    SupraCitation,
)
from eyecite.utils import strip_punct

# type shorthand
ResolvedFullCite = Tuple[FullCitation, ResourceType]
ResolvedFullCites = List[ResolvedFullCite]
Resolutions = Dict[ResourceType, List[CitationBase]]

# Skip id. citations that imply a page length longer than this,
# such as "1 U.S. 1. Id. at 200.":
MAX_OPINION_PAGE_COUNT = 150


def resolve_full_citation(full_citation: FullCitation) -> Resource:
    """By default, resolve `eyecite.models.FullCaseCitation` objects to a
    generic (but reference-unique) `eyecite.models.Resource` object.

    This method is publicly documented because even if you override this
    method yourself with more sophisticated resolution logic, you may wish to
    still use this one as a fallback. For example, this could be one sensible
    pattern:

    >>> def my_resolve(full_cite):
    ...     # special handling for resolution of known cases in a database
    ...     resource = MyOpinion.objects.get(full_cite)
    ...     if resource:
    ...         return resource
    ...     # allow normal clustering of other citations
    ...     return resolve_full_citation(full_cite)
    >>>
    >>> resolve_citations(citations, resolve_full_citation=my_resolve)
    >>>
    >>> returns (pseudo):
    >>> {
    ...     <MyOpinion object>: [<full_cite>, <short_cite>, <id_cite>],
    ...     <Resource object>: [<full_cite>, <short_cite>],
    >>> }

    Args:
        full_citation: A `eyecite.models.FullCitation` to resolve.

    Returns:
        The `eyecite.models.Resource` that the citation references.
    """
    return Resource(full_citation)


def _filter_by_matching_antecedent(
    resolved_full_cites: ResolvedFullCites,
    antecedent_guess: str,
) -> Optional[ResourceType]:
    """Return the single resource whose case name contains the antecedent.

    The guess is normalized with `strip_punct` and then tested as a substring
    of the defendant and plaintiff of every previously resolved
    `FullCaseCitation`. Other citation types are skipped.

    Args:
        resolved_full_cites: (full citation, resource) pairs seen so far.
        antecedent_guess: The party name guessed for the short-form citation.

    Returns:
        The matching resource if exactly one distinct resource matches;
        otherwise None (no match, ambiguous match, or an empty guess).
    """
    ag: str = strip_punct(antecedent_guess)

    # An empty string is a substring of every name, so a guess that is
    # nothing but punctuation would otherwise "match" every case citation.
    # Treat it the same way as having no guess at all.
    if not ag:
        return None

    matches: List[ResourceType] = []
    for full_citation, resource in resolved_full_cites:
        if not isinstance(full_citation, FullCaseCitation):
            continue
        defendant = full_citation.metadata.defendant
        plaintiff = full_citation.metadata.plaintiff
        if (defendant and ag in defendant) or (plaintiff and ag in plaintiff):
            matches.append(resource)

    # Remove duplicates and only accept if one candidate remains
    unique_matches = set(matches)
    return unique_matches.pop() if len(unique_matches) == 1 else None


def _has_invalid_pin_cite(
    full_cite: FullCitation, id_cite: IdCitation
) -> bool:
    """Return True if id_cite has a pin cite that can't be correct for the
    given full_cite.

    A pin cite is considered invalid when it does not start with a number
    (optionally preceded by "at "), or when that number falls outside the
    range [page, page + MAX_OPINION_PAGE_COUNT] of the full citation.
    """
    # if no pin cite, we're fine
    if not id_cite.metadata.pin_cite:
        return False

    # if full cite has no page (such as a statute), we don't know what to
    # check, so assume we're fine. `or ""` covers a "page" group that is
    # present but None; isdecimal() (unlike isdigit()) only accepts
    # characters that int() can actually parse.
    page_text = full_cite.groups.get("page") or ""
    if not page_text.isdecimal():
        return False

    # parse full cite page
    page = int(page_text)

    # parse short cite pin
    m = re.match(r"(?:at )?(\d+)", id_cite.metadata.pin_cite)
    if not m:
        # If pin cite doesn't start with a digit, assume it is invalid.
        # This is hopefully a conservative rule -- it will err for valid pin
        # cites like "Id. at *10", but successfully filter invalid pin cites
        # like "1 U.S. 1. ... Id. at ¶ 10".
        return True
    pin_cite = int(m.group(1))

    # check page range
    return pin_cite < page or pin_cite > page + MAX_OPINION_PAGE_COUNT


def _resolve_shortcase_citation(
    short_citation: ShortCaseCitation,
    resolved_full_cites: ResolvedFullCites,
) -> Optional[ResourceType]:
    """
    Try to match shortcase citations by checking whether their reporter and
    volume number matches those of any of the previously resolved full
    citations. If there are multiple possible matches, try to refine by also
    checking whether their antecedent_guess appears in either the defendant
    or plaintiff field of any of the previously resolved full citations.
    """
    short_reporter = short_citation.corrected_reporter()
    short_volume = short_citation.groups.get("volume")

    # Keep both the citation and its resource for further refinement below
    candidates: ResolvedFullCites = [
        (full_citation, resource)
        for full_citation, resource in resolved_full_cites
        if isinstance(full_citation, FullCaseCitation)
        and short_reporter == full_citation.corrected_reporter()
        and short_volume == full_citation.groups.get("volume")
    ]

    # Remove duplicates and only accept if one candidate remains
    if len({resource for _, resource in candidates}) == 1:
        return candidates[0][1]

    # Otherwise, if there is an antecedent guess, try to refine further
    if short_citation.metadata.antecedent_guess:
        return _filter_by_matching_antecedent(
            candidates, short_citation.metadata.antecedent_guess
        )

    # Otherwise, nothing left to try
    return None


def _resolve_supra_citation(
    supra_citation: SupraCitation,
    resolved_full_cites: ResolvedFullCites,
) -> Optional[ResourceType]:
    """
    Try to resolve supra citations by checking whether their antecedent_guess
    appears in either the defendant or plaintiff field of any of the
    previously resolved full citations.
    """
    # If no guess, can't do anything
    if not supra_citation.metadata.antecedent_guess:
        return None

    return _filter_by_matching_antecedent(
        resolved_full_cites, supra_citation.metadata.antecedent_guess
    )


def _resolve_id_citation(
    id_citation: IdCitation,
    last_resolution: ResourceType,
    resolutions: Resolutions,
) -> Optional[ResourceType]:
    """
    Resolve id citations to the resource of the previously resolved citation.

    Args:
        id_citation: The id. citation to resolve.
        last_resolution: The resource the immediately preceding citation
            resolved to; falsy if that resolution failed.
        resolutions: All resolutions recorded so far. The first citation
            recorded for `last_resolution` is used to sanity-check the pin
            cite.

    Returns:
        `last_resolution`, or None if the previous resolution failed or the
        pin cite is implausible for the referenced citation.
    """
    # if last resolution failed, id. cite should also fail
    if not last_resolution:
        return None

    # filter out citations based on pin cite
    full_cite = cast(FullCitation, resolutions[last_resolution][0])
    if _has_invalid_pin_cite(full_cite, id_citation):
        return None

    return last_resolution


def resolve_citations(
    citations: List[CitationBase],
    resolve_full_citation: Callable[
        [FullCitation], ResourceType
    ] = resolve_full_citation,
    resolve_shortcase_citation: Callable[
        [ShortCaseCitation, ResolvedFullCites],
        Optional[ResourceType],
    ] = _resolve_shortcase_citation,
    resolve_supra_citation: Callable[
        [SupraCitation, ResolvedFullCites],
        Optional[ResourceType],
    ] = _resolve_supra_citation,
    resolve_id_citation: Callable[
        [IdCitation, ResourceType, Resolutions], Optional[ResourceType]
    ] = _resolve_id_citation,
) -> Resolutions:
    """Resolve a list of citations to their associated resources by matching
    each type of Citation object (FullCaseCitation, ShortCaseCitation,
    SupraCitation, and IdCitation) to a "resource" object. A "resource" could
    be a document, a URL, a database entry, etc. -- anything that conforms to
    the (non-prescriptive) requirements of the `eyecite.models.ResourceType`
    type. By default, eyecite uses an extremely thin "resource" object that
    simply serves as a conceptual way to group citations with the same
    references together.

    This function assumes that the given list of citations is ordered in the
    order that they were extracted from the text (i.e., assumes that supra
    citations and id citations can only refer to previous references).

    It returns a dict in the following format:
    ```
    keys = resources
    values = lists of citations
    ```

    The individual resolution steps can be supplanted with more complex logic
    by passing custom functions (e.g., if you have a thicker resource
    abstraction that you want to use); the default approach is to use simple
    heuristics to narrow down the set of possible resolutions. If a citation
    cannot be definitively resolved to a resource, it is dropped and not
    resolved.

    Args:
        citations: A list of `eyecite.models.CitationBase` objects, returned
            from calling `eyecite.find.get_citations`.
        resolve_full_citation: A function that resolves
            `eyecite.models.FullCitation` objects to resources.
        resolve_shortcase_citation: A function that resolves
            `eyecite.models.ShortCaseCitation` objects to resources.
        resolve_supra_citation: A function that resolves
            `eyecite.models.SupraCitation` objects to resources.
        resolve_id_citation: A function that resolves
            `eyecite.models.IdCitation` objects to resources.

    Returns:
        A dictionary mapping `eyecite.models.ResourceType` objects (the keys)
        to lists of `eyecite.models.CitationBase` objects (the values).
    """
    # Dict of all citation resolutions
    resolutions: Resolutions = defaultdict(list)

    # List of (full citation, resolved resource) pairs, in document order
    resolved_full_cites: ResolvedFullCites = []

    # The resource of the most recently resolved citation, if any
    last_resolution: Optional[ResourceType] = None

    # Iterate over each citation and attempt to resolve it to a resource
    for citation in citations:
        # If the citation is a full citation, try to resolve it
        if isinstance(citation, FullCitation):
            resolution = resolve_full_citation(citation)
            resolved_full_cites.append((citation, resolution))

        # If the citation is a short case citation, try to resolve it
        elif isinstance(citation, ShortCaseCitation):
            resolution = resolve_shortcase_citation(
                citation, resolved_full_cites
            )

        # If the citation is a supra citation, try to resolve it
        elif isinstance(citation, SupraCitation):
            resolution = resolve_supra_citation(citation, resolved_full_cites)

        # If the citation is an id citation, try to resolve it
        elif isinstance(citation, IdCitation):
            resolution = resolve_id_citation(
                citation, last_resolution, resolutions
            )

        # If the citation is to an unknown document, ignore for now
        else:
            resolution = None

        # Remember the outcome (including failures) so that a following
        # id. citation resolves to -- or fails along with -- this citation.
        last_resolution = resolution
        if resolution:
            # Record the citation in the appropriate list
            resolutions[resolution].append(citation)

    return resolutions