from pathlib import Path
from unittest import TestCase

from eyecite import annotate_citations, clean_text, get_citations


class AnnotateTest(TestCase):
    """Tests for ``annotate_citations`` run against cleaned source text."""

    def test_annotate(self):
        """Annotate each table entry and compare with the expected string.

        Each entry of ``test_pairs`` is
        ``(source_text, expected, clean_steps[, annotate_kwargs])``; the
        optional fourth item is a dict forwarded to ``annotate_citations``
        as keyword arguments.
        """

        def straighten_quotes(text):
            """Custom cleaner: swap the curly apostrophe for a straight one."""
            return text.replace("’", "'")

        def lower_annotator(before, text, after):
            """Custom annotator: lowercase the text between the markers."""
            return before + text.lower() + after

        # NOTE(review): the cases labelled "html" below contain no markup in
        # their source strings, and the closing marker passed to
        # annotate_citations is empty. That looks like tags were stripped
        # from this fixture at some point -- confirm against upstream before
        # trusting these expectations.
        test_pairs = (
            # single cite
            ("1 U.S. 1", "<0>1 U.S. 1", []),
            # cite with extra text
            ("foo 1 U.S. 1 bar", "foo <0>1 U.S. 1 bar", []),
            # cite with punctuation
            ("foo '1 U.S. 1' bar", "foo '<0>1 U.S. 1' bar", []),
            # law cite
            (
                "foo. Mass. Gen. Laws ch. 1, § 2. bar",
                "foo. <0>Mass. Gen. Laws ch. 1, § 2. bar",
                [],
            ),
            # journal cite
            (
                "foo. 1 Minn. L. Rev. 2. bar",
                "foo. <0>1 Minn. L. Rev. 2. bar",
                [],
            ),
            # Id. cite
            (
                "1 U.S. 1. Foo. Id. Bar. Id. at 2.",
                "<0>1 U.S. 1. Foo. <1>Id. Bar. <2>Id. at 2.",
                [],
            ),
            # Supra cite
            (
                "1 U.S. 1. Foo v. Bar, supra at 2.",
                "<0>1 U.S. 1. Foo v. Bar, <1>supra at 2.",
                [],
            ),
            # whitespace and html -- no unbalanced tag check
            (
                "foo 1 U.S. 1 bar",
                "foo <0>1 U.S. 1 bar",
                ["html", "inline_whitespace"],
            ),
            # whitespace and html -- skip unbalanced tags
            (
                "foo 1 U.S. 1; 2 U.S. 2",
                "foo 1 U.S. 1; <1>2 U.S. 2",
                ["html", "inline_whitespace"],
                {"unbalanced_tags": "skip"},
            ),
            # whitespace and html -- wrap unbalanced tags
            (
                "1 U.S. 1; 2 U.S. 2",
                "<0>1 U.S.<0> 1; <1>2 U.S. 2",
                ["html", "inline_whitespace"],
                {"unbalanced_tags": "wrap"},
            ),
            # tightly-wrapped html -- skip unbalanced tags (issue #54)
            (
                "foo Ibid. bar",
                "foo <0>Ibid. bar",
                ["html", "inline_whitespace"],
                {"unbalanced_tags": "skip"},
            ),
            # whitespace containing linebreaks
            ("1\nU.S. 1", "<0>1\nU.S. 1", ["all_whitespace"]),
            # multiple Id. tags
            (
                "1 U.S. 1. Id. 2 U.S. 2. Id.",
                "<0>1 U.S. 1. <1>Id. <2>2 U.S. 2. <3>Id.",
                [],
            ),
            # replacement in cleaners
            (
                "1 Abbott’s Pr.Rep. 1",
                "<0>1 Abbott’s Pr.Rep. 1",
                [straighten_quotes],
            ),
            # custom annotator
            (
                "1 U.S. 1",
                "<0>1 u.s. 1",
                [],
                {"annotator": lower_annotator},
            ),
        )

        for source_text, expected, clean_steps, *extra in test_pairs:
            # The star target is a (possibly empty) list; keep it under its
            # own name instead of rebinding it to the kwargs dict.
            annotate_kwargs = extra[0] if extra else {}
            with self.subTest(
                source_text,
                clean_steps=clean_steps,
                annotate_args=annotate_kwargs,
            ):
                plain_text = clean_text(source_text, clean_steps)
                cites = get_citations(plain_text)
                # One (span, opening marker, closing marker) triple per
                # citation; the opening marker carries the citation index.
                annotations = [
                    (cite.span(), f"<{index}>", "")
                    for index, cite in enumerate(cites)
                ]
                annotated = annotate_citations(
                    plain_text,
                    annotations,
                    source_text=source_text,
                    **annotate_kwargs,
                )
                self.assertEqual(annotated, expected)

    def test_long_diff(self):
        """Does diffing work across a long text with many changes?"""
        asset_path = Path(__file__).parent / "assets" / "opinion.txt"
        # The span below is a pair of hard-coded character offsets, so the
        # file must decode the same way on every platform rather than follow
        # the locale default.
        # NOTE(review): assumes the asset is stored as UTF-8 -- confirm.
        opinion_text = asset_path.read_text(encoding="utf-8")
        cleaned_text = clean_text(opinion_text, ["all_whitespace"])
        annotated_text = annotate_citations(
            cleaned_text, [((902, 915), "~FOO~", "~BAR~")], opinion_text
        )
        self.assertIn("~FOO~539\n U. S. 306~BAR~", annotated_text)