import os
from copy import copy
from datetime import datetime
from unittest import TestCase

from eyecite import clean_text, get_citations
from eyecite.models import ResourceCitation
from eyecite.test_factories import (
    case_citation,
    id_citation,
    journal_citation,
    law_citation,
    supra_citation,
    unknown_citation,
)
from eyecite.tokenizers import (
    EDITIONS_LOOKUP,
    EXTRACTORS,
    AhocorasickTokenizer,
    HyperscanTokenizer,
    Tokenizer,
)

# By default tests use a cache for speed.
# Call tests with `EYECITE_CACHE_DIR= python ...` to disable the cache
# (an empty value falls through to None).
cache_dir = os.environ.get("EYECITE_CACHE_DIR", ".test_cache") or None

# Tokenizers that run_test_pairs() exercises when the caller passes none.
tested_tokenizers = [
    Tokenizer(),
    AhocorasickTokenizer(),
    HyperscanTokenizer(cache_dir=cache_dir),
]


class FindTest(TestCase):
    """Citation-finding tests driven by (text, expected citations) pairs."""

    maxDiff = None

    def run_test_pairs(self, test_pairs, message, tokenizers=None):
        """Run get_citations() over each pair and compare with expectations.

        Args:
            test_pairs: iterable of ``(text, expected_cites)`` or
                ``(text, expected_cites, options)`` tuples. ``options`` is a
                dict; its optional ``"clean"`` entry lists the steps handed
                to ``clean_text()``, and every other entry is forwarded to
                ``get_citations()`` as a keyword argument.
            message: label passed to ``subTest`` for every case.
            tokenizers: tokenizers to run each case with; defaults to the
                module-level ``tested_tokenizers``.

        For each case the found and expected citations must have the same
        types in the same order, and then equal ``groups`` and ``metadata``
        (plus ``year`` and ``corrected_reporter()`` for ResourceCitation
        instances). Other attributes are not compared.
        """

        def get_comparison_attrs(cite):
            # Only these attributes take part in the equality check.
            out = {
                "groups": cite.groups,
                "metadata": cite.metadata,
            }
            if isinstance(cite, ResourceCitation):
                out["year"] = cite.year
                out["corrected_reporter"] = cite.corrected_reporter()
            return out

        if tokenizers is None:
            tokenizers = tested_tokenizers

        for q, expected_cites, *extra in test_pairs:
            # Copy the options so that popping "clean" below does not mutate
            # the caller's dict (which would break a second run over the
            # same test_pairs).
            options = dict(extra[0]) if extra else {}
            clean_steps = options.pop("clean", [])
            clean_q = clean_text(q, clean_steps)
            for tokenizer in tokenizers:
                with self.subTest(
                    message, tokenizer=type(tokenizer).__name__, q=q
                ):
                    cites_found = get_citations(
                        clean_q, tokenizer=tokenizer, **options
                    )
                    # Comparing the type lists checks both the number of
                    # citations and their kinds in one assertion.
                    self.assertEqual(
                        [type(i) for i in cites_found],
                        [type(i) for i in expected_cites],
                        f"Extracted cite count doesn't match for {repr(q)}",
                    )
                    for found, expected in zip(cites_found, expected_cites):
                        self.assertEqual(
                            get_comparison_attrs(found),
                            get_comparison_attrs(expected),
                            f"Extracted cite attrs don't match for {repr(q)}",
                        )

    def test_find_citations(self):
        """Can we find and make citation objects from strings?"""
        # fmt: off
        test_pairs = (
            # Basic test
            ('1 U.S. 1',
             [case_citation()]),
            # Basic test with a line break
            ('1 U.S.\n1',
             [case_citation()],
             {'clean': ['all_whitespace']}),
            # Basic test with a line break within a reporter
            ('1 U.\nS. 1',
             [case_citation(reporter_found='U. S.')],
             {'clean': ['all_whitespace']}),
            # Basic test of non-case name before citation (should not be found)
            ('lissner test 1 U.S. 1',
             [case_citation()]),
            # Test with plaintiff and defendant
            ('lissner v. test 1 U.S. 1',
             [case_citation(metadata={'plaintiff': 'lissner',
                                      'defendant': 'test'})]),
            # Test with plaintiff, defendant and year
            ('lissner v. test 1 U.S. 1 (1982)',
             [case_citation(metadata={'plaintiff': 'lissner',
                                      'defendant': 'test'},
                            year=1982)]),
            # Don't choke on misformatted year
            ('lissner v. test 1 U.S. 1 (198⁴)',
             [case_citation(metadata={'plaintiff': 'lissner',
                                      'defendant': 'test'})]),
            # Test with different reporter than all of above.
            ('bob lissner v. test 1 F.2d 1 (1982)',
             [case_citation(reporter='F.2d', year=1982,
                            metadata={'plaintiff': 'lissner',
                                      'defendant': 'test'})]),
            # Test with comma after defendant's name
            ('lissner v. test, 1 U.S. 1 (1982)',
             [case_citation(metadata={'plaintiff': 'lissner',
                                      'defendant': 'test'},
                            year=1982)]),
            # Test with court and extra information
            ('bob lissner v. test 1 U.S. 12, 347-348 (4th Cir. 1982)',
             [case_citation(page='12', year=1982,
                            metadata={'plaintiff': 'lissner',
                                      'defendant': 'test',
                                      'court': 'ca4',
                                      'pin_cite': '347-348'})]),
            # Parallel cite with parenthetical
            ('bob lissner v. test 1 U.S. 12, 347-348, 1 S. Ct. 2, 358 (4th Cir. 1982) (overruling foo)',
             [case_citation(page='12', year=1982,
                            metadata={'plaintiff': 'lissner',
                                      'defendant': 'test',
                                      'court': 'ca4',
                                      'pin_cite': '347-348',
                                      'extra': "1 S. Ct. 2, 358",
                                      'parenthetical': 'overruling foo'}),
              case_citation(page='2', reporter='S. Ct.', year=1982,
                            metadata={'plaintiff': 'lissner',
                                      'defendant': 'test 1 U.S. 12, 347-348',
                                      'court': 'ca4',
                                      'pin_cite': '358',
                                      'parenthetical': 'overruling foo'}),
              ]),
            # Test full citation with nested parenthetical
            ('lissner v. test 1 U.S. 1 (1982) (discussing abc (Holmes, J., concurring))',
             [case_citation(metadata={'plaintiff': 'lissner',
                                      'defendant': 'test',
                                      'parenthetical': 'discussing abc (Holmes, J., concurring)'},
                            year=1982)]),
            # Test full citation with parenthetical and subsequent unrelated parenthetical
            ('lissner v. test 1 U.S. 1 (1982) (discussing abc); blah (something).',
             [case_citation(metadata={'plaintiff': 'lissner',
                                      'defendant': 'test',
                                      'parenthetical': 'discussing abc'},
                            year=1982)]),
            # Test with text before and after and a variant reporter
            ('asfd 22 U. S. 332 (1975) asdf',
             [case_citation(page='332', volume='22',
                            reporter_found='U. S.', year=1975)]),
            # Test with finding reporter when it's a second edition
            ('asdf 22 A.2d 332 asdf',
             [case_citation(page='332', reporter='A.2d', volume='22')]),
            # Test if reporter in string will find proper citation string
            ('A.2d 332 11 A.2d 333',
             [case_citation(page='333', reporter='A.2d', volume='11')]),
            # Test finding a variant second edition reporter
            ('asdf 22 A. 2d 332 asdf',
             [case_citation(page='332', reporter='A.2d', volume='22',
                            reporter_found='A. 2d')]),
            # Test finding a variant of an edition resolvable by variant alone.
            ('171 Wn.2d 1016',
             [case_citation(page='1016', reporter='Wash. 2d', volume='171',
                            reporter_found='Wn.2d')]),
            # Test finding two citations where one of them has abutting
            # punctuation.
            ('2 U.S. 3, 4-5 (3 Atl. 33)',
             [case_citation(page='3', volume='2',
                            metadata={'pin_cite': '4-5'}),
              case_citation(page='33', reporter="A.", volume='3',
                            reporter_found="Atl.")]),
            # Test with the page number as a Roman numeral
            ('12 Neb. App. lxiv (2004)',
             [case_citation(page='lxiv', reporter='Neb. Ct. App.',
                            volume='12', reporter_found='Neb. App.',
                            year=2004)]),
            # Test with page range with a weird suffix
            ('559 N.W.2d 826|N.D.',
             [case_citation(page='826', reporter='N.W.2d', volume='559')]),
            # Test with malformed/missing page number
            ('1 U.S. f24601',
             []),
            # Test with the 'digit-REPORTER-digit' corner-case formatting
            ('2007-NMCERT-008',
             [case_citation(source_text='2007-NMCERT-008', page='008',
                            reporter='NMCERT', volume='2007')]),
            ('2006-Ohio-2095',
             [case_citation(source_text='2006-Ohio-2095', page='2095',
                            reporter='Ohio', volume='2006')]),
            ('2017 IL App (4th) 160407',
             [case_citation(page='160407', reporter='IL App (4th)',
                            volume='2017')]),
            ('2017 IL App (1st) 143684-B',
             [case_citation(page='143684-B', reporter='IL App (1st)',
                            volume='2017')]),
            # Test first kind of short form citation (meaningless antecedent)
            ('before asdf 1 U. S., at 2',
             [case_citation(page='2', reporter_found='U. S.', short=True,
                            metadata={'antecedent_guess': 'asdf'})]),
            # Test second kind of short form citation (meaningful antecedent)
            ('before asdf, 1 U. S., at 2',
             [case_citation(page='2', reporter='U.S.',
                            reporter_found='U. S.', short=True,
                            metadata={'antecedent_guess': 'asdf'})]),
            # Test short form citation with preceding non-ASCII (curly)
            # quotation mark
            ('before asdf,” 1 U. S., at 2',
             [case_citation(page='2', reporter_found='U. S.', short=True)]),
            # Test short form citation when case name looks like a reporter
            ('before Johnson, 1 U. S., at 2',
             [case_citation(page='2', reporter_found='U. S.', short=True,
                            metadata={'antecedent_guess': 'Johnson'})]),
            # Test short form citation with no comma after reporter
            ('before asdf, 1 U. S. at 2',
             [case_citation(page='2', reporter='U.S.',
                            reporter_found='U. S.', short=True,
                            metadata={'antecedent_guess': 'asdf'})]),
            # Test short form citation at end of document (issue #1171)
            ('before asdf, 1 U. S. end',
             []),
            # Test supra citation across line break
            ('before asdf, supra,\nat 2',
             [supra_citation("supra,",
                             metadata={'pin_cite': 'at 2',
                                       'antecedent_guess': 'asdf'})],
             {'clean': ['all_whitespace']}),
            # Test short form citation with a page range
            ('before asdf, 1 U. S., at 20-25',
             [case_citation(page='20', reporter_found='U. S.', short=True,
                            metadata={'pin_cite': '20-25',
                                      'antecedent_guess': 'asdf'})]),
            # Test short form citation with a page range with weird suffix
            ('before asdf, 1 U. S., at 20-25\\& n. 4',
             [case_citation(page='20', reporter_found='U. S.', short=True,
                            metadata={'pin_cite': '20-25',
                                      'antecedent_guess': 'asdf'})]),
            # Test short form citation with a parenthetical
            ('before asdf, 1 U. S., at 2 (overruling xyz)',
             [case_citation(page='2', reporter='U.S.',
                            reporter_found='U. S.', short=True,
                            metadata={'antecedent_guess': 'asdf',
                                      'parenthetical': 'overruling xyz'}
                            )]),
            # Test short form citation with no space before parenthetical
            ('before asdf, 1 U. S., at 2(overruling xyz)',
             [case_citation(page='2', reporter='U.S.',
                            reporter_found='U. S.', short=True,
                            metadata={'antecedent_guess': 'asdf',
                                      'parenthetical': 'overruling xyz'}
                            )]),
            # Test short form citation with nested parentheticals
            ('before asdf, 1 U. S., at 2 (discussing xyz (Holmes, J., concurring))',
             [case_citation(page='2', reporter='U.S.',
                            reporter_found='U. S.', short=True,
                            metadata={'antecedent_guess': 'asdf',
                                      'parenthetical': 'discussing xyz (Holmes, J., concurring)'}
                            )]),
            # Test that short form citation doesn't treat year as parenthetical
            ('before asdf, 1 U. S., at 2 (2016)',
             [case_citation(page='2', reporter='U.S.',
                            reporter_found='U. S.', short=True,
                            metadata={'antecedent_guess': 'asdf'}
                            )]),
            # Test short form citation with page range and parenthetical
            ('before asdf, 1 U. S., at 20-25 (overruling xyz)',
             [case_citation(page='20', reporter='U.S.',
                            reporter_found='U. S.', short=True,
                            metadata={'antecedent_guess': 'asdf',
                                      'pin_cite': '20-25',
                                      'parenthetical': 'overruling xyz'}
                            )]),
            # Test short form citation with subsequent unrelated parenthetical
            ('asdf, 1 U. S., at 4 (discussing abc). Some other nonsense (clarifying nonsense)',
             [case_citation(page='4', reporter='U.S.',
                            reporter_found='U. S.', short=True,
                            metadata={'antecedent_guess': 'asdf',
                                      'parenthetical': 'discussing abc'}
                            )]
             ),
            # Test short form citation generated from non-standard regex for full cite
            ('1 Mich. at 1',
             [case_citation(reporter='Mich.', short=True)]),
            # Test parenthetical matching with multiple citations
            ('1 U. S., at 2. foo v. bar 3 U. S. 4 (2010) (overruling xyz).',
             [case_citation(page='2', reporter='U.S.',
                            reporter_found='U. S.', short=True, volume='1',
                            metadata={'pin_cite': '2'}),
              case_citation(page='4', reporter='U.S.',
                            reporter_found='U. S.', short=False, year=2010,
                            volume='3',
                            metadata={'parenthetical': 'overruling xyz',
                                      'plaintiff': 'foo',
                                      'defendant': 'bar'})
              ]),
            # Test with multiple citations and parentheticals
            ('1 U. S., at 2 (criticizing xyz). foo v. bar 3 U. S. 4 (2010) (overruling xyz).',
             [case_citation(page='2', reporter='U.S.',
                            reporter_found='U. S.', short=True, volume='1',
                            metadata={'pin_cite': '2',
                                      'parenthetical': 'criticizing xyz'}),
              case_citation(page='4', reporter='U.S.',
                            reporter_found='U. S.', short=False, year=2010,
                            volume='3',
                            metadata={'parenthetical': 'overruling xyz',
                                      'plaintiff': 'foo',
                                      'defendant': 'bar'})
              ]),
            # Test first kind of supra citation (standard kind)
            ('before asdf, supra, at 2',
             [supra_citation("supra,",
                             metadata={'pin_cite': 'at 2',
                                       'antecedent_guess': 'asdf'})]),
            # Test second kind of supra citation (with volume)
            ('before asdf, 123 supra, at 2',
             [supra_citation("supra,",
                             metadata={'pin_cite': 'at 2',
                                       'volume': '123',
                                       'antecedent_guess': 'asdf'})]),
            # Test third kind of supra citation (sans page)
            ('before asdf, supra, foo bar',
             [supra_citation("supra,",
                             metadata={'antecedent_guess': 'asdf'})]),
            # Test third kind of supra citation (with period)
            ('before asdf, supra. foo bar',
             [supra_citation("supra,",
                             metadata={'antecedent_guess': 'asdf'})]),
            # Test supra citation at end of document (issue #1171)
            ('before asdf, supra end',
             [supra_citation("supra,",
                             metadata={'antecedent_guess': 'asdf'})]),
            # Supra with parenthetical
            ('Foo, supra (overruling ...) (ignore this)',
             [supra_citation("supra",
                             metadata={'antecedent_guess': 'Foo',
                                       'parenthetical': 'overruling ...'})]),
            ('Foo, supra, at 2 (overruling ...)',
             [supra_citation("supra",
                             metadata={'antecedent_guess': 'Foo',
                                       'pin_cite': 'at 2',
                                       'parenthetical': 'overruling ...'})]),
            # Test Ibid. citation
            ('foo v. bar 1 U.S. 12. asdf. Ibid. foo bar lorem ipsum.',
             [case_citation(page='12',
                            metadata={'plaintiff': 'foo',
                                      'defendant': 'bar'}),
              id_citation('Ibid.')]),
            # Test italicized Ibid. citation
            # NOTE(review): this input carries no markup even though the
            # case is about italics and runs the 'html' clean step -- the
            # tags look like they were stripped from the file; restore the
            # original HTML input if it can be recovered.
            ('\n\nbefore asdf. Ibid.\n\nfoo bar lorem\n\n',
             [id_citation('Ibid.')],
             {'clean': ['html', 'inline_whitespace']}),
            # Test Id. citation
            ('foo v. bar 1 U.S. 12, 347-348. asdf. Id., at 123. foo bar',
             [case_citation(page='12',
                            metadata={'plaintiff': 'foo',
                                      'defendant': 'bar',
                                      'pin_cite': '347-348'}),
              id_citation('Id.,', metadata={'pin_cite': 'at 123'})]),
            # Test Id. citation across line break
            ('foo v. bar 1 U.S. 12, 347-348. asdf. Id.,\nat 123. foo bar',
             [case_citation(page='12',
                            metadata={'plaintiff': 'foo',
                                      'defendant': 'bar',
                                      'pin_cite': '347-348'}),
              id_citation('Id.,', metadata={'pin_cite': 'at 123'})],
             {'clean': ['all_whitespace']}),
            # Test italicized Id. citation
            # NOTE(review): markup appears to be missing here as well.
            ('\n\nbefore asdf. Id., at 123.\n\nfoo bar\n\n',
             [id_citation('Id.,', metadata={'pin_cite': 'at 123'})],
             {'clean': ['html', 'inline_whitespace']}),
            # Test italicized Id. citation with another HTML tag in the way
            # NOTE(review): markup appears to be missing here as well, which
            # makes this input identical to the previous case.
            ('\n\nbefore asdf. Id., at 123.\n\nfoo bar\n\n',
             [id_citation('Id.,', metadata={'pin_cite': 'at 123'})],
             {'clean': ['html', 'inline_whitespace']}),
            # Test weirder Id. citations (#1344)
            ('foo v. bar 1 U.S. 12, 347-348. asdf. Id. ¶ 34. foo bar',
             [case_citation(page='12',
                            metadata={'plaintiff': 'foo',
                                      'defendant': 'bar',
                                      'pin_cite': '347-348'}),
              id_citation('Id.', metadata={'pin_cite': '¶ 34'})]),
            ('foo v. bar 1 U.S. 12, 347-348. asdf. Id. at 62-63, 67-68. f b',
             [case_citation(page='12',
                            metadata={'plaintiff': 'foo',
                                      'defendant': 'bar',
                                      'pin_cite': '347-348'}),
              id_citation('Id.', metadata={'pin_cite': 'at 62-63, 67-68'})]),
            ('foo v. bar 1 U.S. 12, 347-348. asdf. Id., at *10. foo bar',
             [case_citation(page='12',
                            metadata={'plaintiff': 'foo',
                                      'defendant': 'bar',
                                      'pin_cite': '347-348'}),
              id_citation('Id.,', metadata={'pin_cite': 'at *10'})]),
            ('foo v. bar 1 U.S. 12, 347-348. asdf. Id. at 7-9, ¶¶ 38-53. f b',
             [case_citation(page='12',
                            metadata={'plaintiff': 'foo',
                                      'defendant': 'bar',
                                      'pin_cite': '347-348'}),
              id_citation('Id.', metadata={'pin_cite': 'at 7-9, ¶¶ 38-53'})]),
            ('foo v. bar 1 U.S. 12, 347-348. asdf. Id. at pp. 45, 64. foo bar',
             [case_citation(page='12',
                            metadata={'plaintiff': 'foo',
                                      'defendant': 'bar',
                                      'pin_cite': '347-348'}),
              id_citation('Id.', metadata={'pin_cite': 'at pp. 45, 64'})]),
            ('foo v. bar 1 U.S. 12, 347-348. asdf. id. 119:12-14. foo bar',
             [case_citation(page='12',
                            metadata={'plaintiff': 'foo',
                                      'defendant': 'bar',
                                      'pin_cite': '347-348'}),
              id_citation('id.', metadata={'pin_cite': '119:12-14'})]),
            # Test Id. citation without page number
            ('foo v. bar 1 U.S. 12, 347-348. asdf. Id. No page number.',
             [case_citation(page='12',
                            metadata={'plaintiff': 'foo',
                                      'defendant': 'bar',
                                      'pin_cite': '347-348'}),
              id_citation('Id.')]),
            # Id. with parenthetical
            ('Id. (overruling ...) (ignore this)',
             [id_citation("Id.",
                          metadata={'parenthetical': 'overruling ...'})]),
            ('Id. at 2 (overruling ...)',
             [id_citation("Id.",
                          metadata={'pin_cite': 'at 2',
                                    'parenthetical': 'overruling ...'})]),
            # Test unknown citation
            ('lorem ipsum see §99 of the U.S. code.',
             [unknown_citation('§99')]),
            # Test address that's not a citation (#1338)
            ('lorem 111 S.W. 12th St.',
             [],),
            ('lorem 111 N. W. 12th St.',
             [],),
            # Test Conn. Super. Ct. regex variation.
            ('Failed to recognize 1993 Conn. Super. Ct. 5243-P',
             [case_citation(volume='1993', reporter='Conn. Super. Ct.',
                            page='5243-P')]),
            # Test that the tokenizer handles commas after a reporter. In the
            # past, " U. S. " would match but not " U. S., "
            ('foo 1 U.S., 1 bar',
             [case_citation()]),
            # Test reporter with custom regex
            ('blah blah Bankr. L. Rep. (CCH) P12,345. blah blah',
             [case_citation(volume=None, reporter='Bankr. L. Rep.',
                            reporter_found='Bankr. L. Rep. (CCH)',
                            page='12,345')]),
            ('blah blah, 2009 12345 (La.App. 1 Cir. 05/10/10). blah blah',
             [case_citation(volume='2009', reporter='La.App. 1 Cir.',
                            page='12345',
                            groups={'date_filed': '05/10/10'})]),
            # Token scanning edge case -- incomplete paren at end of input
            ('1 U.S. 1 (',
             [case_citation()]),
            # Token scanning edge case -- missing plaintiff name at start of input
            ('v. Bar, 1 U.S. 1',
             [case_citation(metadata={'defendant': 'Bar'})]),
            # Token scanning edge case -- short form start of input
            ('1 U.S., at 1',
             [case_citation(short=True)]),
            (', 1 U.S., at 1',
             [case_citation(short=True)]),
            # Token scanning edge case -- supra at start of input
            ('supra.',
             [supra_citation("supra.")]),
            (', supra.',
             [supra_citation("supra.")]),
            ('123 supra.',
             [supra_citation("supra.", metadata={'volume': "123"})]),
            # Token scanning edge case -- Id. at end of input
            ('Id.',
             [id_citation('Id.,')]),
            ('Id. at 1.',
             [id_citation('Id.,', metadata={'pin_cite': 'at 1'})]),
            ('Id. foo',
             [id_citation('Id.,')]),
            # Reject citations that are part of larger words
            ('foo1 U.S. 1, 1. U.S. 1foo',
             [],),
            # Long pin cite -- make sure no catastrophic backtracking in regex
            ('1 U.S. 1, 2277, 2278, 2279, 2280, 2281, 2282, 2283, 2284, 2286, 2287, 2288, 2289, 2290, 2291',
             [case_citation(metadata={'pin_cite': '2277, 2278, 2279, 2280, 2281, 2282, 2283, 2284, 2286, 2287, 2288, 2289, 2290, 2291'})]),
        )
        # fmt: on
        self.run_test_pairs(test_pairs, "Citation extraction")

    def test_find_law_citations(self):
        """Can we find citations from laws.json?"""
        # Sample law citation shapes kept for reference:
        #   see Ariz. Rev. Stat. Ann. § 36-3701 et seq. (West 2009)
        #   63 Stat. 687 (emphasis added)
        #   18 U. S. C. §§4241-4243
        #   Fla. Stat. § 120.68 (2007)
        # fmt: off
        test_pairs = (
            # Basic test
            ('Mass. Gen. Laws ch. 1, § 2',
             [law_citation('Mass. Gen. Laws ch. 1, § 2',
                           reporter='Mass. Gen. Laws',
                           groups={'chapter': '1', 'section': '2'})]),
            ('1 Stat. 2',
             [law_citation('1 Stat. 2',
                           reporter='Stat.',
                           groups={'volume': '1', 'page': '2'})]),
            # year
            ('Fla. Stat. § 120.68 (2007)',
             [law_citation('Fla. Stat. § 120.68 (2007)',
                           reporter='Fla. Stat.', year=2007,
                           groups={'section': '120.68'})]),
            # et seq, publisher, year
            ('Ariz. Rev. Stat. Ann. § 36-3701 et seq. (West 2009)',
             [law_citation('Ariz. Rev. Stat. Ann. § 36-3701 et seq. (West 2009)',
                           reporter='Ariz. Rev. Stat. Ann.',
                           metadata={'pin_cite': 'et seq.',
                                     'publisher': 'West'},
                           groups={'section': '36-3701'},
                           year=2009)]),
            # multiple sections
            ('Mass. Gen. Laws ch. 1, §§ 2-3',
             [law_citation('Mass. Gen. Laws ch. 1, §§ 2-3',
                           reporter='Mass. Gen. Laws',
                           groups={'chapter': '1', 'section': '2-3'})]),
            # parenthetical
            ('Kan. Stat. Ann. § 21-3516(a)(2) (repealed) (ignore this)',
             [law_citation('Kan. Stat. Ann. § 21-3516(a)(2) (repealed)',
                           reporter='Kan. Stat. Ann.',
                           metadata={'pin_cite': '(a)(2)',
                                     'parenthetical': 'repealed'},
                           groups={'section': '21-3516'})]),
            # Supp. publisher
            ('Ohio Rev. Code Ann. § 5739.02(B)(7) (Lexis Supp. 2010)',
             [law_citation('Ohio Rev. Code Ann. § 5739.02(B)(7) (Lexis Supp. 2010)',
                           reporter='Ohio Rev. Code Ann.',
                           metadata={'pin_cite': '(B)(7)',
                                     'publisher': 'Lexis Supp.'},
                           groups={'section': '5739.02'},
                           year=2010)]),
            # Year range
            ('Wis. Stat. § 655.002(2)(c) (2005-06)',
             [law_citation('Wis. Stat. § 655.002(2)(c) (2005-06)',
                           reporter='Wis. Stat.',
                           metadata={'pin_cite': '(2)(c)'},
                           groups={'section': '655.002'},
                           year=2005)]),
            # 'and' pin cite
            ('Ark. Code Ann. § 23-3-119(a)(2) and (d) (1987)',
             [law_citation('Ark. Code Ann. § 23-3-119(a)(2) and (d) (1987)',
                           reporter='Ark. Code Ann.',
                           metadata={'pin_cite': '(a)(2) and (d)'},
                           groups={'section': '23-3-119'},
                           year=1987)]),
            # Cite to multiple sections
            ('Mass. Gen. Laws ch. 1, §§ 2-3',
             [law_citation('Mass. Gen. Laws ch. 1, §§ 2-3',
                           reporter='Mass. Gen. Laws',
                           groups={'chapter': '1', 'section': '2-3'})]),
        )
        # fmt: on
        self.run_test_pairs(test_pairs, "Law citation extraction")

    def test_find_journal_citations(self):
        """Can we find citations from journals.json?"""
        # fmt: off
        test_pairs = (
            # Basic test
            ('1 Minn. L. Rev. 1',
             [journal_citation()]),
            # Pin cite
            ('1 Minn. L. Rev. 1, 2-3',
             [journal_citation(metadata={'pin_cite': '2-3'})]),
            # Year
            ('1 Minn. L. Rev. 1 (2007)',
             [journal_citation(year=2007)]),
            # Pin cite and year
            ('1 Minn. L. Rev. 1, 2-3 (2007)',
             [journal_citation(metadata={'pin_cite': '2-3'}, year=2007)]),
            # Pin cite and year and parenthetical
            ('1 Minn. L. Rev. 1, 2-3 (2007) (discussing ...) (ignore this)',
             [journal_citation(year=2007,
                               metadata={'pin_cite': '2-3',
                                         'parenthetical': 'discussing ...'})]),
            # Year range
            ('77 Marq. L. Rev. 475 (1993-94)',
             [journal_citation(volume='77', reporter='Marq. L. Rev.',
                               page='475', year=1993)]),
        )
        # fmt: on
        self.run_test_pairs(test_pairs, "Journal citation extraction")

    def test_find_tc_citations(self):
        """Can we parse tax court citations properly?"""
        # fmt: off
        test_pairs = (
            # Test with atypical formatting for Tax Court Memos
            ('the 1 T.C. No. 233',
             [case_citation(page='233', reporter='T.C. No.')]),
            ('word T.C. Memo. 2019-233',
             [case_citation('T.C. Memo. 2019-233',
                            page='233', reporter='T.C. Memo.',
                            volume='2019')]),
            ('something T.C. Summary Opinion 2019-233',
             [case_citation('T.C. Summary Opinion 2019-233',
                            page='233', reporter='T.C. Summary Opinion',
                            volume='2019')]),
            ('T.C. Summary Opinion 2018-133',
             [case_citation('T.C. Summary Opinion 2018-133',
                            page='133', reporter='T.C. Summary Opinion',
                            volume='2018')]),
            ('U.S. 1234 1 U.S. 1',
             [case_citation(volume='1', reporter='U.S.', page='1')]),
        )
        # fmt: on
        self.run_test_pairs(test_pairs, "Tax court citation extraction")

    def test_date_in_editions(self):
        """Check includes_year() on the first entry of several edition lists.

        Each triple is (EDITIONS_LOOKUP value, year, expected result of
        ``value[0].includes_year(year)``).
        """
        test_pairs = [
            (EDITIONS_LOOKUP["S.E."], 1886, False),
            (EDITIONS_LOOKUP["S.E."], 1887, True),
            (EDITIONS_LOOKUP["S.E."], 1940, False),
            (EDITIONS_LOOKUP["S.E.2d"], 1940, True),
            (EDITIONS_LOOKUP["S.E.2d"], 2012, True),
            (EDITIONS_LOOKUP["T.C.M."], 1950, True),
            (EDITIONS_LOOKUP["T.C.M."], 1940, False),
            # A year in the future must not count as included. (The year
            # expression had degraded to a bare "+ 1", i.e. the year 1.)
            (EDITIONS_LOOKUP["T.C.M."], datetime.now().year + 1, False),
        ]
        for edition, year, expected in test_pairs:
            date_in_reporter = edition[0].includes_year(year)
            self.assertEqual(
                date_in_reporter,
                expected,
                msg="is_date_in_reporter(%s, %s) != "
                "%s\nIt's equal to: %s"
                % (edition[0], year, expected, date_in_reporter),
            )

    def test_disambiguate_citations(self):
        """Check reporter resolution when remove_ambiguous=True is passed."""
        # fmt: off
        test_pairs = [
            # 1. P.R.R --> Correct abbreviation for a reporter.
            ('1 P.R.R. 1',
             [case_citation(reporter='P.R.R.')]),
            # 2. U. S. --> A simple variant to resolve.
            ('1 U. S. 1',
             [case_citation(reporter_found='U. S.')]),
            # 3. A.2d --> Not a variant, but needs to be looked up in the
            #    EDITIONS variable.
            ('1 A.2d 1',
             [case_citation(reporter='A.2d')]),
            # 4. A. 2d --> An unambiguous variant of an edition
            ('1 A. 2d 1',
             [case_citation(reporter='A.2d', reporter_found='A. 2d')]),
            # 5. P.R. --> A variant of 'Pen. & W.', 'P.R.R.', or 'P.' that's
            #    resolvable by year
            ('1 P.R. 1 (1831)',
             # Of the three, only Pen & W. was being published this year.
             [case_citation(reporter='Pen. & W.', year=1831,
                            reporter_found='P.R.')]),
            # 5.1: W.2d --> A variant of an edition that either resolves to
            #      'Wis. 2d' or 'Wash. 2d' and is resolvable by year.
            ('1 W.2d 1 (1854)',
             # Of the two, only Wis. 2d was being published this year.
             [case_citation(reporter='Wis. 2d', year=1854,
                            reporter_found='W.2d')]),
            # 5.2: Wash. --> A non-variant that has more than one reporter for
            #      the key, but is resolvable by year
            ('1 Wash. 1 (1890)',
             [case_citation(reporter='Wash.', year=1890)]),
            # 6. Cra. --> A variant of Cranch, which is ambiguous, except
            #    when paired with this variation.
            ('1 Cra. 1',
             [case_citation(reporter='Cranch', reporter_found='Cra.',
                            metadata={'court': 'scotus'})]),
            # 7. Cranch. --> Not a variant, but could refer to either Cranch's
            #    Supreme Court cases or his DC ones. In this case, we cannot
            #    disambiguate. Years are not known, and we have no further
            #    clues. We must simply drop Cranch from the results.
            ('1 Cranch 1 1 U.S. 23',
             [case_citation(page='23')]),
            # 8. Unsolved problem. In theory, we could use parallel citations
            #    to resolve this, because Rob is getting cited next to La., but
            #    we don't currently know the proximity of citations to each
            #    other, so can't use this.
            #  - Rob. --> Either:
            #                8.1: A variant of Robards (1862-1865) or
            #                8.2: Robinson's Louisiana Reports (1841-1846) or
            #                8.3: Robinson's Virginia Reports (1842-1865)
            # ('1 Rob. 1 1 La. 1',
            # [case_citation(volume='1', reporter='Rob.', page='1'),
            #  case_citation(volume='1', reporter='La.', page='1')]),
            # 9. Johnson #1 should pass and identify the citation
            ('1 Johnson 1 (1890)',
             [case_citation(reporter='N.M. (J.)', reporter_found='Johnson',
                            year=1890,
                            )]),
            # 10. Johnson #2 should fail to disambiguate with year alone
            ('1 Johnson 1 (1806)',
             []),
        ]
        # fmt: on
        # all tests in this suite require disambiguation:
        test_pairs = [
            pair + ({"remove_ambiguous": True},) for pair in test_pairs
        ]
        self.run_test_pairs(test_pairs, "Disambiguation")

    def test_custom_tokenizer(self):
        """Check that a Tokenizer built from modified extractors is honored.

        Every extractor in EXTRACTORS is shallow-copied and each escaped
        period in its regex is replaced with ``[.,]``, so '1 U,S, 1' should
        be found with reporter_found='U,S,'.
        """
        extractors = []
        for e in EXTRACTORS:
            e = copy(e)
            e.regex = e.regex.replace(r"\.", r"[.,]")
            # Drop the copied `_compiled_regex` attribute when present;
            # presumably a cached compiled pattern that would otherwise
            # still reflect the unmodified regex -- TODO confirm.
            if hasattr(e, "_compiled_regex"):
                del e._compiled_regex
            extractors.append(e)
        tokenizer = Tokenizer(extractors)

        # fmt: off
        test_pairs = [
            ('1 U,S, 1',
             [case_citation(reporter_found='U,S,')]),
        ]
        # fmt: on
        self.run_test_pairs(
            test_pairs, "Custom tokenizer", tokenizers=[tokenizer]
        )