diff --git a/docs/acl-anthology.md b/docs/acl-anthology.md index 11ad0e4692..6423b649bb 100644 --- a/docs/acl-anthology.md +++ b/docs/acl-anthology.md @@ -30,6 +30,13 @@ mkdir -p build/data Generate cleaned YAML data: +1. Add the following lines to `bin/create_hugo_yaml.py` before function `export_anthology` +```python +# Prevent yaml from creating aliases which can't be parsed by anserini +Dumper.ignore_aliases = lambda self, data: True +``` + +2. Execute the following script: ```bash python bin/create_hugo_yaml.py ``` diff --git a/src/main/java/io/anserini/collection/AclAnthology.java b/src/main/java/io/anserini/collection/AclAnthology.java index 3e58c6003a..e3b7631759 100644 --- a/src/main/java/io/anserini/collection/AclAnthology.java +++ b/src/main/java/io/anserini/collection/AclAnthology.java @@ -22,6 +22,7 @@ import com.fasterxml.jackson.databind.node.ArrayNode; import com.fasterxml.jackson.databind.node.ObjectNode; import com.fasterxml.jackson.dataformat.yaml.YAMLFactory; +import org.yaml.snakeyaml.LoaderOptions; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -53,11 +54,17 @@ public AclAnthology(Path path) { this.path = Paths.get(path.toString(), "/papers"); // Path containing files to iterate this.allowedFileSuffix = Set.of(".yaml"); - ObjectMapper mapper = new ObjectMapper(new YAMLFactory()); + LoaderOptions loaderOptions = new LoaderOptions(); + loaderOptions.setCodePointLimit(10 * 1024 * 1024); // 10 MB + YAMLFactory yamlFactory = YAMLFactory.builder() + .loaderOptions(loaderOptions) + .build(); + + ObjectMapper mapper = new ObjectMapper(yamlFactory); try { this.volumes = mapper.readValue(new File(path.toString(), "/volumes.yaml"), JsonNode.class); } catch (IOException e) { - LOG.error("Unable to open volumes.yaml"); + LOG.error(e); return; } } @@ -86,7 +93,13 @@ public Segment(Path path) throws IOException { // read YAML file into JsonNode format bufferedReader = new BufferedReader(new FileReader(path.toString())); - ObjectMapper mapper = new ObjectMapper(new YAMLFactory()); + LoaderOptions loaderOptions = new LoaderOptions(); + loaderOptions.setCodePointLimit(10 * 1024 * 1024); // 10 MB + YAMLFactory yamlFactory = YAMLFactory.builder() + .loaderOptions(loaderOptions) + .build(); + + ObjectMapper mapper = new ObjectMapper(yamlFactory); MappingIterator iterator = mapper.readerFor(JsonNode.class).readValues(bufferedReader); if (iterator.hasNext()) { @@ -155,7 +168,7 @@ public Document(Map.Entry jsonEntry) { // Process venue facets venues = new ArrayList<>(); - ArrayNode venuesNode = (ArrayNode) volume.get("venues"); + ArrayNode venuesNode = (ArrayNode) paper.get("venue"); venuesNode.elements().forEachRemaining(node -> venues.add(node.asText())); // Process SIG facets diff --git a/src/main/java/io/anserini/index/generator/AclAnthologyGenerator.java b/src/main/java/io/anserini/index/generator/AclAnthologyGenerator.java index 3b1c5f66fd..03aa473510 100644 --- a/src/main/java/io/anserini/index/generator/AclAnthologyGenerator.java +++ b/src/main/java/io/anserini/index/generator/AclAnthologyGenerator.java @@ -76,13 +76,13 @@ private enum AclAnthologyField { AclAnthologyField.THUMBNAIL.name); public static final List NUMERIC_FIELD_NAMES = List.of( - AclAnthologyField.YEAR.name, - AclAnthologyField.PAGE_FIRST.name, - AclAnthologyField.PAGE_LAST.name); + AclAnthologyField.YEAR.name); public static final List FIELDS_WITHOUT_STEMMING = List.of( AclAnthologyField.AUTHOR_STRING.name, AclAnthologyField.PUBLISHER.name, + AclAnthologyField.PAGE_FIRST.name, + AclAnthologyField.PAGE_LAST.name, AclAnthologyField.MONTH.name); public AclAnthologyGenerator(IndexCollection.Args args) { diff --git a/src/test/java/io/anserini/collection/AclAnthologyTest.java b/src/test/java/io/anserini/collection/AclAnthologyTest.java index b78749252b..0de4dd2b37 100644 --- a/src/test/java/io/anserini/collection/AclAnthologyTest.java +++ b/src/test/java/io/anserini/collection/AclAnthologyTest.java @@ -52,13 +52,13 @@ public void setUp() throws Exception { doc1.put("booktitle", "COLING 2000 Volume 1: The 18th International Conference on Computational Linguistics"); doc1.put("paper_id", "3"); doc1.put("parent_volume_id", "C00-1"); - doc1.put("pdf", "https://www.aclweb.org/anthology/C00-1003.pdf"); - doc1.put("thumbnail", "https://www.aclweb.org/anthology/thumb/C00-1003.jpg"); + doc1.put("pdf", "https://aclanthology.org/C00-1003.pdf"); + doc1.put("thumbnail", "https://aclanthology.org/thumb/C00-1003.jpg"); doc1.put("title", "Selectional Restrictions in HPSG"); - doc1.put("url", "https://www.aclweb.org/anthology/C00-1003"); + doc1.put("url", "https://aclanthology.org/C00-1003"); doc1.put("contents", "Selectional Restrictions in HPSG "); doc1.put("sigs", ""); - doc1.put("venues", "COLING"); + doc1.put("venues", "coling"); expected.put("C00-1003", doc1); HashMap doc2 = new HashMap<>(); @@ -67,7 +67,7 @@ public void setUp() throws Exception { doc2.put("title", "Exploiting a Probabilistic Hierarchical Model for Generation"); doc2.put("contents", "Exploiting a Probabilistic Hierarchical Model for Generation "); doc2.put("sigs", ""); - doc2.put("venues", "COLING"); + doc2.put("venues", "coling"); expected.put("C00-1007", doc2); HashMap doc3 = new HashMap<>(); @@ -85,11 +85,11 @@ public void setUp() throws Exception { doc3.put("page_last", "34"); doc3.put("paper_id", "3"); doc3.put("parent_volume_id", "E17-1"); - doc3.put("pdf", "https://www.aclweb.org/anthology/E17-1003.pdf"); + doc3.put("pdf", "https://aclanthology.org/E17-1003.pdf"); doc3.put("publisher", "Association for Computational Linguistics"); - doc3.put("thumbnail", "https://www.aclweb.org/anthology/thumb/E17-1003.jpg"); + doc3.put("thumbnail", "https://aclanthology.org/thumb/E17-1003.jpg"); doc3.put("title", "Exploring Different Dimensions of Attention for Uncertainty Detection"); - doc3.put("url", "https://www.aclweb.org/anthology/E17-1003"); + doc3.put("url", "https://aclanthology.org/E17-1003"); doc3.put("contents", "Exploring Different Dimensions of Attention for Uncertainty Detection " + "Neural networks with attention have proven effective for many natural " + "language processing tasks. In this paper, we develop attention mechanisms for " + @@ -102,7 +102,7 @@ public void setUp() throws Exception { "perform similar to the state-of-the-art model on a biomedical benchmark which " + "uses a large set of linguistic features."); doc3.put("sigs", ""); - doc3.put("venues", "EACL"); + doc3.put("venues", "eacl"); expected.put("E17-1003", doc3); } diff --git a/src/test/java/io/anserini/integration/AclAnthologyEndToEndTest.java b/src/test/java/io/anserini/integration/AclAnthologyEndToEndTest.java index 49c0af9e57..70f6edf6f2 100644 --- a/src/test/java/io/anserini/integration/AclAnthologyEndToEndTest.java +++ b/src/test/java/io/anserini/integration/AclAnthologyEndToEndTest.java @@ -71,12 +71,12 @@ protected void setCheckIndexGroundTruth() { "raw", "Exploiting a Probabilistic Hierarchical Model for Generation ")); - fieldNormStatusTotalFields = 13; - termIndexStatusTermCount = 241; - termIndexStatusTotFreq = 288; + fieldNormStatusTotalFields = 21; + termIndexStatusTermCount = 330; + termIndexStatusTotFreq = 411; storedFieldStatusTotalDocCounts = 3; - termIndexStatusTotPos = 339; - storedFieldStatusTotFields = 67; + termIndexStatusTotPos = 470; + storedFieldStatusTotFields = 83; } @Override diff --git a/src/test/resources/sample_docs/acl/papers/segment1.yaml b/src/test/resources/sample_docs/acl/papers/segment1.yaml index d66430cd3b..066d28a8a3 100644 --- a/src/test/resources/sample_docs/acl/papers/segment1.yaml +++ b/src/test/resources/sample_docs/acl/papers/segment1.yaml @@ -15,13 +15,22 @@ C00-1003: Linguistics' booktitle_html: 'COLING 2000 Volume 1: The 18th International Conference on Computational Linguistics' + citation: '[Selectional Restrictions in HPSG](https://aclanthology.org/C00-1003) + (Androutsopoulos & Dale, COLING 2000)' + citation_acl: 'Ion Androutsopoulos and Robert Dale. 2000. Selectional + Restrictions in HPSG. In COLING 2000 Volume 1: The 18th International Conference + on Computational Linguistics.' + events: [] + language: null paper_id: '3' parent_volume_id: C00-1 - pdf: https://www.aclweb.org/anthology/C00-1003.pdf - thumbnail: https://www.aclweb.org/anthology/thumb/C00-1003.jpg + pdf: https://aclanthology.org/C00-1003.pdf + thumbnail: https://aclanthology.org/thumb/C00-1003.jpg title: Selectional Restrictions in HPSG title_html: Selectional Restrictions in HPSG - url: https://www.aclweb.org/anthology/C00-1003 + url: https://aclanthology.org/C00-1003 + venue: + - coling year: '2000' C00-1007: author: @@ -40,11 +49,20 @@ C00-1007: Linguistics' booktitle_html: 'COLING 2000 Volume 1: The 18th International Conference on Computational Linguistics' + citation: '[Exploiting a Probabilistic Hierarchical Model for Generation](https://aclanthology.org/C00-1007) + (Bangalore & Rambow, COLING 2000)' + citation_acl: 'Srinivas Bangalore and Owen Rambow. 2000. Exploiting + a Probabilistic Hierarchical Model for Generation. In COLING 2000 Volume + 1: The 18th International Conference on Computational Linguistics.' + events: [] + language: null paper_id: '7' parent_volume_id: C00-1 - pdf: https://www.aclweb.org/anthology/C00-1007.pdf - thumbnail: https://www.aclweb.org/anthology/thumb/C00-1007.jpg + pdf: https://aclanthology.org/C00-1007.pdf + thumbnail: https://aclanthology.org/thumb/C00-1007.jpg title: Exploiting a Probabilistic Hierarchical Model for Generation title_html: Exploiting a Probabilistic Hierarchical Model for Generation - url: https://www.aclweb.org/anthology/C00-1007 + url: https://aclanthology.org/C00-1007 + venue: + - coling year: '2000' diff --git a/src/test/resources/sample_docs/acl/papers/segment2.yaml b/src/test/resources/sample_docs/acl/papers/segment2.yaml index ea191b69f6..83e3ef5506 100644 --- a/src/test/resources/sample_docs/acl/papers/segment2.yaml +++ b/src/test/resources/sample_docs/acl/papers/segment2.yaml @@ -26,16 +26,30 @@ E17-1003: for Computational Linguistics: Volume 1, Long Papers' booktitle_html: 'Proceedings of the 15th Conference of the European Chapter of the Association for Computational Linguistics: Volume 1, Long Papers' + citation: "[Exploring Different Dimensions of Attention for Uncertainty Detection](https://aclanthology.org/E17-1003) + (Adel & Sch\xFCtze, EACL 2017)" + citation_acl: "Heike Adel and Hinrich Sch\xFCtze. 2017. Exploring + Different Dimensions of Attention for Uncertainty Detection. In Proceedings + of the 15th Conference of the European Chapter of the Association for Computational + Linguistics: Volume 1, Long Papers, pages 22\u201334, Valencia, Spain. Association + for Computational Linguistics." + events: [] + language: null month: April page_first: '22' page_last: '34' pages: "22\u201334" paper_id: '3' parent_volume_id: E17-1 - pdf: https://www.aclweb.org/anthology/E17-1003.pdf + pdf: https://aclanthology.org/E17-1003.pdf publisher: Association for Computational Linguistics - thumbnail: https://www.aclweb.org/anthology/thumb/E17-1003.jpg + pwcdataset: + - name: SST + url: https://paperswithcode.com/dataset/sst + thumbnail: https://aclanthology.org/thumb/E17-1003.jpg title: Exploring Different Dimensions of Attention for Uncertainty Detection title_html: Exploring Different Dimensions of Attention for Uncertainty Detection - url: https://www.aclweb.org/anthology/E17-1003 + url: https://aclanthology.org/E17-1003 + venue: + - eacl year: '2017' diff --git a/src/test/resources/sample_docs/acl/volumes.yaml b/src/test/resources/sample_docs/acl/volumes.yaml index ec48042332..2cc5f56657 100644 --- a/src/test/resources/sample_docs/acl/volumes.yaml +++ b/src/test/resources/sample_docs/acl/volumes.yaml @@ -1,4 +1,5 @@ C00-1: + events: [] has_abstracts: false meta_date: '2000' papers: @@ -9,8 +10,11 @@ C00-1: Linguistics' title_html: 'COLING 2000 Volume 1: The 18th International Conference on Computational Linguistics' + url: https://aclanthology.org/C00-1 + venue: + - coling venues: - - COLING + - coling year: '2000' E17-1: address: Valencia, Spain @@ -27,19 +31,22 @@ E17-1: full: Alexander Koller id: alexander-koller last: Koller + events: [] has_abstracts: true meta_date: 2017/4 month: April papers: - - E17-1003 - pdf: https://www.aclweb.org/anthology/E17-1.pdf + - E17-1000 + pdf: https://aclanthology.org/E17-1.pdf publisher: Association for Computational Linguistics sigs: [] title: 'Proceedings of the 15th Conference of the European Chapter of the Association for Computational Linguistics: Volume 1, Long Papers' title_html: 'Proceedings of the 15th Conference of the European Chapter of the Association for Computational Linguistics: Volume 1, Long Papers' - url: https://www.aclweb.org/anthology/E17-1 + url: https://aclanthology.org/E17-1 + venue: + - eacl venues: - - EACL + - eacl year: '2017'