diff --git a/api/src/main/resources/default-configuration.properties b/api/src/main/resources/default-configuration.properties
index 0991f2324..6282d776c 100644
--- a/api/src/main/resources/default-configuration.properties
+++ b/api/src/main/resources/default-configuration.properties
@@ -72,3 +72,7 @@ any23.extraction.head.meta=on
 # Allows to specify a CSV file separator and comment delimeter
 any23.extraction.csv.field=,
 any23.extraction.csv.comment=#
+
+# Optimize SingleDocumentExtraction extractor matching and mimetype detection
+# by trusting the input DocumentSource content type
+any23.extraction.extractor.mimetype.optimization=on
diff --git a/api/src/test/java/org/apache/any23/configuration/SettingsTest.java b/api/src/test/java/org/apache/any23/configuration/SettingsTest.java
index 80aef91bd..9c96df563 100644
--- a/api/src/test/java/org/apache/any23/configuration/SettingsTest.java
+++ b/api/src/test/java/org/apache/any23/configuration/SettingsTest.java
@@ -34,7 +34,6 @@
 import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;
 
-@SuppressWarnings("ResultOfMethodCallIgnored")
 public class SettingsTest {
 
     @Test
diff --git a/core/src/main/java/org/apache/any23/extractor/SingleDocumentExtraction.java b/core/src/main/java/org/apache/any23/extractor/SingleDocumentExtraction.java
index a8acbeac1..415b21eac 100644
--- a/core/src/main/java/org/apache/any23/extractor/SingleDocumentExtraction.java
+++ b/core/src/main/java/org/apache/any23/extractor/SingleDocumentExtraction.java
@@ -55,7 +55,6 @@
 import java.io.InputStream;
 import java.io.PrintStream;
 import java.net.URISyntaxException;
-import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Collections;
@@ -132,6 +131,9 @@ public SingleDocumentExtraction(
         tripleHandlers.add(new CountingTripleHandler());
         this.output = new CompositeTripleHandler(tripleHandlers);
         this.encoderDetector = new TikaEncodingDetector();
+        if (configuration.getFlagProperty("any23.extraction.extractor.mimetype.optimization") && in.getContentType() != null) {
+            optimizeExtractorMatchingAndMimetypeDetection(in.getContentType());
+        }
     }
 
     /**
@@ -153,6 +155,9 @@ public SingleDocumentExtraction(
                 output
         );
         this.setMIMETypeDetector(null);
+        if (configuration.getFlagProperty("any23.extraction.extractor.mimetype.optimization") && in.getContentType() != null) {
+            optimizeExtractorMatchingAndMimetypeDetection(in.getContentType());
+        }
     }
 
     /**
@@ -174,6 +179,23 @@ public SingleDocumentExtraction(
                 output
         );
         this.setMIMETypeDetector(null);
+        if (configuration.getFlagProperty("any23.extraction.extractor.mimetype.optimization") && in.getContentType() != null) {
+            optimizeExtractorMatchingAndMimetypeDetection(in.getContentType());
+        }
+    }
+
+    /**
+     * Simple utility to attempt extractor matches and mimetype detection given
+     * a {@link DocumentSource#getContentType()}.
+     * Does nothing when {@code contentType} is {@code null}.
+     * @param contentType String content type obtained from {@link DocumentSource#getContentType()}
+     * @see <a href="https://issues.apache.org/jira/browse/ANY23-43">ANY23-43</a>
+     */
+    private void optimizeExtractorMatchingAndMimetypeDetection(String contentType) {
+        if (contentType != null) {
+            detectedMIMEType = MIMEType.parse(contentType);
+            matchingExtractors = extractors.filterByMIMEType(detectedMIMEType);
+        }
     }
 
     /**
@@ -225,7 +247,10 @@ public SingleDocumentExtractionReport run(ExtractionParameters extractionParamet
         if (log.isDebugEnabled()) {
             log.debug("Processing " + this.documentIRI);
         }
-        filterExtractorsByMIMEType();
+        // Detect and match only when the trusted content type has not already resolved both.
+        if (matchingExtractors == null || detectedMIMEType == null) {
+            filterExtractorsByMIMEType();
+        }
 
         if(log.isDebugEnabled()) {
             StringBuilder sb = new StringBuilder("Extractors ");
@@ -252,22 +277,22 @@ public SingleDocumentExtractionReport run(ExtractionParameters extractionParamet
             );
         }
         try {
-			output.setContentLength(in.getContentLength());
-			// Create the document context.
+            output.setContentLength(in.getContentLength());
+            // Create the document context.
             final String documentLanguage;
-			try {
-				documentLanguage = extractDocumentLanguage(extractionParameters);
-				ArrayList<ExtractorFactory<?>> filteredList = new ArrayList<>(matchingExtractors.getNumOfExtractors());
+            try {
+                documentLanguage = extractDocumentLanguage(extractionParameters);
+                ArrayList<ExtractorFactory<?>> filteredList = new ArrayList<>(matchingExtractors.getNumOfExtractors());
                 final boolean mimeTypeIsTooGeneric = isTooGeneric(detectedMIMEType);
                 ArrayList intersectionOfRdfMimetypes = null;
                 for (ExtractorFactory factory : matchingExtractors) {
-					final Extractor extractor = factory.createExtractor();
-					final SingleExtractionReport er = runExtractor(
-							extractionParameters,
-							documentLanguage,
-							extractor
-					);
-					// Fix for ANY23-415:
+                    final Extractor extractor = factory.createExtractor();
+                    final SingleExtractionReport er = runExtractor(
+                            extractionParameters,
+                            documentLanguage,
+                            extractor
+                    );
+                    // Fix for ANY23-415:
                     if (mimeTypeIsTooGeneric) {
                         List rdfMimetypes = factory.getSupportedMIMETypes().stream()
                                 .filter(mt -> !isTooGeneric(mt))
diff --git a/core/src/main/java/org/apache/any23/source/FileDocumentSource.java b/core/src/main/java/org/apache/any23/source/FileDocumentSource.java
index 34fbfa2f3..909abef72 100644
--- a/core/src/main/java/org/apache/any23/source/FileDocumentSource.java
+++ b/core/src/main/java/org/apache/any23/source/FileDocumentSource.java
@@ -34,16 +34,26 @@ public class FileDocumentSource implements DocumentSource {
 
     private final String uri;
 
+    private final String contentType;
+
     public FileDocumentSource(File file) {
         this.file = file;
         this.uri = file.toURI().toString();
+        this.contentType = null;
     }
 
     public FileDocumentSource(File file, String baseIRI) {
         this.file = file;
         this.uri = baseIRI;
+        this.contentType = null;
     }
 
+    public FileDocumentSource(File file, String baseIRI, String contentType) {
+        this.file = file;
+        this.uri = baseIRI;
+        this.contentType = contentType;
+    }
+
     public InputStream openInputStream() throws IOException {
         return new BufferedInputStream( new FileInputStream(file) );
     }
@@ -57,7 +67,7 @@ public String getDocumentIRI() {
     }
 
     public String getContentType() {
-        return null;
+        return contentType;
     }
 
     public boolean isLocal() {
diff --git a/core/src/test/java/org/apache/any23/extractor/SingleDocumentExtractionTest.java b/core/src/test/java/org/apache/any23/extractor/SingleDocumentExtractionTest.java
index a22f7dbbb..56326e915 100644
--- a/core/src/test/java/org/apache/any23/extractor/SingleDocumentExtractionTest.java
+++ b/core/src/test/java/org/apache/any23/extractor/SingleDocumentExtractionTest.java
@@ -116,7 +116,24 @@ public void tearDown() throws SailException, RepositoryException, TripleHandlerE
      */
     @Test
     public void testMicroformatDomains() throws IOException, ExtractionException, RepositoryException {
-        singleDocumentExtraction = getInstance("/microformats/microformat-domains.html");
+        singleDocumentExtraction = getInstance("/microformats/microformat-domains.html", false);
+        singleDocumentExtraction.run();
+        logStorageContent();
+        assertTripleCount(vSINDICE.getProperty(SINDICE.DOMAIN), "nested.test.com", 1);
+    }
+
+    /**
+     * Tests the existence of the domain triples using the SingleDocumentExtraction
+     * extractor matching and mimetype detection optimization implemented in
+     * ANY23-43
+     *
+     * @throws IOException if there is an error loading input data
+     * @throws ExtractionException if an exception is raised during extraction
+     * @throws RepositoryException if an error is encountered whilst loading content from a storage connection
+     */
+    @Test
+    public void testMicroformatDomainsAny2343Optimization() throws IOException, ExtractionException, RepositoryException {
+        singleDocumentExtraction = getInstance("/microformats/microformat-domains.html", true);
         singleDocumentExtraction.run();
         logStorageContent();
         assertTripleCount(vSINDICE.getProperty(SINDICE.DOMAIN), "nested.test.com", 1);
@@ -137,7 +154,7 @@ public void testMicroformatDomains() throws IOException, ExtractionException, Re
      */
     @Test
     public void testNestedMicroformats() throws IOException, ExtractionException, RepositoryException {
-        singleDocumentExtraction = getInstance("/microformats/nested-microformats-a1.html");
+        singleDocumentExtraction = getInstance("/microformats/nested-microformats-a1.html", false);
         singleDocumentExtraction.run();
         logStorageContent();
 
@@ -160,7 +177,7 @@ public void testNestedMicroformats() throws IOException, ExtractionException, Re
      */
     @Test
     public void testNestedVCardAdr() throws IOException, ExtractionException, RepositoryException {
-        singleDocumentExtraction = getInstance("/microformats/nested-microformats-a3.html");
+        singleDocumentExtraction = getInstance("/microformats/nested-microformats-a3.html", false);
         singleDocumentExtraction.run();
         logStorageContent();
 
@@ -187,7 +204,7 @@ public void testNestedVCardAdr() throws IOException, ExtractionException, Reposi
      */
     @Test
     public void testNestedMicroformatsInduced() throws IOException, ExtractionException, RepositoryException {
-        singleDocumentExtraction = getInstance("/microformats/nested-microformats-a2.html");
+        singleDocumentExtraction = getInstance("/microformats/nested-microformats-a2.html", false);
         singleDocumentExtraction.run();
         logStorageContent();
 
@@ -214,7 +231,7 @@ public void testNestedMicroformatsInduced() throws IOException, ExtractionExcept
      * show the triple property as double. Despite this the model contains it just once.
      */
     public void testNestedMicroformatsManaged() throws IOException, ExtractionException, RepositoryException {
-        singleDocumentExtraction = getInstance("/microformats/nested-microformats-managed.html");
+        singleDocumentExtraction = getInstance("/microformats/nested-microformats-managed.html", false);
         singleDocumentExtraction.run();
         logStorageContent();
 
@@ -229,7 +246,7 @@ public void testNestedMicroformatsManaged() throws IOException, ExtractionExcept
         assertTripleCount(vSINDICE.getProperty(SINDICE.NESTING_ORIGINAL) , vREVIEW.hasReview, 1);
     }
 
-    private SingleDocumentExtraction getInstance(String file) throws FileNotFoundException, IOException {
+    private SingleDocumentExtraction getInstance(String file, boolean optimizeMimeTypeAndExtractorSelection) throws FileNotFoundException, IOException {
         baos = new ByteArrayOutputStream();
         rdfxmlWriter = new RDFXMLWriter(baos);
         repositoryWriter = new RepositoryWriter(conn);
@@ -238,18 +255,19 @@ private SingleDocumentExtraction getInstance(String file) throws FileNotFoundExc
         cth.addChild(rdfxmlWriter);
         cth.addChild(repositoryWriter);
 
-		final ModifiableConfiguration configuration = DefaultConfiguration.copy();
-		configuration.setProperty("any23.extraction.metadata.domain.per.entity", "on");
-		SingleDocumentExtraction instance = new SingleDocumentExtraction(
-				configuration,
-				new HTMLFixture(copyResourceToTempFile(file)).getOpener("http://nested.test.com"),
-				extractorGroup,
-				cth
-		);
-		instance.setMIMETypeDetector( new TikaMIMETypeDetector(new WhiteSpacesPurifier()) );
-		return instance;
-	}
-
+        final ModifiableConfiguration configuration = DefaultConfiguration.copy();
+        configuration.setProperty("any23.extraction.metadata.domain.per.entity", "on");
+        SingleDocumentExtraction instance = new SingleDocumentExtraction(
+                configuration,
+                new HTMLFixture(copyResourceToTempFile(file))
+                        .getOpener("http://nested.test.com", optimizeMimeTypeAndExtractorSelection),
+                extractorGroup,
+                cth
+        );
+        instance.setMIMETypeDetector(new TikaMIMETypeDetector(new WhiteSpacesPurifier()));
+        return instance;
+    }
+
     /**
      * Logs the storage content.
      * @throws RepositoryException if an error is encountered whilst loading content from a storage connection
diff --git a/core/src/test/java/org/apache/any23/extractor/html/AbstractExtractorTestCase.java b/core/src/test/java/org/apache/any23/extractor/html/AbstractExtractorTestCase.java
index 9241041d5..67e2b57e9 100644
--- a/core/src/test/java/org/apache/any23/extractor/html/AbstractExtractorTestCase.java
+++ b/core/src/test/java/org/apache/any23/extractor/html/AbstractExtractorTestCase.java
@@ -189,7 +189,7 @@ protected void extract(String resource) throws ExtractionException,
             IOException {
         SingleDocumentExtraction ex = new SingleDocumentExtraction(
                 new HTMLFixture(copyResourceToTempFile(resource)).getOpener(baseIRI
-                        .toString()), getExtractorFactory(),
+                        .toString(), false), getExtractorFactory(),
                 new RepositoryWriter(conn));
         ex.setMIMETypeDetector(null);
         report = ex.run();
diff --git a/core/src/test/java/org/apache/any23/extractor/html/HTMLFixture.java b/core/src/test/java/org/apache/any23/extractor/html/HTMLFixture.java
index 69ef6c78e..533a0710f 100644
--- a/core/src/test/java/org/apache/any23/extractor/html/HTMLFixture.java
+++ b/core/src/test/java/org/apache/any23/extractor/html/HTMLFixture.java
@@ -27,7 +27,7 @@
 import java.io.IOException;
 
 /**
- * This class is a wrapper around an HTML document providing a simply facade.
+ * This class is a wrapper around an HTML document providing a simple facade.
  */
 public class HTMLFixture {
 
@@ -43,8 +43,21 @@ private File getFile() {
         return file;
     }
 
-    public DocumentSource getOpener(String baseIRI) {
-        return new FileDocumentSource(getFile(), baseIRI);
+    /**
+     * Creates a {@link DocumentSource} for the wrapped HTML file.
+     * @param baseIRI the base IRI to use for the DocumentSource
+     * @param optimizeMimeTypeAndExtractorSelection if you wish to optimize
+     * SingleDocumentExtraction extractor matching and mimetype detection by
+     * trusting the input DocumentSource content type. See
+     * https://issues.apache.org/jira/projects/ANY23/issues/ANY23-43
+     * @return the document source which is actually a {@link FileDocumentSource}
+     */
+    public DocumentSource getOpener(String baseIRI, boolean optimizeMimeTypeAndExtractorSelection) {
+        if (optimizeMimeTypeAndExtractorSelection) {
+            return new FileDocumentSource(getFile(), baseIRI, "text/html");
+        } else {
+            return new FileDocumentSource(getFile(), baseIRI);
+        }
     }
 
     /**