diff --git a/api/src/main/resources/default-configuration.properties b/api/src/main/resources/default-configuration.properties
index 0991f2324..6282d776c 100644
--- a/api/src/main/resources/default-configuration.properties
+++ b/api/src/main/resources/default-configuration.properties
@@ -72,3 +72,7 @@ any23.extraction.head.meta=on
# Allows to specify a CSV file separator and comment delimeter
any23.extraction.csv.field=,
any23.extraction.csv.comment=#
+
+# When "on", SingleDocumentExtraction trusts a non-null content type reported
+# by the input DocumentSource: it is parsed as the detected MIME type and used
+# to pre-select the matching extractors.
+any23.extraction.extractor.mimetype.optimization=on
\ No newline at end of file
diff --git a/api/src/test/java/org/apache/any23/configuration/SettingsTest.java b/api/src/test/java/org/apache/any23/configuration/SettingsTest.java
index 80aef91bd..9c96df563 100644
--- a/api/src/test/java/org/apache/any23/configuration/SettingsTest.java
+++ b/api/src/test/java/org/apache/any23/configuration/SettingsTest.java
@@ -34,7 +34,6 @@
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
-@SuppressWarnings("ResultOfMethodCallIgnored")
public class SettingsTest {
@Test
diff --git a/core/src/main/java/org/apache/any23/extractor/SingleDocumentExtraction.java b/core/src/main/java/org/apache/any23/extractor/SingleDocumentExtraction.java
index a8acbeac1..415b21eac 100644
--- a/core/src/main/java/org/apache/any23/extractor/SingleDocumentExtraction.java
+++ b/core/src/main/java/org/apache/any23/extractor/SingleDocumentExtraction.java
@@ -55,7 +55,6 @@
import java.io.InputStream;
import java.io.PrintStream;
import java.net.URISyntaxException;
-import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
@@ -132,6 +131,8 @@ public SingleDocumentExtraction(
tripleHandlers.add(new CountingTripleHandler());
this.output = new CompositeTripleHandler(tripleHandlers);
this.encoderDetector = new TikaEncodingDetector();
+ if (configuration.getFlagProperty("any23.extraction.extractor.mimetype.optimization") && in.getContentType() != null)
+ optimizeExtractorMatchingAndMimetypeDetection(in.getContentType());
}
/**
@@ -153,6 +154,8 @@ public SingleDocumentExtraction(
output
);
this.setMIMETypeDetector(null);
+ if (configuration.getFlagProperty("any23.extraction.extractor.mimetype.optimization") && in.getContentType() != null)
+ optimizeExtractorMatchingAndMimetypeDetection(in.getContentType());
}
/**
@@ -174,6 +177,20 @@ public SingleDocumentExtraction(
output
);
this.setMIMETypeDetector(null);
+ if (configuration.getFlagProperty("any23.extraction.extractor.mimetype.optimization") && in.getContentType() != null)
+ optimizeExtractorMatchingAndMimetypeDetection(in.getContentType());
+ }
+
+ /**
+ * Pre-populates the detected MIME type and the set of matching extractors
+ * from the content type declared by the {@link DocumentSource}.
+ * Does nothing when {@code contentType} is {@code null}, so that neither
+ * field is derived from a missing content type.
+ *
+ * @param contentType String content type obtained from {@link DocumentSource#getContentType()}
+ * @see <a href="https://issues.apache.org/jira/browse/ANY23-43">ANY23-43</a>
+ */
+ private void optimizeExtractorMatchingAndMimetypeDetection(String contentType) {
+ // Guard both assignments: previously only the parse was conditional and
+ // the extractor filtering ran even without a usable MIME type.
+ if (contentType == null) {
+ return;
+ }
+ detectedMIMEType = MIMEType.parse(contentType);
+ matchingExtractors = extractors.filterByMIMEType(detectedMIMEType);
}
/**
@@ -225,7 +242,9 @@ public SingleDocumentExtractionReport run(ExtractionParameters extractionParamet
if (log.isDebugEnabled()) {
log.debug("Processing " + this.documentIRI);
}
- filterExtractorsByMIMEType();
+ // Run MIME detection / extractor filtering unless the constructor already
+ // derived both values from the DocumentSource content type. The previous
+ // condition was inverted and skipped filtering exactly when
+ // matchingExtractors was still null, which is dereferenced further below.
+ if (matchingExtractors == null || detectedMIMEType == null) {
+ filterExtractorsByMIMEType();
+ }
if(log.isDebugEnabled()) {
StringBuilder sb = new StringBuilder("Extractors ");
@@ -252,22 +271,22 @@ public SingleDocumentExtractionReport run(ExtractionParameters extractionParamet
);
}
try {
- output.setContentLength(in.getContentLength());
- // Create the document context.
+ output.setContentLength(in.getContentLength());
+ // Create the document context.
final String documentLanguage;
- try {
- documentLanguage = extractDocumentLanguage(extractionParameters);
- ArrayList> filteredList = new ArrayList<>(matchingExtractors.getNumOfExtractors());
+ try {
+ documentLanguage = extractDocumentLanguage(extractionParameters);
+ ArrayList> filteredList = new ArrayList<>(matchingExtractors.getNumOfExtractors());
final boolean mimeTypeIsTooGeneric = isTooGeneric(detectedMIMEType);
ArrayList intersectionOfRdfMimetypes = null;
for (ExtractorFactory> factory : matchingExtractors) {
- final Extractor> extractor = factory.createExtractor();
- final SingleExtractionReport er = runExtractor(
- extractionParameters,
- documentLanguage,
- extractor
- );
- // Fix for ANY23-415:
+ final Extractor> extractor = factory.createExtractor();
+ final SingleExtractionReport er = runExtractor(
+ extractionParameters,
+ documentLanguage,
+ extractor
+ );
+ // Fix for ANY23-415:
if (mimeTypeIsTooGeneric) {
List rdfMimetypes = factory.getSupportedMIMETypes().stream()
.filter(mt -> !isTooGeneric(mt))
diff --git a/core/src/main/java/org/apache/any23/source/FileDocumentSource.java b/core/src/main/java/org/apache/any23/source/FileDocumentSource.java
index 34fbfa2f3..909abef72 100644
--- a/core/src/main/java/org/apache/any23/source/FileDocumentSource.java
+++ b/core/src/main/java/org/apache/any23/source/FileDocumentSource.java
@@ -34,16 +34,26 @@ public class FileDocumentSource implements DocumentSource {
private final String uri;
+ private final String contentType;
+
public FileDocumentSource(File file) {
this.file = file;
this.uri = file.toURI().toString();
+ this.contentType = null;
}
public FileDocumentSource(File file, String baseIRI) {
this.file = file;
this.uri = baseIRI;
+ this.contentType = null;
}
+ /**
+ * Creates a document source backed by a local file, with an explicit base
+ * IRI and a caller-declared content type.
+ *
+ * @param file the file opened by {@link #openInputStream()}
+ * @param baseIRI the IRI stored as this source's URI
+ * @param contentType the value returned by {@link #getContentType()};
+ * the other constructors leave it {@code null}
+ */
+ public FileDocumentSource(File file, String baseIRI, String contentType) {
+ this.file = file;
+ this.uri = baseIRI;
+ this.contentType = contentType;
+ }
+
public InputStream openInputStream() throws IOException {
return new BufferedInputStream( new FileInputStream(file) );
}
@@ -57,7 +67,7 @@ public String getDocumentIRI() {
}
public String getContentType() {
- return null;
+ return contentType;
}
public boolean isLocal() {
diff --git a/core/src/test/java/org/apache/any23/extractor/SingleDocumentExtractionTest.java b/core/src/test/java/org/apache/any23/extractor/SingleDocumentExtractionTest.java
index a22f7dbbb..56326e915 100644
--- a/core/src/test/java/org/apache/any23/extractor/SingleDocumentExtractionTest.java
+++ b/core/src/test/java/org/apache/any23/extractor/SingleDocumentExtractionTest.java
@@ -116,7 +116,24 @@ public void tearDown() throws SailException, RepositoryException, TripleHandlerE
*/
@Test
public void testMicroformatDomains() throws IOException, ExtractionException, RepositoryException {
- singleDocumentExtraction = getInstance("/microformats/microformat-domains.html");
+ singleDocumentExtraction = getInstance("/microformats/microformat-domains.html", false);
+ singleDocumentExtraction.run();
+ logStorageContent();
+ assertTripleCount(vSINDICE.getProperty(SINDICE.DOMAIN), "nested.test.com", 1);
+ }
+
+ /**
+ * Tests the existence of the domain triples using the SingleDocumentExtraction
+ * extractor matching and mimetype detection optimization implemented in
+ * ANY23-43
+ *
+ * @throws IOException if there is an error loading input data
+ * @throws ExtractionException if an exception is raised during extraction
+ * @throws RepositoryException if an error is encountered whilst loading content from a storage connection
+ */
+ @Test
+ public void testMicroformatDomainsAny2343Optimization() throws IOException, ExtractionException, RepositoryException {
+ singleDocumentExtraction = getInstance("/microformats/microformat-domains.html", true);
singleDocumentExtraction.run();
logStorageContent();
assertTripleCount(vSINDICE.getProperty(SINDICE.DOMAIN), "nested.test.com", 1);
@@ -137,7 +154,7 @@ public void testMicroformatDomains() throws IOException, ExtractionException, Re
*/
@Test
public void testNestedMicroformats() throws IOException, ExtractionException, RepositoryException {
- singleDocumentExtraction = getInstance("/microformats/nested-microformats-a1.html");
+ singleDocumentExtraction = getInstance("/microformats/nested-microformats-a1.html", false);
singleDocumentExtraction.run();
logStorageContent();
@@ -160,7 +177,7 @@ public void testNestedMicroformats() throws IOException, ExtractionException, Re
*/
@Test
public void testNestedVCardAdr() throws IOException, ExtractionException, RepositoryException {
- singleDocumentExtraction = getInstance("/microformats/nested-microformats-a3.html");
+ singleDocumentExtraction = getInstance("/microformats/nested-microformats-a3.html", false);
singleDocumentExtraction.run();
logStorageContent();
@@ -187,7 +204,7 @@ public void testNestedVCardAdr() throws IOException, ExtractionException, Reposi
*/
@Test
public void testNestedMicroformatsInduced() throws IOException, ExtractionException, RepositoryException {
- singleDocumentExtraction = getInstance("/microformats/nested-microformats-a2.html");
+ singleDocumentExtraction = getInstance("/microformats/nested-microformats-a2.html", false);
singleDocumentExtraction.run();
logStorageContent();
@@ -214,7 +231,7 @@ public void testNestedMicroformatsInduced() throws IOException, ExtractionExcept
* show the triple property as double. Despite this the model contains it just once.
*/
public void testNestedMicroformatsManaged() throws IOException, ExtractionException, RepositoryException {
- singleDocumentExtraction = getInstance("/microformats/nested-microformats-managed.html");
+ singleDocumentExtraction = getInstance("/microformats/nested-microformats-managed.html", false);
singleDocumentExtraction.run();
logStorageContent();
@@ -229,7 +246,7 @@ public void testNestedMicroformatsManaged() throws IOException, ExtractionExcept
assertTripleCount(vSINDICE.getProperty(SINDICE.NESTING_ORIGINAL) , vREVIEW.hasReview, 1);
}
- private SingleDocumentExtraction getInstance(String file) throws FileNotFoundException, IOException {
+ private SingleDocumentExtraction getInstance(String file, Boolean optimizeMimeTypeAndExtractorSelection) throws FileNotFoundException, IOException {
baos = new ByteArrayOutputStream();
rdfxmlWriter = new RDFXMLWriter(baos);
repositoryWriter = new RepositoryWriter(conn);
@@ -238,18 +255,18 @@ private SingleDocumentExtraction getInstance(String file) throws FileNotFoundExc
cth.addChild(rdfxmlWriter);
cth.addChild(repositoryWriter);
- final ModifiableConfiguration configuration = DefaultConfiguration.copy();
- configuration.setProperty("any23.extraction.metadata.domain.per.entity", "on");
- SingleDocumentExtraction instance = new SingleDocumentExtraction(
- configuration,
- new HTMLFixture(copyResourceToTempFile(file)).getOpener("http://nested.test.com"),
- extractorGroup,
- cth
- );
- instance.setMIMETypeDetector( new TikaMIMETypeDetector(new WhiteSpacesPurifier()) );
- return instance;
- }
-
+ final ModifiableConfiguration configuration = DefaultConfiguration.copy();
+ configuration.setProperty("any23.extraction.metadata.domain.per.entity", "on");
+ SingleDocumentExtraction instance = new SingleDocumentExtraction(
+ configuration,
+ new HTMLFixture(copyResourceToTempFile(file))
+ .getOpener("http://nested.test.com", optimizeMimeTypeAndExtractorSelection),
+ extractorGroup,
+ cth
+ );
+ instance.setMIMETypeDetector(new TikaMIMETypeDetector(new WhiteSpacesPurifier()) );
+ return instance;
+ }
/**
* Logs the storage content.
* @throws RepositoryException if an error is encountered whilst loading content from a storage connection
diff --git a/core/src/test/java/org/apache/any23/extractor/html/AbstractExtractorTestCase.java b/core/src/test/java/org/apache/any23/extractor/html/AbstractExtractorTestCase.java
index 9241041d5..67e2b57e9 100644
--- a/core/src/test/java/org/apache/any23/extractor/html/AbstractExtractorTestCase.java
+++ b/core/src/test/java/org/apache/any23/extractor/html/AbstractExtractorTestCase.java
@@ -189,7 +189,7 @@ protected void extract(String resource) throws ExtractionException,
IOException {
SingleDocumentExtraction ex = new SingleDocumentExtraction(
new HTMLFixture(copyResourceToTempFile(resource)).getOpener(baseIRI
- .toString()), getExtractorFactory(),
+ .toString(), false), getExtractorFactory(),
new RepositoryWriter(conn));
ex.setMIMETypeDetector(null);
report = ex.run();
diff --git a/core/src/test/java/org/apache/any23/extractor/html/HTMLFixture.java b/core/src/test/java/org/apache/any23/extractor/html/HTMLFixture.java
index 69ef6c78e..533a0710f 100644
--- a/core/src/test/java/org/apache/any23/extractor/html/HTMLFixture.java
+++ b/core/src/test/java/org/apache/any23/extractor/html/HTMLFixture.java
@@ -27,7 +27,7 @@
import java.io.IOException;
/**
- * This class is a wrapper around an HTML document providing a simply facade.
+ * This class is a wrapper around an HTML document providing a simple facade.
*/
public class HTMLFixture {
@@ -43,8 +43,21 @@ private File getFile() {
return file;
}
- public DocumentSource getOpener(String baseIRI) {
- return new FileDocumentSource(getFile(), baseIRI);
+ /**
+ * Creates the {@link DocumentSource} used to read the wrapped HTML file.
+ *
+ * @param baseIRI the base IRI to use for the DocumentSource
+ * @param optimizeMimeTypeAndExtractorSelection if you wish to optimize
+ * SingleDocumentExtraction extractor matching and mimetype detection by
+ * trusting the input DocumentSource content type; when {@code true} the
+ * source declares the content type {@code text/html}, otherwise (including
+ * {@code null}) no content type is declared. See
+ * https://issues.apache.org/jira/projects/ANY23/issues/ANY23-43
+ * @return the document source which is actually a {@link FileDocumentSource}
+ */
+ public DocumentSource getOpener(String baseIRI, Boolean optimizeMimeTypeAndExtractorSelection) {
+ // Boolean.TRUE.equals(...) avoids the NullPointerException that
+ // auto-unboxing a null flag would raise.
+ if (Boolean.TRUE.equals(optimizeMimeTypeAndExtractorSelection)) {
+ return new FileDocumentSource(getFile(), baseIRI, "text/html");
+ }
+ return new FileDocumentSource(getFile(), baseIRI);
}
/**