Configure Tika correctly.
With the new configuration, Tika can now extract text from PDFs and
XML documents.
Also configure logging for the application (java.util.logging, read from a bundled logging.properties).
Change-Id: I7a89c2b232ed4e220665dd335a5f5a0cc3ef2994
diff --git a/pom.xml b/pom.xml
index 2bc4582..3546067 100644
--- a/pom.xml
+++ b/pom.xml
@@ -1,6 +1,8 @@
<?xml version="1.0"?>
-<project xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd" xmlns="http://maven.apache.org/POM/4.0.0"
- xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+<project
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"
+ xmlns="http://maven.apache.org/POM/4.0.0"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<modelVersion>4.0.0</modelVersion>
@@ -30,13 +32,19 @@
<basic-annotations.version>0.2.0</basic-annotations.version>
<findbugs-jsr305.version>3.0.2</findbugs-jsr305.version>
<google.java.format.version>1.8</google.java.format.version>
- <itext7.version>7.1.13</itext7.version>
<kotlin-annotations.version>1.4.10</kotlin-annotations.version>
<lanterna.version>3.0.4</lanterna.version>
<picocli.version>4.3.2</picocli.version>
<pdfocr.version>1.0.2</pdfocr.version>
<term4j.version>0.4.0</term4j.version>
- <tika.version>1.24.1</tika.version>
+ <tess4j.version>4.5.1</tess4j.version>
+ <tika.version>1.25</tika.version>
+ <jbig2-imageio.version>3.0.3</jbig2-imageio.version>
+ <jai-imageio-core.version>1.4.0</jai-imageio-core.version>
+ <jai-imageio-jpeg2000.version>1.3.0</jai-imageio-jpeg2000.version>
+ <sqlite-jdbc.version>3.30.1</sqlite-jdbc.version>
+ <jboss-logging.version>3.4.1.Final</jboss-logging.version>
+ <slf4j.version>1.7.30</slf4j.version>
</properties>
<dependencies>
@@ -68,6 +76,13 @@
<version>${picocli.version}</version>
</dependency>
+ <!-- Logging -->
+ <dependency>
+ <groupId>org.jboss.logging</groupId>
+ <artifactId>jboss-logging</artifactId>
+ <version>${jboss-logging.version}</version>
+ </dependency>
+
<!-- Document reading -->
<dependency>
<groupId>org.apache.tika</groupId>
@@ -80,9 +95,54 @@
<version>${tika.version}</version>
</dependency>
<dependency>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parsers</artifactId>
+ <version>${tika.version}</version>
+ <exclusions>
+ <exclusion>
+ <groupId>org.slf4j</groupId>
+ <artifactId>jcl-over-slf4j</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>org.slf4j</groupId>
+ <artifactId>jul-to-slf4j</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.pdfbox</groupId>
+ <artifactId>jbig2-imageio</artifactId>
+ <version>${jbig2-imageio.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>com.github.jai-imageio</groupId>
+ <artifactId>jai-imageio-core</artifactId>
+ <version>${jai-imageio-core.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>com.github.jai-imageio</groupId>
+ <artifactId>jai-imageio-jpeg2000</artifactId>
+ <version>${jai-imageio-jpeg2000.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.xerial</groupId>
+ <artifactId>sqlite-jdbc</artifactId>
+ <version>${sqlite-jdbc.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-jdk14</artifactId>
+ <version>${slf4j.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>log4j-over-slf4j</artifactId>
+ <version>${slf4j.version}</version>
+ </dependency>
+ <dependency>
<groupId>net.sourceforge.tess4j</groupId>
<artifactId>tess4j</artifactId>
- <version>4.5.1</version>
+ <version>${tess4j.version}</version>
<exclusions>
<exclusion>
<groupId>org.slf4j</groupId>
@@ -92,6 +152,14 @@
<groupId>org.slf4j</groupId>
<artifactId>log4j-over-slf4j</artifactId>
</exclusion>
+ <exclusion>
+ <groupId>org.slf4j</groupId>
+ <artifactId>jul-to-slf4j</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>ch.qos.logback</groupId>
+ <artifactId>logback-classic</artifactId>
+ </exclusion>
</exclusions>
</dependency>
@@ -111,6 +179,12 @@
<build>
+ <resources>
+ <resource>
+ <directory>src/main/resources</directory>
+ </resource>
+ </resources>
+
<pluginManagement>
<plugins>
diff --git a/src/main/java/eu/mulk/aendggner/AendGgner.java b/src/main/java/eu/mulk/aendggner/AendGgner.java
index e510efb..c4efd45 100644
--- a/src/main/java/eu/mulk/aendggner/AendGgner.java
+++ b/src/main/java/eu/mulk/aendggner/AendGgner.java
@@ -1,13 +1,22 @@
package eu.mulk.aendggner;
+import java.io.BufferedReader;
import java.io.IOException;
+import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import java.util.concurrent.Callable;
+import java.util.logging.LogManager;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.ParsingReader;
+import org.jboss.logging.Logger;
+import org.xml.sax.SAXException;
import picocli.CommandLine;
import picocli.CommandLine.Command;
import picocli.CommandLine.Parameters;
@@ -19,6 +28,8 @@
description = "Displays German amendment acts in a user-friendly, consolidated way.")
public class AendGgner implements Callable<Integer> {
+ private static final Logger log = Logger.getLogger(AendGgner.class);
+
@Parameters(index = "0", description = "The base text to modify.")
private Path baseFile;
@@ -31,19 +42,49 @@
}
@Override
- public final Integer call() throws TikaException, IOException {
- var tika = new TikaConfig();
+ public final Integer call() throws TikaException, IOException, SAXException {
+ setupLogging();
+
+ log.debugf("Logging configured.");
+
+ TikaConfig tika;
+ try (var configResource =
+ this.getClass().getResourceAsStream("/eu/mulk/aendggner/tika-config.xml")) {
+ tika = new TikaConfig(configResource);
+ }
for (var file : patches) {
var metadata = new Metadata();
- metadata.set(Metadata.RESOURCE_NAME_KEY, file.toString());
+ metadata.set(Metadata.RESOURCE_NAME_KEY, file.getFileName().toString());
+ // Detect on the stream owned by try-with-resources; a second stream would never be closed.
try (var is = TikaInputStream.get(file)) {
- var mimetype = tika.getDetector().detect(
- TikaInputStream.get(file), metadata);
- System.out.printf("File %s is %s.\n", file, mimetype);
+ var mimetype = tika.getDetector().detect(is, metadata);
+ log.infof("File %s is %s.", file, mimetype);
+ }
+
+ var parser = new AutoDetectParser(tika);
+ try (var in = Files.newInputStream(file);
+ var reader =
+ new BufferedReader(
+ new ParsingReader(parser, in, metadata, makeParseContext(parser)))) {
+ log.infof("%s: %d lines of text.", file, reader.lines().count());
+ // reader.lines().forEachOrdered(x -> log.infof("%s: %s", file, x));
}
}
return 0;
}
+
+ private static ParseContext makeParseContext(Parser parser) {
+ var parseContext = new ParseContext();
+ parseContext.set(Parser.class, parser);
+ return parseContext;
+ }
+
+ private static void setupLogging() throws IOException {
+ try (var loggingProperties =
+ AendGgner.class.getResourceAsStream("/eu/mulk/aendggner/logging.properties")) {
+ LogManager.getLogManager().readConfiguration(loggingProperties);
+ }
+ }
}
diff --git a/src/main/java/module-info.java b/src/main/java/module-info.java
deleted file mode 100644
index dbe7d62..0000000
--- a/src/main/java/module-info.java
+++ /dev/null
@@ -1,6 +0,0 @@
-module aendggner {
- requires info.picocli;
- requires org.apache.tika.core;
- requires java.sql;
- opens eu.mulk.aendggner;
-}
diff --git a/src/main/resources/eu/mulk/aendggner/logging.properties b/src/main/resources/eu/mulk/aendggner/logging.properties
new file mode 100644
index 0000000..cdebd0e
--- /dev/null
+++ b/src/main/resources/eu/mulk/aendggner/logging.properties
@@ -0,0 +1,17 @@
+#
+# General settings
+#
+handlers=java.util.logging.ConsoleHandler
+java.util.logging.SimpleFormatter.format=%1$tY-%1$tm-%1$td %1$tH:%1$tM:%1$tS %4$-11s %2$-40s %5$s %6$s%n
+java.util.logging.ConsoleHandler.level=ALL
+.level=INFO
+#
+# Application
+#
+eu.mulk.level=ALL
+#
+# 3rd-party libraries
+#
+org.apache.level=WARNING
+org.apache.tika.config.level=SEVERE
+org.apache.pdfbox.pdmodel.graphics.color.PDICCBased.level=SEVERE
diff --git a/src/main/resources/eu/mulk/aendggner/tika-config.xml b/src/main/resources/eu/mulk/aendggner/tika-config.xml
new file mode 100644
index 0000000..7c7983c
--- /dev/null
+++ b/src/main/resources/eu/mulk/aendggner/tika-config.xml
@@ -0,0 +1,32 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<properties>
+ <service-loader dynamic="true" loadErrorHandler="IGNORE"/>
+
+ <encodingDetectors>
+ <encodingDetector class="org.apache.tika.detect.DefaultEncodingDetector"/>
+ </encodingDetectors>
+
+ <translator class="org.apache.tika.language.translate.DefaultTranslator"/>
+
+ <detectors>
+ <detector class="org.apache.tika.detect.DefaultDetector"/>
+ </detectors>
+
+ <parsers>
+
+ <parser class="org.apache.tika.parser.DefaultParser">
+ <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/>
+ <parser-exclude class="org.apache.tika.parser.ocr.TesseractOCRParser"/>
+ </parser>
+
+ <parser class="org.apache.tika.parser.pdf.PDFParser">
+ <params>
+ <param name="extractInlineImages" type="bool">true</param>
+ <param name="ocrStrategy" type="string">auto</param>
+ </params>
+ </parser>
+
+ <parser class="org.apache.tika.parser.ocr.TesseractOCRParser"/>
+
+ </parsers>
+</properties>