Configure Tika correctly.

With the new configuration, Tika can now extract text from PDFs and
XML documents.

Also configures logging for the application.

Change-Id: I7a89c2b232ed4e220665dd335a5f5a0cc3ef2994
diff --git a/pom.xml b/pom.xml
index 2bc4582..3546067 100644
--- a/pom.xml
+++ b/pom.xml
@@ -1,6 +1,8 @@
 <?xml version="1.0"?>
-<project xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd" xmlns="http://maven.apache.org/POM/4.0.0"
-    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+<project
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"
+  xmlns="http://maven.apache.org/POM/4.0.0"
+  xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
 
   <modelVersion>4.0.0</modelVersion>
 
@@ -30,13 +32,19 @@
     <basic-annotations.version>0.2.0</basic-annotations.version>
     <findbugs-jsr305.version>3.0.2</findbugs-jsr305.version>
     <google.java.format.version>1.8</google.java.format.version>
-    <itext7.version>7.1.13</itext7.version>
     <kotlin-annotations.version>1.4.10</kotlin-annotations.version>
     <lanterna.version>3.0.4</lanterna.version>
     <picocli.version>4.3.2</picocli.version>
     <pdfocr.version>1.0.2</pdfocr.version>
     <term4j.version>0.4.0</term4j.version>
-    <tika.version>1.24.1</tika.version>
+    <tess4j.version>4.5.1</tess4j.version>
+    <tika.version>1.25</tika.version>
+    <jbig2-imageio.version>3.0.3</jbig2-imageio.version>
+    <jai-imageio-core.version>1.4.0</jai-imageio-core.version>
+    <jai-imageio-jpeg2000.version>1.3.0</jai-imageio-jpeg2000.version>
+    <sqlite-jdbc.version>3.30.1</sqlite-jdbc.version>
+    <jboss-logging.version>3.4.1.Final</jboss-logging.version>
+    <slf4j.version>1.7.30</slf4j.version>
   </properties>
 
   <dependencies>
@@ -68,6 +76,13 @@
       <version>${picocli.version}</version>
     </dependency>
 
+    <!-- Logging -->
+    <dependency>
+      <groupId>org.jboss.logging</groupId>
+      <artifactId>jboss-logging</artifactId>
+      <version>${jboss-logging.version}</version>
+    </dependency>
+
     <!-- Document reading -->
     <dependency>
       <groupId>org.apache.tika</groupId>
@@ -80,9 +95,54 @@
       <version>${tika.version}</version>
     </dependency>
     <dependency>
+      <groupId>org.apache.tika</groupId>
+      <artifactId>tika-parsers</artifactId>
+      <version>${tika.version}</version>
+      <exclusions>
+        <exclusion>
+          <groupId>org.slf4j</groupId>
+          <artifactId>jcl-over-slf4j</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.slf4j</groupId>
+          <artifactId>jul-to-slf4j</artifactId>
+        </exclusion>
+      </exclusions>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.pdfbox</groupId>
+      <artifactId>jbig2-imageio</artifactId>
+      <version>${jbig2-imageio.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>com.github.jai-imageio</groupId>
+      <artifactId>jai-imageio-core</artifactId>
+      <version>${jai-imageio-core.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>com.github.jai-imageio</groupId>
+      <artifactId>jai-imageio-jpeg2000</artifactId>
+      <version>${jai-imageio-jpeg2000.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.xerial</groupId>
+      <artifactId>sqlite-jdbc</artifactId>
+      <version>${sqlite-jdbc.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.slf4j</groupId>
+      <artifactId>slf4j-jdk14</artifactId>
+      <version>${slf4j.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.slf4j</groupId>
+      <artifactId>log4j-over-slf4j</artifactId>
+      <version>${slf4j.version}</version>
+    </dependency>
+    <dependency>
       <groupId>net.sourceforge.tess4j</groupId>
       <artifactId>tess4j</artifactId>
-      <version>4.5.1</version>
+      <version>${tess4j.version}</version>
       <exclusions>
         <exclusion>
           <groupId>org.slf4j</groupId>
@@ -92,6 +152,14 @@
           <groupId>org.slf4j</groupId>
           <artifactId>log4j-over-slf4j</artifactId>
         </exclusion>
+        <exclusion>
+          <groupId>org.slf4j</groupId>
+          <artifactId>jul-to-slf4j</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>ch.qos.logback</groupId>
+          <artifactId>logback-classic</artifactId>
+        </exclusion>
       </exclusions>
     </dependency>
 
@@ -111,6 +179,12 @@
 
   <build>
 
+    <resources>
+      <resource>
+        <directory>src/main/resources</directory>
+      </resource>
+    </resources>
+
     <pluginManagement>
       <plugins>
 
diff --git a/src/main/java/eu/mulk/aendggner/AendGgner.java b/src/main/java/eu/mulk/aendggner/AendGgner.java
index e510efb..c4efd45 100644
--- a/src/main/java/eu/mulk/aendggner/AendGgner.java
+++ b/src/main/java/eu/mulk/aendggner/AendGgner.java
@@ -1,13 +1,22 @@
 package eu.mulk.aendggner;
 
+import java.io.BufferedReader;
 import java.io.IOException;
+import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.List;
 import java.util.concurrent.Callable;
+import java.util.logging.LogManager;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.ParsingReader;
+import org.jboss.logging.Logger;
+import org.xml.sax.SAXException;
 import picocli.CommandLine;
 import picocli.CommandLine.Command;
 import picocli.CommandLine.Parameters;
@@ -19,6 +28,8 @@
     description = "Displays German amendment acts in a user-friendly, consolidated way.")
 public class AendGgner implements Callable<Integer> {
 
+  private static final Logger log = Logger.getLogger(AendGgner.class);
+
   @Parameters(index = "0", description = "The base text to modify.")
   private Path baseFile;
 
@@ -31,19 +42,59 @@
   }
 
+  /**
+   * Detects the media type of every patch file and logs how many lines of text Tika extracts
+   * from it.
+   *
+   * @return the process exit code, always 0
+   */
   @Override
-  public final Integer call() throws TikaException, IOException {
-    var tika = new TikaConfig();
+  public final Integer call() throws TikaException, IOException, SAXException {
+    setupLogging();
+
+    log.debugf("Logging configured.");
+
+    TikaConfig tika;
+    try (var configResource =
+        this.getClass().getResourceAsStream("/eu/mulk/aendggner/tika-config.xml")) {
+      tika = new TikaConfig(configResource);
+    }
 
     for (var file : patches) {
       var metadata = new Metadata();
-      metadata.set(Metadata.RESOURCE_NAME_KEY, file.toString());
+      metadata.set(Metadata.RESOURCE_NAME_KEY, file.getFileName().toString());
+
       try (var is = TikaInputStream.get(file)) {
-        var mimetype = tika.getDetector().detect(
-            TikaInputStream.get(file), metadata);
-        System.out.printf("File %s is %s.\n", file, mimetype);
+        // Detect on the stream owned by this try block; opening a second
+        // TikaInputStream here would leave it unclosed.
+        var mimetype = tika.getDetector().detect(is, metadata);
+        log.infof("File %s is %s.", file, mimetype);
+      }
+
+      var parser = new AutoDetectParser(tika);
+      try (var in = Files.newInputStream(file);
+          var reader =
+              new BufferedReader(
+                  new ParsingReader(parser, in, metadata, makeParseContext(parser)))) {
+        log.infof("%s: %d lines of text.", file, reader.lines().count());
+        // reader.lines().forEachOrdered(x -> log.infof("%s: %s", file, x));
       }
     }
 
     return 0;
   }
+
+  /** Creates a parse context with {@code parser} registered under {@code Parser.class}. */
+  private static ParseContext makeParseContext(Parser parser) {
+    var parseContext = new ParseContext();
+    parseContext.set(Parser.class, parser);
+    return parseContext;
+  }
+
+  /** Loads the java.util.logging configuration from the bundled logging.properties resource. */
+  private static void setupLogging() throws IOException {
+    try (var loggingProperties =
+        AendGgner.class.getResourceAsStream("/eu/mulk/aendggner/logging.properties")) {
+      LogManager.getLogManager().readConfiguration(loggingProperties);
+    }
+  }
 }
diff --git a/src/main/java/module-info.java b/src/main/java/module-info.java
deleted file mode 100644
index dbe7d62..0000000
--- a/src/main/java/module-info.java
+++ /dev/null
@@ -1,6 +0,0 @@
-module aendggner {
-  requires info.picocli;
-  requires org.apache.tika.core;
-  requires java.sql;
-  opens eu.mulk.aendggner;
-}
diff --git a/src/main/resources/eu/mulk/aendggner/logging.properties b/src/main/resources/eu/mulk/aendggner/logging.properties
new file mode 100644
index 0000000..cdebd0e
--- /dev/null
+++ b/src/main/resources/eu/mulk/aendggner/logging.properties
@@ -0,0 +1,17 @@
+#
+# General settings
+#
+handlers=java.util.logging.ConsoleHandler
+java.util.logging.SimpleFormatter.format=%1$tY-%1$tm-%1$td %1$tH:%1$tM:%1$tS  %4$-11s  %2$-40s  %5$s %6$s%n
+java.util.logging.ConsoleHandler.level=ALL
+.level=INFO
+#
+# Application
+#
+eu.mulk.level=ALL
+#
+# 3rd-party libraries
+#
+org.apache.level=WARNING
+org.apache.tika.config.level=SEVERE
+org.apache.pdfbox.pdmodel.graphics.color.PDICCBased.level=SEVERE
diff --git a/src/main/resources/eu/mulk/aendggner/tika-config.xml b/src/main/resources/eu/mulk/aendggner/tika-config.xml
new file mode 100644
index 0000000..7c7983c
--- /dev/null
+++ b/src/main/resources/eu/mulk/aendggner/tika-config.xml
@@ -0,0 +1,32 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<properties>
+  <service-loader dynamic="true" loadErrorHandler="IGNORE"/>
+
+  <encodingDetectors>
+    <encodingDetector class="org.apache.tika.detect.DefaultEncodingDetector"/>
+  </encodingDetectors>
+
+  <translator class="org.apache.tika.language.translate.DefaultTranslator"/>
+
+  <detectors>
+    <detector class="org.apache.tika.detect.DefaultDetector"/>
+  </detectors>
+
+  <parsers>
+
+    <parser class="org.apache.tika.parser.DefaultParser">
+      <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/>
+      <parser-exclude class="org.apache.tika.parser.ocr.TesseractOCRParser"/>
+    </parser>
+
+    <parser class="org.apache.tika.parser.pdf.PDFParser">
+      <params>
+        <param name="extractInlineImages" type="bool">true</param>
+        <param name="ocrStrategy" type="string">auto</param>
+      </params>
+    </parser>
+
+    <parser class="org.apache.tika.parser.ocr.TesseractOCRParser"/>
+
+  </parsers>
+</properties>