From 40888d17003acfbd94cc429cc1c1f63780b9665e Mon Sep 17 00:00:00 2001 From: Mario Molina Date: Wed, 20 May 2020 08:37:46 -0500 Subject: [PATCH 01/19] Prepare for next version --- docker-compose.yml | 2 +- pom.xml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index e763372..dd07fd5 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -45,7 +45,7 @@ services: SCHEMA_REGISTRY_KAFKASTORE_CONNECTION_URL: 'zookeeper:2181' connect-fs: - image: mmolimar/kafka-connect-fs:1.0.0 + image: mmolimar/kafka-connect-fs:1.1.0-SNAPSHOT container_name: connect depends_on: - cp-kafka diff --git a/pom.xml b/pom.xml index bd22791..bfe7448 100644 --- a/pom.xml +++ b/pom.xml @@ -4,7 +4,7 @@ com.github.mmolimar.kafka.connect kafka-connect-fs - 1.0.0 + 1.1.0-SNAPSHOT jar kafka-connect-fs From 559ec4b304ac893683a1e1bfc8caee8b5ffcde09 Mon Sep 17 00:00:00 2001 From: Scotty Pate Date: Fri, 5 Jun 2020 07:38:09 -0500 Subject: [PATCH 02/19] documentation updates (#64) Fix the text file reader section. --- docs/source/config_options.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/config_options.rst b/docs/source/config_options.rst index 0a69105..f9cfab4 100644 --- a/docs/source/config_options.rst +++ b/docs/source/config_options.rst @@ -817,7 +817,7 @@ Text To configure custom properties for this reader, the name you must use is ``text``. -``file_reader.json.record_per_line`` +``file_reader.text.record_per_line`` If enabled, the reader will read each line as a record. Otherwise, the reader will read the full content of the file as a record. @@ -839,14 +839,14 @@ To configure custom properties for this reader, the name you must use is ``text` * Default: based on the locale and charset of the underlying operating system. * Importance: medium -``file_reader.json.compression.type`` +``file_reader.text.compression.type`` Compression type to use when reading a file. * Type: enum (available values ``bzip2``, ``gzip`` and ``none``) * Default: ``none`` * Importance: medium -``file_reader.json.compression.concatenated`` +``file_reader.text.compression.concatenated`` Flag to specify if the decompression of the reader will finish at the end of the file or after the first compressed stream. From 7a35c9be3f58ea71dbe307e87e420f632a8fcc46 Mon Sep 17 00:00:00 2001 From: Miguel Alexandre Date: Thu, 11 Jun 2020 18:09:57 +0200 Subject: [PATCH 03/19] Add files batching capabilities to the Policies (#59) * Add a simple batch policy for file systems with a lot of files * Improve code test coverage * Setup batching in the AbstractPolicy instead of a seperate Policy. 
* Fix unit tests * Fix unit tests by isolating newly created policies * Rename iterator * Use com.google.common.collect.Iterators.partition method instead * Minor changes Co-authored-by: Mario Molina --- docs/source/config_options.rst | 7 ++ .../kafka/connect/fs/FsSourceTaskConfig.java | 14 +++ .../connect/fs/policy/AbstractPolicy.java | 35 +++++-- .../connect/fs/policy/CronPolicyTest.java | 15 +-- .../fs/policy/HdfsFileWatcherPolicyTest.java | 19 ++-- .../connect/fs/policy/PolicyTestBase.java | 84 +++++++++++++---- .../connect/fs/policy/SimplePolicyTest.java | 49 ++++++++++ .../connect/fs/policy/SleepyPolicyTest.java | 33 +++---- .../connect/fs/task/FsSourceTaskTest.java | 94 +++++++++++++++++++ 9 files changed, 293 insertions(+), 57 deletions(-) diff --git a/docs/source/config_options.rst b/docs/source/config_options.rst index f9cfab4..47995e6 100644 --- a/docs/source/config_options.rst +++ b/docs/source/config_options.rst @@ -91,6 +91,13 @@ General config properties for this connector. * Default: ``false`` * Importance: medium +``policy.batch_size`` + Number of files that should be handled at a time. Non-positive values disable batching. + + * Type: int + * Default: ``0`` + * Importance: medium + ``policy..`` This represents custom properties you can include based on the policy class specified. diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTaskConfig.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTaskConfig.java index 58231fd..39278c3 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTaskConfig.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTaskConfig.java @@ -21,6 +21,10 @@ public class FsSourceTaskConfig extends FsSourceConnectorConfig { private static final String POLICY_REGEXP_DOC = "Regular expression to filter files from the FS."; private static final String POLICY_REGEXP_DISPLAY = "File filter regex"; + public static final String POLICY_BATCH_SIZE = POLICY_PREFIX + "batch_size"; + private static final String POLICY_BATCH_SIZE_DOC = "Number of files to process at a time. 
Non-positive values disable batching."; + private static final String POLICY_BATCH_SIZE_DISPLAY = "Files per batch"; + public static final String POLICY_PREFIX_FS = POLICY_PREFIX + "fs."; public static final String FILE_READER_CLASS = FILE_READER_PREFIX + "class"; @@ -76,6 +80,16 @@ public static ConfigDef conf() { ++order, ConfigDef.Width.MEDIUM, POLICY_REGEXP_DISPLAY + ).define( + POLICY_BATCH_SIZE, + ConfigDef.Type.INT, + 0, + ConfigDef.Importance.MEDIUM, + POLICY_BATCH_SIZE_DOC, + POLICY_GROUP, + ++order, + ConfigDef.Width.MEDIUM, + POLICY_BATCH_SIZE_DISPLAY ).define( FILE_READER_CLASS, ConfigDef.Type.CLASS, diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/AbstractPolicy.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/AbstractPolicy.java index d250e76..a2da257 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/AbstractPolicy.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/AbstractPolicy.java @@ -5,6 +5,8 @@ import com.github.mmolimar.kafka.connect.fs.file.reader.FileReader; import com.github.mmolimar.kafka.connect.fs.util.ReflectionUtils; import com.github.mmolimar.kafka.connect.fs.util.TailCall; +import com.google.common.collect.Iterators; + import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.LocatedFileStatus; @@ -36,6 +38,8 @@ abstract class AbstractPolicy implements Policy { private final FsSourceTaskConfig conf; private final AtomicLong executions; private final boolean recursive; + private final int batchSize; + private Iterator> previous; private boolean interrupted; public AbstractPolicy(FsSourceTaskConfig conf) throws IOException { @@ -44,7 +48,9 @@ public AbstractPolicy(FsSourceTaskConfig conf) throws IOException { this.executions = new AtomicLong(0); this.recursive = conf.getBoolean(FsSourceTaskConfig.POLICY_RECURSIVE); this.fileRegexp = Pattern.compile(conf.getString(FsSourceTaskConfig.POLICY_REGEXP)); + this.batchSize = conf.getInt(FsSourceTaskConfig.POLICY_BATCH_SIZE); this.interrupted = false; + this.previous = Collections.emptyIterator(); Map customConfigs = customConfigs(); logAll(customConfigs); @@ -105,6 +111,11 @@ public final Iterator execute() throws IOException { if (hasEnded()) { throw new IllegalWorkerStateException("Policy has ended. 
Cannot be retried."); } + + if (batchSize > 0 && previous.hasNext()) { + return previous.next().iterator(); + } + preCheck(); executions.incrementAndGet(); @@ -112,9 +123,15 @@ public final Iterator execute() throws IOException { for (FileSystem fs : fileSystems) { files = concat(files, listFiles(fs)); } - postCheck(); + if (batchSize > 0) { + previous = Iterators.partition(files, batchSize); + if (!previous.hasNext()) + return Collections.emptyIterator(); + return previous.next().iterator(); + } + return files; } @@ -143,8 +160,7 @@ private TailCall hasNextRec() { current = it.next(); return this::hasNextRec; } - if (current.isFile() & - fileRegexp.matcher(current.getPath().getName()).find()) { + if (current.isFile() && fileRegexp.matcher(current.getPath().getName()).find()) { return TailCall.done(true); } current = null; @@ -173,7 +189,11 @@ public FileMetadata next() { @Override public final boolean hasEnded() { - return interrupted || isPolicyCompleted(); + if (interrupted) { + return true; + } + + return !previous.hasNext() && isPolicyCompleted(); } protected abstract boolean isPolicyCompleted(); @@ -200,7 +220,9 @@ public FileReader offer(FileMetadata metadata, OffsetStorageReader offsetStorage try { FileReader reader = ReflectionUtils.makeReader( (Class) conf.getClass(FsSourceTaskConfig.FILE_READER_CLASS), - current, new Path(metadata.getPath()), conf.originals()); + current, + new Path(metadata.getPath()), conf.originals() + ); Map partition = Collections.singletonMap("path", metadata.getPath()); Map offset = offsetStorageReader.offset(partition); if (offset != null && offset.get("offset") != null) { @@ -213,8 +235,7 @@ public FileReader offer(FileMetadata metadata, OffsetStorageReader offsetStorage } } - private Iterator concat(final Iterator it1, - final Iterator it2) { + private Iterator concat(final Iterator it1, final Iterator it2) { return new Iterator() { @Override diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/CronPolicyTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/CronPolicyTest.java index 72bac98..ac5e264 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/CronPolicyTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/CronPolicyTest.java @@ -89,15 +89,16 @@ public void invalidEndDate(PolicyFsTestConfig fsConfig) { @ParameterizedTest @MethodSource("fileSystemConfigProvider") public void canBeInterrupted(PolicyFsTestConfig fsConfig) throws IOException { - Policy policy = ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() + try (Policy policy = ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() .getClass(FsSourceTaskConfig.POLICY_CLASS), - fsConfig.getSourceTaskConfig()); + fsConfig.getSourceTaskConfig())) { - for (int i = 0; i < 5; i++) { - assertFalse(policy.hasEnded()); - policy.execute(); + for (int i = 0; i < 5; i++) { + assertFalse(policy.hasEnded()); + policy.execute(); + } + policy.interrupt(); + assertTrue(policy.hasEnded()); } - policy.interrupt(); - assertTrue(policy.hasEnded()); } } diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/HdfsFileWatcherPolicyTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/HdfsFileWatcherPolicyTest.java index a29ae5d..3cd9c75 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/HdfsFileWatcherPolicyTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/HdfsFileWatcherPolicyTest.java @@ -81,21 +81,22 @@ public void 
execPolicyAlreadyEnded(PolicyFsTestConfig fsConfig) throws IOExcepti @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void notReachableFileSystem(PolicyFsTestConfig fsConfig) throws InterruptedException { + public void notReachableFileSystem(PolicyFsTestConfig fsConfig) throws InterruptedException, IOException { Map originals = fsConfig.getSourceTaskConfig().originalsStrings(); originals.put(FsSourceTaskConfig.FS_URIS, "hdfs://localhost:65432/data"); originals.put(HdfsFileWatcherPolicy.HDFS_FILE_WATCHER_POLICY_POLL_MS, "0"); originals.put(HdfsFileWatcherPolicy.HDFS_FILE_WATCHER_POLICY_RETRY_MS, "0"); FsSourceTaskConfig cfg = new FsSourceTaskConfig(originals); - Policy policy = ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() - .getClass(FsSourceTaskConfig.POLICY_CLASS), cfg); - int count = 0; - while (!policy.hasEnded() && count < 10) { - Thread.sleep(500); - count++; + try(Policy policy = ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() + .getClass(FsSourceTaskConfig.POLICY_CLASS), cfg)) { + int count = 0; + while (!policy.hasEnded() && count < 10) { + Thread.sleep(500); + count++; + } + assertTrue(count < 10); + assertTrue(policy.hasEnded()); } - assertTrue(count < 10); - assertTrue(policy.hasEnded()); } @ParameterizedTest diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/PolicyTestBase.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/PolicyTestBase.java index ba77775..35fb5e9 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/PolicyTestBase.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/PolicyTestBase.java @@ -180,24 +180,25 @@ public void dynamicURIs(PolicyFsTestConfig fsConfig) throws IOException { Map originals = fsConfig.getSourceTaskConfig().originalsStrings(); originals.put(FsSourceTaskConfig.FS_URIS, dynamic.toString()); FsSourceTaskConfig cfg = new FsSourceTaskConfig(originals); - Policy policy = ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() - .getClass(FsSourceTaskConfig.POLICY_CLASS), cfg); - fsConfig.setPolicy(policy); - assertEquals(1, fsConfig.getPolicy().getURIs().size()); - - LocalDateTime dateTime = LocalDateTime.now(); - DateTimeFormatter formatter = DateTimeFormatter.ofPattern("G"); - StringBuilder uri = new StringBuilder(dateTime.format(formatter)); - uri.append("/"); - formatter = DateTimeFormatter.ofPattern("yyyy"); - uri.append(dateTime.format(formatter)); - uri.append("/"); - formatter = DateTimeFormatter.ofPattern("MM"); - uri.append(dateTime.format(formatter)); - uri.append("/"); - formatter = DateTimeFormatter.ofPattern("W"); - uri.append(dateTime.format(formatter)); - assertTrue(fsConfig.getPolicy().getURIs().get(0).endsWith(uri.toString())); + try (Policy policy = ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() + .getClass(FsSourceTaskConfig.POLICY_CLASS), cfg)) { + + assertEquals(1, policy.getURIs().size()); + + LocalDateTime dateTime = LocalDateTime.now(); + DateTimeFormatter formatter = DateTimeFormatter.ofPattern("G"); + StringBuilder uri = new StringBuilder(dateTime.format(formatter)); + uri.append("/"); + formatter = DateTimeFormatter.ofPattern("yyyy"); + uri.append(dateTime.format(formatter)); + uri.append("/"); + formatter = DateTimeFormatter.ofPattern("MM"); + uri.append(dateTime.format(formatter)); + uri.append("/"); + formatter = DateTimeFormatter.ofPattern("W"); + uri.append(dateTime.format(formatter)); + assertTrue(policy.getURIs().get(0).endsWith(uri.toString())); + } } 
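For reference, a minimal standalone sketch (not part of the patch) of the Guava call that backs the batching added to AbstractPolicy above: Iterators.partition wraps the single file listing and hands back fixed-size batches, one per call to execute(). The class and file names below are made up for illustration.

import com.google.common.collect.Iterators;

import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

public class PartitionSketch {
    public static void main(String[] args) {
        Iterator<String> files = Arrays.asList("a.txt", "b.txt", "c.txt").iterator();
        // Equivalent to previous = Iterators.partition(files, batchSize) with batchSize = 2.
        Iterator<List<String>> batches = Iterators.partition(files, 2);
        while (batches.hasNext()) {
            // Each batch corresponds to what one execute() call hands back to the task.
            System.out.println(batches.next()); // prints [a.txt, b.txt], then [c.txt]
        }
    }
}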
@ParameterizedTest @@ -221,6 +222,53 @@ public void invalidDynamicURIs(PolicyFsTestConfig fsConfig) throws IOException { }); } + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void execPolicyBatchesFiles(PolicyFsTestConfig fsConfig) throws IOException, InterruptedException { + Map originals = fsConfig.getSourceTaskConfig().originalsStrings(); + originals.put(FsSourceTaskConfig.POLICY_BATCH_SIZE, "1"); + FsSourceTaskConfig sourceTaskConfig = new FsSourceTaskConfig(originals); + + try (Policy policy = ReflectionUtils.makePolicy( + (Class) fsConfig.getSourceTaskConfig().getClass(FsSourceTaskConfig.POLICY_CLASS), + sourceTaskConfig)) { + + FileSystem fs = fsConfig.getFs(); + for (Path dir : fsConfig.getDirectories()) { + fs.createNewFile(new Path(dir, System.nanoTime() + ".txt")); + //this file does not match the regexp + fs.createNewFile(new Path(dir, System.nanoTime() + ".invalid")); + + //we wait till FS has registered the files + Thread.sleep(3000); + } + + Iterator it = policy.execute(); + + // First batch of files (1 file) + assertTrue(it.hasNext()); + String firstPath = it.next().getPath(); + assertFalse(it.hasNext()); + + // Second batch of files (1 file) + it = policy.execute(); + assertTrue(it.hasNext()); + assertNotEquals(firstPath, it.next().getPath()); + assertFalse(it.hasNext()); + } + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void invalidBatchSize(PolicyFsTestConfig fsConfig) { + Map originals = fsConfig.getSourceTaskConfig().originalsStrings(); + originals.put(FsSourceTaskConfig.POLICY_BATCH_SIZE, "one"); + assertThrows(ConfigException.class, () -> { + new FsSourceTaskConfig(originals); + }); + + } + protected abstract FsSourceTaskConfig buildSourceTaskConfig(List directories); } diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/SimplePolicyTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/SimplePolicyTest.java index 279a775..42bfe7e 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/SimplePolicyTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/SimplePolicyTest.java @@ -1,10 +1,20 @@ package com.github.mmolimar.kafka.connect.fs.policy; import com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig; +import com.github.mmolimar.kafka.connect.fs.file.FileMetadata; import com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader; +import com.github.mmolimar.kafka.connect.fs.util.ReflectionUtils; + +import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.MethodSource; + +import static org.junit.jupiter.api.Assertions.*; +import java.io.IOException; import java.util.HashMap; +import java.util.Iterator; import java.util.List; import java.util.Map; @@ -26,4 +36,43 @@ protected FsSourceTaskConfig buildSourceTaskConfig(List directories) { return new FsSourceTaskConfig(cfg); } + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void execPolicyEndsAfterBatching(PolicyFsTestConfig fsConfig) throws IOException, InterruptedException { + Map originals = fsConfig.getSourceTaskConfig().originalsStrings(); + originals.put(FsSourceTaskConfig.POLICY_BATCH_SIZE, "1"); + FsSourceTaskConfig sourceTaskConfig = new FsSourceTaskConfig(originals); + + try (Policy policy = ReflectionUtils.makePolicy( + (Class) fsConfig.getSourceTaskConfig().getClass(FsSourceTaskConfig.POLICY_CLASS), + sourceTaskConfig)) { + + FileSystem fs = 
fsConfig.getFs(); + for (Path dir : fsConfig.getDirectories()) { + fs.createNewFile(new Path(dir, System.nanoTime() + ".txt")); + //this file does not match the regexp + fs.createNewFile(new Path(dir, System.nanoTime() + ".invalid")); + + //we wait till FS has registered the files + Thread.sleep(3000); + } + + Iterator it = policy.execute(); + + // First batch of files (1 file) + assertFalse(policy.hasEnded()); + assertTrue(it.hasNext()); + String firstPath = it.next().getPath(); + assertFalse(it.hasNext()); + assertFalse(policy.hasEnded()); + + // Second batch of files (1 file) + it = policy.execute(); + assertTrue(it.hasNext()); + assertNotEquals(firstPath, it.next().getPath()); + assertFalse(it.hasNext()); + assertTrue(policy.hasEnded()); + } + } + } diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/SleepyPolicyTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/SleepyPolicyTest.java index 65c41c7..09e592c 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/SleepyPolicyTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/SleepyPolicyTest.java @@ -101,13 +101,14 @@ public void sleepExecution(PolicyFsTestConfig fsConfig) throws IOException { tConfig.put(SleepyPolicy.SLEEPY_POLICY_MAX_EXECS, "2"); FsSourceTaskConfig sleepConfig = new FsSourceTaskConfig(tConfig); - Policy policy = ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() - .getClass(FsSourceTaskConfig.POLICY_CLASS), sleepConfig); - assertFalse(policy.hasEnded()); - policy.execute(); - assertFalse(policy.hasEnded()); - policy.execute(); - assertTrue(policy.hasEnded()); + try (Policy policy = ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() + .getClass(FsSourceTaskConfig.POLICY_CLASS), sleepConfig)) { + assertFalse(policy.hasEnded()); + policy.execute(); + assertFalse(policy.hasEnded()); + policy.execute(); + assertTrue(policy.hasEnded()); + } } @ParameterizedTest @@ -118,16 +119,16 @@ public void defaultExecutions(PolicyFsTestConfig fsConfig) throws IOException { tConfig.remove(SleepyPolicy.SLEEPY_POLICY_MAX_EXECS); FsSourceTaskConfig sleepConfig = new FsSourceTaskConfig(tConfig); - Policy policy = ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() - .getClass(FsSourceTaskConfig.POLICY_CLASS), sleepConfig); - - //it never ends - for (int i = 0; i < 100; i++) { - assertFalse(policy.hasEnded()); - policy.execute(); + try (Policy policy = ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() + .getClass(FsSourceTaskConfig.POLICY_CLASS), sleepConfig)) { + //it never ends + for (int i = 0; i < 100; i++) { + assertFalse(policy.hasEnded()); + policy.execute(); + } + policy.interrupt(); + assertTrue(policy.hasEnded()); } - policy.interrupt(); - assertTrue(policy.hasEnded()); } } diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/task/FsSourceTaskTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/task/FsSourceTaskTest.java index b4b5a4e..78194f2 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/task/FsSourceTaskTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/task/FsSourceTaskTest.java @@ -28,6 +28,7 @@ import java.io.FileWriter; import java.io.IOException; import java.io.PrintWriter; +import java.time.Duration; import java.util.*; import java.util.stream.IntStream; import java.util.stream.Stream; @@ -270,6 +271,99 @@ public void checkVersion(TaskFsTestConfig fsConfig) { assertFalse("unknown".equalsIgnoreCase(fsConfig.getTask().version())); } + + 
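As a usage sketch (not part of the patch), the new batching option plugs into a connector configuration alongside the existing properties documented in config_options.rst; the URI, topic and regexp below are placeholders:

name=fs-source-batched
connector.class=com.github.mmolimar.kafka.connect.fs.FsSourceConnector
fs.uris=file:///data/input
topic=my-topic
policy.class=com.github.mmolimar.kafka.connect.fs.policy.SimplePolicy
policy.regexp=.*\.txt$
policy.batch_size=5
file_reader.class=com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader

With policy.batch_size=5 the policy still lists every matching file, but returns them five at a time, and hasEnded() only reports true once the last batch has been served.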
@ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void pollNoDataWithBatch(TaskFsTestConfig fsConfig) { + Map props = new HashMap<>(fsConfig.getTaskConfig()); + props.put(FsSourceTaskConfig.POLICY_BATCH_SIZE, "1"); + fsConfig.getTask().start(props); + + assertEquals(0, fsConfig.getTask().poll().size()); + //policy has ended + assertNull(fsConfig.getTask().poll()); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void emptyFilesToProcessWithBatch(TaskFsTestConfig fsConfig) throws IOException { + for (Path dir : fsConfig.getDirectories()) { + fsConfig.getFs().createNewFile(new Path(dir, System.nanoTime() + ".txt")); + //this file does not match the regexp + fsConfig.getFs().createNewFile(new Path(dir, String.valueOf(System.nanoTime()))); + } + Map props = new HashMap<>(fsConfig.getTaskConfig()); + props.put(FsSourceTaskConfig.POLICY_BATCH_SIZE, "1"); + fsConfig.getTask().start(props); + + List records = new ArrayList<>(); + List fresh = fsConfig.getTask().poll(); + while (fresh != null) { + records.addAll(fresh); + fresh = fsConfig.getTask().poll(); + } + assertEquals(0, records.size()); + + //policy has ended + assertNull(fsConfig.getTask().poll()); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void oneFilePerFsWithBatch(TaskFsTestConfig fsConfig) throws IOException { + for (Path dir : fsConfig.getDirectories()) { + Path dataFile = new Path(dir, System.nanoTime() + ".txt"); + createDataFile(fsConfig.getFs(), dataFile); + //this file does not match the regexp + fsConfig.getFs().createNewFile(new Path(dir, String.valueOf(System.nanoTime()))); + } + + Map props = new HashMap<>(fsConfig.getTaskConfig()); + props.put(FsSourceTaskConfig.POLICY_BATCH_SIZE, "1"); + fsConfig.getTask().start(props); + + List records = new ArrayList<>(); + List fresh = fsConfig.getTask().poll(); + while (fresh != null) { + records.addAll(fresh); + fresh = fsConfig.getTask().poll(); + } + + assertEquals((NUM_RECORDS * fsConfig.getDirectories().size()) / 2, records.size()); + checkRecords(records); + //policy has ended + assertNull(fsConfig.getTask().poll()); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void shouldNotSleepBetweenBatches(TaskFsTestConfig fsConfig) throws IOException { + Map props = new HashMap<>(fsConfig.getTaskConfig()); + props.put(FsSourceTaskConfig.POLL_INTERVAL_MS, "10000"); + props.put(FsSourceTaskConfig.POLICY_BATCH_SIZE, "1"); + + for (Path dir : fsConfig.getDirectories()) { + Path dataFile = new Path(dir, System.nanoTime() + ".txt"); + createDataFile(fsConfig.getFs(), dataFile); + //this file does not match the regexp + fsConfig.getFs().createNewFile(new Path(dir, String.valueOf(System.nanoTime()))); + } + + fsConfig.getTask().start(props); + + List records = new ArrayList<>(); + assertTimeoutPreemptively(Duration.ofSeconds(2), () -> { + records.addAll(fsConfig.getTask().poll()); + records.addAll(fsConfig.getTask().poll()); + }); + + assertEquals((NUM_RECORDS * fsConfig.getDirectories().size()) / 2, records.size()); + checkRecords(records); + //policy has ended + assertNull(fsConfig.getTask().poll()); + } + + protected void checkRecords(List records) { records.forEach(record -> { assertEquals("topic_test", record.topic()); From e60cef3bec76ef1d8835dc7379ac5f2472827d86 Mon Sep 17 00:00:00 2001 From: Grant Nicholas <43971820+grantatspothero@users.noreply.github.com> Date: Sat, 20 Jun 2020 21:07:47 -0500 Subject: [PATCH 04/19] Feature/skip unchanged files (#62) * Skip processing 
unchanged files We store the fileSizeBytes as an offset in kafka connect and skip files whose file size has not changed * Add tests * Reduce log line length * Add EmptyFileReader * Update tests * Minor changes Co-authored-by: Mario Molina --- .../kafka/connect/fs/FsSourceTask.java | 5 +- .../fs/file/reader/EmptyFileReader.java | 47 +++++++++++++ .../connect/fs/policy/AbstractPolicy.java | 32 ++++++--- .../connect/fs/task/FsSourceTaskTest.java | 66 ++++++++++++++----- 4 files changed, 124 insertions(+), 26 deletions(-) create mode 100644 src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/EmptyFileReader.java diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTask.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTask.java index bb2169a..425ae5e 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTask.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTask.java @@ -122,9 +122,12 @@ private Stream asStream(Iterator src) { } private SourceRecord convert(FileMetadata metadata, long offset, Struct struct) { + Map offsetMap = new HashMap<>(); + offsetMap.put("offset", offset); + offsetMap.put("fileSizeBytes", metadata.getLen()); return new SourceRecord( Collections.singletonMap("path", metadata.getPath()), - Collections.singletonMap("offset", offset), + offsetMap, config.getTopic(), struct.schema(), struct diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/EmptyFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/EmptyFileReader.java new file mode 100644 index 0000000..b6567c8 --- /dev/null +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/EmptyFileReader.java @@ -0,0 +1,47 @@ +package com.github.mmolimar.kafka.connect.fs.file.reader; + +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; + +import java.util.Map; + +public class EmptyFileReader extends AbstractFileReader { + /* + An empty file reader that will always return no records + Used as a null object instead of returning null + */ + private boolean closed; + + public EmptyFileReader(FileSystem fs, Path filePath, Map config) { + super(fs, filePath, (Void record) -> null, config); + this.closed = false; + } + + @Override + protected void configure(Map config) { + } + + @Override + protected Void nextRecord() { + return null; + } + + @Override + protected boolean hasNextRecord() { + return false; + } + + @Override + public void seekFile(long offset) { + } + + @Override + public boolean isClosed() { + return this.closed; + } + + @Override + public void close() { + closed = true; + } +} diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/AbstractPolicy.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/AbstractPolicy.java index a2da257..0781364 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/AbstractPolicy.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/AbstractPolicy.java @@ -2,11 +2,11 @@ import com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig; import com.github.mmolimar.kafka.connect.fs.file.FileMetadata; +import com.github.mmolimar.kafka.connect.fs.file.reader.EmptyFileReader; import com.github.mmolimar.kafka.connect.fs.file.reader.FileReader; import com.github.mmolimar.kafka.connect.fs.util.ReflectionUtils; import com.github.mmolimar.kafka.connect.fs.util.TailCall; import com.google.common.collect.Iterators; - import org.apache.hadoop.conf.Configuration; import 
org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.LocatedFileStatus; @@ -24,6 +24,7 @@ import java.time.format.DateTimeFormatter; import java.util.*; import java.util.concurrent.atomic.AtomicLong; +import java.util.function.Supplier; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Collectors; @@ -217,19 +218,34 @@ public FileReader offer(FileMetadata metadata, OffsetStorageReader offsetStorage .filter(fs -> metadata.getPath().startsWith(fs.getWorkingDirectory().toString())) .findFirst() .orElse(null); + + Supplier makeReader = () -> ReflectionUtils.makeReader( + (Class) conf.getClass(FsSourceTaskConfig.FILE_READER_CLASS), + current, new Path(metadata.getPath()), conf.originals() + ); try { - FileReader reader = ReflectionUtils.makeReader( - (Class) conf.getClass(FsSourceTaskConfig.FILE_READER_CLASS), - current, - new Path(metadata.getPath()), conf.originals() - ); Map partition = Collections.singletonMap("path", metadata.getPath()); Map offset = offsetStorageReader.offset(partition); if (offset != null && offset.get("offset") != null) { + Object offsetObject = offset.get("offset"); + Object fileSizeBytesObject = offset.get("fileSizeBytes"); + + // Only new versions of kafka-connect-fs store the file size bytes + // If we have the byte offset, we can skip reading the entire file if the file size has not changed + if (fileSizeBytesObject != null) { + Long byteOffset = (Long) fileSizeBytesObject; + if (metadata.getLen() == byteOffset) { + log.info("Skipping file [{}] due to it was already processed.", metadata.getPath()); + return new EmptyFileReader(current, new Path(metadata.getPath()), conf.originals()); + } + } + log.info("Seeking to offset [{}] for file [{}].", offset.get("offset"), metadata.getPath()); - reader.seek((Long) offset.get("offset")); + FileReader reader = makeReader.get(); + reader.seek((Long) offsetObject); + return reader; } - return reader; + return makeReader.get(); } catch (Exception e) { throw new ConnectException("An error has occurred when creating reader for file: " + metadata.getPath(), e); } diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/task/FsSourceTaskTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/task/FsSourceTaskTest.java index 78194f2..029576b 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/task/FsSourceTaskTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/task/FsSourceTaskTest.java @@ -13,6 +13,8 @@ import org.apache.kafka.connect.source.SourceRecord; import org.apache.kafka.connect.source.SourceTaskContext; import org.apache.kafka.connect.storage.OffsetStorageReader; +import org.easymock.Capture; +import org.easymock.CaptureType; import org.easymock.EasyMock; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.AfterEach; @@ -30,6 +32,7 @@ import java.io.PrintWriter; import java.time.Duration; import java.util.*; +import java.util.concurrent.atomic.AtomicInteger; import java.util.stream.IntStream; import java.util.stream.Stream; @@ -42,6 +45,8 @@ public class FsSourceTaskTest { new HdfsFsConfig() ); private static final int NUM_RECORDS = 10; + private static final long NUM_BYTES_PER_FILE = 390; + private static final String FILE_ALREADY_PROCESSED = "already_processed.txt"; @BeforeAll public static void initFs() throws IOException { @@ -75,19 +80,31 @@ public void initTask() { OffsetStorageReader offsetStorageReader = PowerMock.createMock(OffsetStorageReader.class); EasyMock.expect(taskContext.offsetStorageReader()) - 
.andReturn(offsetStorageReader); - - EasyMock.expect(taskContext.offsetStorageReader()) - .andReturn(offsetStorageReader); - - EasyMock.expect(offsetStorageReader.offset(EasyMock.anyObject())) - .andReturn(new HashMap() {{ - put("offset", (long) (NUM_RECORDS / 2)); - }}); - EasyMock.expect(offsetStorageReader.offset(EasyMock.anyObject())) - .andReturn(new HashMap() {{ - put("offset", (long) (NUM_RECORDS / 2)); - }}); + .andReturn(offsetStorageReader) + .times(2); + + // Every time the `offsetStorageReader.offset(params)` method is called we want to capture the offset params + // And return a different result based on the offset params passed in + // In this case, returning a different result based on the file path of the params + Capture> captureOne = Capture.newInstance(CaptureType.ALL); + AtomicInteger executionNumber = new AtomicInteger(); + EasyMock.expect( + offsetStorageReader.offset(EasyMock.capture(captureOne)) + ).andAnswer(() -> { + List> capturedValues = captureOne.getValues(); + Map captured = capturedValues.get(executionNumber.get()); + executionNumber.addAndGet(1); + if (((String) (captured.get("path"))).endsWith(FILE_ALREADY_PROCESSED)) { + return new HashMap() {{ + put("offset", (long) NUM_RECORDS); + put("fileSizeBytes", NUM_BYTES_PER_FILE); + }}; + } else { + return new HashMap() {{ + put("offset", (long) NUM_RECORDS / 2); + }}; + } + }).times(2); EasyMock.checkOrder(taskContext, false); EasyMock.replay(taskContext); @@ -159,6 +176,21 @@ public void oneFilePerFs(TaskFsTestConfig fsConfig) throws IOException { assertNull(fsConfig.getTask().poll()); } + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void skipsFetchingFileIfByteOffsetExistsAndMatchesFileLength(TaskFsTestConfig fsConfig) throws IOException { + for (Path dir : fsConfig.getDirectories()) { + //this file will be skipped since the byte offset for the file is equal to the byte size of the file + Path dataFile = new Path(dir, FILE_ALREADY_PROCESSED); + createDataFile(fsConfig.getFs(), dataFile); + } + + fsConfig.getTask().start(fsConfig.getTaskConfig()); + List records = fsConfig.getTask().poll(); + assertEquals(0, records.size()); + assertNull(fsConfig.getTask().poll()); + } + @ParameterizedTest @MethodSource("fileSystemConfigProvider") public void nonExistentUri(TaskFsTestConfig fsConfig) { @@ -278,7 +310,7 @@ public void pollNoDataWithBatch(TaskFsTestConfig fsConfig) { Map props = new HashMap<>(fsConfig.getTaskConfig()); props.put(FsSourceTaskConfig.POLICY_BATCH_SIZE, "1"); fsConfig.getTask().start(props); - + assertEquals(0, fsConfig.getTask().poll().size()); //policy has ended assertNull(fsConfig.getTask().poll()); @@ -321,7 +353,7 @@ public void oneFilePerFsWithBatch(TaskFsTestConfig fsConfig) throws IOException Map props = new HashMap<>(fsConfig.getTaskConfig()); props.put(FsSourceTaskConfig.POLICY_BATCH_SIZE, "1"); fsConfig.getTask().start(props); - + List records = new ArrayList<>(); List fresh = fsConfig.getTask().poll(); while (fresh != null) { @@ -350,11 +382,11 @@ public void shouldNotSleepBetweenBatches(TaskFsTestConfig fsConfig) throws IOExc } fsConfig.getTask().start(props); - + List records = new ArrayList<>(); assertTimeoutPreemptively(Duration.ofSeconds(2), () -> { records.addAll(fsConfig.getTask().poll()); - records.addAll(fsConfig.getTask().poll()); + records.addAll(fsConfig.getTask().poll()); }); assertEquals((NUM_RECORDS * fsConfig.getDirectories().size()) / 2, records.size()); From 84179b378f250dd5d128b5905db746cfc1f66de1 Mon Sep 17 00:00:00 2001 From: Grant Nicholas 
<43971820+grantatspothero@users.noreply.github.com> Date: Fri, 26 Jun 2020 19:16:30 -0500 Subject: [PATCH 05/19] Chunk access to kafka connect offsets for better performance (#68) --- .../kafka/connect/fs/FsSourceTask.java | 12 +++++- .../connect/fs/policy/AbstractPolicy.java | 4 +- .../kafka/connect/fs/policy/Policy.java | 3 +- .../connect/fs/task/FsSourceTaskTest.java | 37 +++++++++++-------- 4 files changed, 34 insertions(+), 22 deletions(-) diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTask.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTask.java index 425ae5e..e72206e 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTask.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTask.java @@ -18,6 +18,7 @@ import java.io.IOException; import java.util.*; import java.util.concurrent.atomic.AtomicBoolean; +import java.util.function.Function; import java.util.stream.Collectors; import java.util.stream.Stream; import java.util.stream.StreamSupport; @@ -74,10 +75,17 @@ public void start(Map properties) { public List poll() { while (!stop.get() && policy != null && !policy.hasEnded()) { log.trace("Polling for new data..."); + Function> makePartitionKey = (FileMetadata metadata) -> + Collections.singletonMap("path", metadata.getPath()); - List totalRecords = filesToProcess().map(metadata -> { + // Fetch all the offsets upfront to avoid fetching offsets once per file + List filesToProcess = filesToProcess().collect(Collectors.toList()); + List> partitions = filesToProcess.stream().map(makePartitionKey).collect(Collectors.toList()); + Map, Map> offsets = context.offsetStorageReader().offsets(partitions); + + List totalRecords = filesToProcess.stream().map(metadata -> { List records = new ArrayList<>(); - try (FileReader reader = policy.offer(metadata, context.offsetStorageReader())) { + try (FileReader reader = policy.offer(metadata, offsets.get(makePartitionKey.apply(metadata)))) { log.info("Processing records for file {}.", metadata); while (reader.hasNext()) { records.add(convert(metadata, reader.currentOffset() + 1, reader.next())); diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/AbstractPolicy.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/AbstractPolicy.java index 0781364..a60416d 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/AbstractPolicy.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/AbstractPolicy.java @@ -213,7 +213,7 @@ FileMetadata toMetadata(LocatedFileStatus fileStatus) { } @Override - public FileReader offer(FileMetadata metadata, OffsetStorageReader offsetStorageReader) { + public FileReader offer(FileMetadata metadata, Map offset) { FileSystem current = fileSystems.stream() .filter(fs -> metadata.getPath().startsWith(fs.getWorkingDirectory().toString())) .findFirst() @@ -224,8 +224,6 @@ public FileReader offer(FileMetadata metadata, OffsetStorageReader offsetStorage current, new Path(metadata.getPath()), conf.originals() ); try { - Map partition = Collections.singletonMap("path", metadata.getPath()); - Map offset = offsetStorageReader.offset(partition); if (offset != null && offset.get("offset") != null) { Object offsetObject = offset.get("offset"); Object fileSizeBytesObject = offset.get("fileSizeBytes"); diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/Policy.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/Policy.java index 370288f..c53e2c0 100644 --- 
a/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/Policy.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/Policy.java @@ -8,12 +8,13 @@ import java.io.IOException; import java.util.Iterator; import java.util.List; +import java.util.Map; public interface Policy extends Closeable { Iterator execute() throws IOException; - FileReader offer(FileMetadata metadata, OffsetStorageReader offsetStorageReader) throws IOException; + FileReader offer(FileMetadata metadata, Map offset) throws IOException; boolean hasEnded(); diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/task/FsSourceTaskTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/task/FsSourceTaskTest.java index 029576b..5ba6cd5 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/task/FsSourceTaskTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/task/FsSourceTaskTest.java @@ -46,7 +46,7 @@ public class FsSourceTaskTest { ); private static final int NUM_RECORDS = 10; private static final long NUM_BYTES_PER_FILE = 390; - private static final String FILE_ALREADY_PROCESSED = "already_processed.txt"; + private static final String FILE_ALREADY_PROCESSED = "0101010101010101.txt"; @BeforeAll public static void initFs() throws IOException { @@ -83,27 +83,32 @@ public void initTask() { .andReturn(offsetStorageReader) .times(2); - // Every time the `offsetStorageReader.offset(params)` method is called we want to capture the offset params + // Every time the `offsetStorageReader.offsets(params)` method is called we want to capture the offsets params // And return a different result based on the offset params passed in // In this case, returning a different result based on the file path of the params - Capture> captureOne = Capture.newInstance(CaptureType.ALL); + Capture>> captureOne = Capture.newInstance(CaptureType.ALL); AtomicInteger executionNumber = new AtomicInteger(); EasyMock.expect( - offsetStorageReader.offset(EasyMock.capture(captureOne)) + offsetStorageReader.offsets(EasyMock.capture(captureOne)) ).andAnswer(() -> { - List> capturedValues = captureOne.getValues(); - Map captured = capturedValues.get(executionNumber.get()); + List>> capturedValues = captureOne.getValues(); + Collection> captured = capturedValues.get(executionNumber.get()); executionNumber.addAndGet(1); - if (((String) (captured.get("path"))).endsWith(FILE_ALREADY_PROCESSED)) { - return new HashMap() {{ - put("offset", (long) NUM_RECORDS); - put("fileSizeBytes", NUM_BYTES_PER_FILE); - }}; - } else { - return new HashMap() {{ - put("offset", (long) NUM_RECORDS / 2); - }}; - } + + Map, Map> map = new HashMap<>(); + captured.forEach(part ->{ + if (((String) (part.get("path"))).endsWith(FILE_ALREADY_PROCESSED)) { + map.put(part, new HashMap() {{ + put("offset", (long) NUM_RECORDS); + put("fileSizeBytes", NUM_BYTES_PER_FILE); + }}); + } else { + map.put(part, new HashMap() {{ + put("offset", (long) NUM_RECORDS / 2); + }}); + } + }); + return map; }).times(2); EasyMock.checkOrder(taskContext, false); From ed4001816826d32c0343dff58527bac5213d631e Mon Sep 17 00:00:00 2001 From: Mario Molina Date: Sat, 27 Jun 2020 17:26:40 -0500 Subject: [PATCH 06/19] ORC file reader --- pom.xml | 6 + .../fs/file/reader/AgnosticFileReader.java | 7 +- .../connect/fs/file/reader/OrcFileReader.java | 267 ++++++++++++++++++ .../file/reader/AgnosticFileReaderTest.java | 21 ++ .../fs/file/reader/OrcFileReaderTest.java | 219 ++++++++++++++ src/test/resources/log4j.properties | 1 + 6 files changed, 520 insertions(+), 1 
deletion(-) create mode 100644 src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/OrcFileReader.java create mode 100644 src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/OrcFileReaderTest.java diff --git a/pom.xml b/pom.xml index bfe7448..a3e1c0c 100644 --- a/pom.xml +++ b/pom.xml @@ -16,6 +16,7 @@ 3.2.1 hadoop3-2.1.2 1.11.0 + 1.6.3 2.8.4 9.0.2 5.6.2 @@ -74,6 +75,11 @@ parquet-avro ${parquet.version} + + org.apache.orc + orc-core + ${orc.version} + com.univocity univocity-parsers diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AgnosticFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AgnosticFileReader.java index 2630762..2d0dbe5 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AgnosticFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AgnosticFileReader.java @@ -22,6 +22,7 @@ public class AgnosticFileReader extends AbstractFileReader reader; - private Set parquetExtensions, avroExtensions, sequenceExtensions, + private Set parquetExtensions, avroExtensions, sequenceExtensions, orcExtensions, jsonExtensions, csvExtensions, tsvExtensions, fixedExtensions; public AgnosticFileReader(FileSystem fs, Path filePath, Map config) throws Exception { @@ -54,6 +55,8 @@ private AbstractFileReader readerByExtension(FileSystem fs, Path filePat clz = AvroFileReader.class; } else if (sequenceExtensions.contains(extension)) { clz = SequenceFileReader.class; + } else if (orcExtensions.contains(extension)) { + clz = OrcFileReader.class; } else if (jsonExtensions.contains(extension)) { clz = JsonFileReader.class; } else if (csvExtensions.contains(extension)) { @@ -77,6 +80,8 @@ protected void configure(Map config) { .toLowerCase().split(",")).collect(Collectors.toSet()); this.sequenceExtensions = Arrays.stream(config.getOrDefault(FILE_READER_AGNOSTIC_EXTENSIONS_SEQUENCE, "seq") .toLowerCase().split(",")).collect(Collectors.toSet()); + this.orcExtensions = Arrays.stream(config.getOrDefault(FILE_READER_AGNOSTIC_EXTENSIONS_ORC, "orc") + .toLowerCase().split(",")).collect(Collectors.toSet()); this.jsonExtensions = Arrays.stream(config.getOrDefault(FILE_READER_AGNOSTIC_EXTENSIONS_JSON, "json") .toLowerCase().split(",")).collect(Collectors.toSet()); this.csvExtensions = Arrays.stream(config.getOrDefault(FILE_READER_AGNOSTIC_EXTENSIONS_CSV, "csv") diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/OrcFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/OrcFileReader.java new file mode 100644 index 0000000..d14d738 --- /dev/null +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/OrcFileReader.java @@ -0,0 +1,267 @@ +package com.github.mmolimar.kafka.connect.fs.file.reader; + +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.exec.vector.*; +import org.apache.kafka.connect.data.Field; +import org.apache.kafka.connect.data.Schema; +import org.apache.kafka.connect.data.SchemaBuilder; +import org.apache.kafka.connect.data.Struct; +import org.apache.kafka.connect.errors.ConnectException; +import org.apache.orc.OrcFile; +import org.apache.orc.Reader; +import org.apache.orc.RecordReader; +import org.apache.orc.TypeDescription; + +import java.io.IOException; +import java.util.Collections; +import java.util.Map; +import java.util.stream.Collectors; +import java.util.stream.IntStream; +import java.util.stream.LongStream; + +import static 
com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig.FILE_READER_PREFIX; + +public class OrcFileReader extends AbstractFileReader { + + private static final String FILE_READER_ORC = FILE_READER_PREFIX + "orc."; + + public static final String FILE_READER_ORC_USE_ZEROCOPY = FILE_READER_ORC + "use_zerocopy"; + public static final String FILE_READER_ORC_SKIP_CORRUPT_RECORDS = FILE_READER_ORC + "skip_corrupt_records"; + + private final RecordReader reader; + private final VectorizedRowBatch batch; + private final Schema schema; + private final long numberOfRows; + private boolean useZeroCopy; + private boolean skipCorruptRecords; + private int vectorIndex; + private boolean closed; + + public OrcFileReader(FileSystem fs, Path filePath, Map config) throws IOException { + super(fs, filePath, new OrcToStruct(), config); + + Reader orcReader = OrcFile.createReader(filePath, OrcFile.readerOptions(fs.getConf())); + Reader.Options options = new Reader.Options(fs.getConf()) + .useZeroCopy(this.useZeroCopy) + .skipCorruptRecords(this.skipCorruptRecords); + this.reader = orcReader.rows(options); + this.numberOfRows = orcReader.getNumberOfRows(); + this.batch = orcReader.getSchema().createRowBatch(); + this.schema = hasNext() ? extractSchema(orcReader.getSchema()) : SchemaBuilder.struct().build(); + this.vectorIndex = 0; + this.closed = false; + } + + @Override + protected void configure(Map config) { + this.useZeroCopy = Boolean.parseBoolean(config.getOrDefault(FILE_READER_ORC_USE_ZEROCOPY, "false")); + this.skipCorruptRecords = Boolean.parseBoolean(config.getOrDefault(FILE_READER_ORC_SKIP_CORRUPT_RECORDS, "false")); + } + + private Schema extractSchema(TypeDescription typeDescription) { + switch (typeDescription.getCategory()) { + case BOOLEAN: + return Schema.OPTIONAL_BOOLEAN_SCHEMA; + case BYTE: + case CHAR: + return Schema.OPTIONAL_INT8_SCHEMA; + case SHORT: + return Schema.OPTIONAL_INT16_SCHEMA; + case INT: + return Schema.OPTIONAL_INT32_SCHEMA; + case DATE: + case TIMESTAMP: + case TIMESTAMP_INSTANT: + case LONG: + return Schema.OPTIONAL_INT64_SCHEMA; + case FLOAT: + return Schema.OPTIONAL_FLOAT32_SCHEMA; + case DECIMAL: + case DOUBLE: + return Schema.OPTIONAL_FLOAT64_SCHEMA; + case VARCHAR: + case STRING: + return Schema.OPTIONAL_STRING_SCHEMA; + case BINARY: + return Schema.OPTIONAL_BYTES_SCHEMA; + case LIST: + Schema arraySchema = typeDescription.getChildren().stream() + .findFirst().map(this::extractSchema) + .orElse(SchemaBuilder.struct().build()); + return SchemaBuilder.array(arraySchema).build(); + case UNION: + // union data types are mapped as structs with a faked key. 
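+ // e.g. an ORC uniontype<int,string> is exposed as a struct schema with fields
+ // "field1" (optional int32) and "field2" (optional string).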
+ SchemaBuilder mapBuilder = SchemaBuilder.struct(); + IntStream.range(0, typeDescription.getChildren().size()) + .forEach(index -> mapBuilder.field( + "field" + (index + 1), + extractSchema(typeDescription.getChildren().get(index)) + )); + return mapBuilder.build(); + case STRUCT: + SchemaBuilder structBuilder = SchemaBuilder.struct(); + IntStream.range(0, typeDescription.getChildren().size()) + .forEach(index -> + structBuilder.field(typeDescription.getFieldNames().get(index), + extractSchema(typeDescription.getChildren().get(index)))); + return structBuilder.build(); + case MAP: + return SchemaBuilder.map(extractSchema(typeDescription.getChildren().get(0)), + extractSchema(typeDescription.getChildren().get(1))) + .build(); + default: + throw new ConnectException("Data type '" + typeDescription.getCategory() + "' in ORC file " + + "is not supported."); + } + } + + @Override + public boolean hasNextRecord() throws IOException { + if (vectorIndex >= batch.size) { + vectorIndex = 0; + reader.nextBatch(batch); + } + return batch.size > 0; + } + + @Override + protected OrcRecord nextRecord() { + incrementOffset(); + return new OrcRecord(schema, batch, vectorIndex++); + } + + @Override + public void seekFile(long offset) throws IOException { + reader.seekToRow(Math.min(offset, numberOfRows - 1)); + if (offset >= numberOfRows) { + reader.nextBatch(batch); + } + setOffset(offset); + vectorIndex = Integer.MAX_VALUE; + } + + @Override + public void close() throws IOException { + closed = true; + reader.close(); + } + + @Override + public boolean isClosed() { + return closed; + } + + static class OrcToStruct implements ReaderAdapter { + + @Override + public Struct apply(OrcRecord record) { + return toStruct(record.schema, record.batch, record.vectorIndex); + } + + private Struct toStruct(Schema schema, VectorizedRowBatch batch, int vectorIndex) { + Struct struct = new Struct(schema); + IntStream.range(0, schema.fields().size()) + .forEach(index -> { + Field field = schema.fields().get(index); + struct.put(field.name(), mapValue(field.schema(), batch.cols[index], vectorIndex)); + }); + return struct; + } + + private Object mapValue(Schema schema, ColumnVector value, long vectorIndex) { + switch (value.getClass().getSimpleName()) { + case "BytesColumnVector": + BytesColumnVector bytes = ((BytesColumnVector) value); + if (bytes.vector[(int) vectorIndex] == null) { + return null; + } + String content = new String( + bytes.vector[(int) vectorIndex], + bytes.start[(int) vectorIndex], + bytes.length[(int) vectorIndex] + ); + switch (schema.type()) { + case INT8: + return (byte) content.charAt(0); + case BYTES: + return content.getBytes(); + default: + return content; + } + case "DecimalColumnVector": + return ((DecimalColumnVector) value).vector[(int) vectorIndex].doubleValue(); + case "DoubleColumnVector": + if (schema.type() == Schema.Type.FLOAT32) { + return (float) ((DoubleColumnVector) value).vector[(int) vectorIndex]; + } + return ((DoubleColumnVector) value).vector[(int) vectorIndex]; + case "IntervalDayTimeColumnVector": + return ((IntervalDayTimeColumnVector) value).getNanos(0); + case "Decimal64ColumnVector": + case "DateColumnVector": + case "LongColumnVector": + long castedValue = ((LongColumnVector) value).vector[(int) vectorIndex]; + switch (schema.type()) { + case BOOLEAN: + return castedValue != 0; + case INT8: + return (byte) castedValue; + case INT16: + return (short) castedValue; + case INT32: + return (int) castedValue; + default: + return castedValue; + } + case "ListColumnVector": + 
ListColumnVector list = ((ListColumnVector) value); + return LongStream.range(0, list.lengths[(int) vectorIndex]) + .mapToObj(index -> mapValue(schema.valueSchema(), list.child, list.offsets[(int) vectorIndex])) + .collect(Collectors.toList()); + case "MapColumnVector": + MapColumnVector map = ((MapColumnVector) value); + return Collections.singletonMap( + mapValue(schema.keySchema(), map.keys, map.offsets[(int) vectorIndex]), + mapValue(schema.valueSchema(), map.values, map.offsets[(int) vectorIndex]) + ); + case "UnionColumnVector": + case "StructColumnVector": + ColumnVector[] fields; + if (value instanceof StructColumnVector) { + fields = ((StructColumnVector) value).fields; + } else { + fields = ((UnionColumnVector) value).fields; + } + Struct struct = new Struct(schema); + IntStream.range(0, fields.length) + .forEach(index -> { + String structKey = schema.fields().get(index).name(); + Object structValue = mapValue(schema.fields().get(index).schema(), + fields[index], vectorIndex); + struct.put(structKey, structValue); + }); + return struct; + case "TimestampColumnVector": + return ((TimestampColumnVector) value).time[(int) vectorIndex]; + case "VoidColumnVector": + default: + return null; + } + } + } + + static class OrcRecord { + + private final Schema schema; + private final VectorizedRowBatch batch; + private final int vectorIndex; + + OrcRecord(Schema schema, VectorizedRowBatch batch, int vectorIndex) { + this.schema = schema; + this.batch = batch; + this.vectorIndex = vectorIndex; + } + + } +} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/AgnosticFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/AgnosticFileReaderTest.java index ab44e27..3c01e86 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/AgnosticFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/AgnosticFileReaderTest.java @@ -181,4 +181,25 @@ public String getFileExtension() { } } + @Nested + class AgnosticOrcFileReaderTest extends OrcFileReaderTest { + + @Override + protected Map getReaderConfig() { + Map config = super.getReaderConfig(); + config.put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_ORC, getFileExtension()); + return config; + } + + @Override + public Class getReaderClass() { + return AgnosticFileReader.class; + } + + @Override + public String getFileExtension() { + return FILE_EXTENSION; + } + } + } diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/OrcFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/OrcFileReaderTest.java new file mode 100644 index 0000000..75e9f7a --- /dev/null +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/OrcFileReaderTest.java @@ -0,0 +1,219 @@ +package com.github.mmolimar.kafka.connect.fs.file.reader; + +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.common.type.HiveDecimal; +import org.apache.hadoop.hive.ql.exec.vector.*; +import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; +import org.apache.kafka.connect.data.Struct; +import org.apache.orc.OrcConf; +import org.apache.orc.OrcFile; +import org.apache.orc.TypeDescription; +import org.apache.orc.Writer; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.MethodSource; + +import java.io.File; +import java.io.IOException; +import java.util.*; + +import static org.junit.jupiter.api.Assertions.*; + +public class OrcFileReaderTest extends FileReaderTestBase 
{ + + private static final String FIELD_INTEGER = "integerField"; + private static final String FIELD_LONG = "longField"; + private static final String FIELD_BOOLEAN = "booleanField"; + private static final String FIELD_STRING = "stringField"; + private static final String FIELD_DECIMAL = "decimalField"; + private static final String FIELD_ARRAY = "arrayField"; + private static final String FIELD_MAP = "mapField"; + private static final String FIELD_STRUCT = "structField"; + private static final String FIELD_UNION = "unionField"; + private static final String FIELD_EMPTY = "emptyField"; + private static final String FIELD_BYTE = "byteField"; + private static final String FIELD_CHAR = "charField"; + private static final String FIELD_SHORT = "shortField"; + private static final String FIELD_FLOAT = "floatField"; + private static final String FIELD_BINARY = "binaryField"; + private static final String FIELD_DATE = "dateField"; + private static final String FIELD_TIMESTAMP = "timestampField"; + private static final String FIELD_TIMESTAMP_INSTANT = "timestampInstantField"; + private static final String FILE_EXTENSION = "rc"; + + @Override + protected Path createDataFile(ReaderFsTestConfig fsConfig, Object... args) throws IOException { + int numRecords = args.length < 1 ? NUM_RECORDS : (int) args[0]; + File orcFile = File.createTempFile("test-", "." + getFileExtension()); + + TypeDescription schema = TypeDescription.createStruct(); + schema.addField(FIELD_INTEGER, TypeDescription.createInt()); + schema.addField(FIELD_LONG, TypeDescription.createLong()); + schema.addField(FIELD_STRING, TypeDescription.createString()); + schema.addField(FIELD_BOOLEAN, TypeDescription.createBoolean()); + schema.addField(FIELD_DECIMAL, TypeDescription.createDecimal()); + schema.addField(FIELD_EMPTY, TypeDescription.createVarchar()); + schema.addField(FIELD_BYTE, TypeDescription.createByte()); + schema.addField(FIELD_CHAR, TypeDescription.createChar()); + schema.addField(FIELD_SHORT, TypeDescription.createShort()); + schema.addField(FIELD_FLOAT, TypeDescription.createFloat()); + schema.addField(FIELD_BINARY, TypeDescription.createBinary()); + schema.addField(FIELD_DATE, TypeDescription.createDate()); + schema.addField(FIELD_TIMESTAMP, TypeDescription.createTimestamp()); + schema.addField(FIELD_TIMESTAMP_INSTANT, TypeDescription.createTimestampInstant()); + schema.addField(FIELD_ARRAY, TypeDescription.createList(TypeDescription.createString())); + schema.addField(FIELD_MAP, TypeDescription.createMap(TypeDescription.createString(), TypeDescription.createString())); + TypeDescription struct = TypeDescription.createStruct(); + struct.addField(FIELD_INTEGER, TypeDescription.createInt()); + struct.addField(FIELD_LONG, TypeDescription.createLong()); + struct.addField(FIELD_STRING, TypeDescription.createString()); + struct.addField(FIELD_BOOLEAN, TypeDescription.createBoolean()); + struct.addField(FIELD_DECIMAL, TypeDescription.createDecimal()); + struct.addField(FIELD_EMPTY, TypeDescription.createVarchar()); + schema.addField(FIELD_STRUCT, struct); + TypeDescription union = TypeDescription.createUnion(); + union.addUnionChild(TypeDescription.createInt()); + schema.addField(FIELD_UNION, union); + + Properties props = new Properties(); + props.put(OrcConf.OVERWRITE_OUTPUT_FILE.getAttribute(), "true"); + OrcFile.WriterOptions opts = OrcFile.writerOptions(props, fsConfig.getFs().getConf()).setSchema(schema); + try (Writer writer = OrcFile.createWriter(new Path(orcFile.toURI()), opts)) { + VectorizedRowBatch batch = 
schema.createRowBatch(numRecords); + LongColumnVector f1 = (LongColumnVector) batch.cols[0]; + LongColumnVector f2 = (LongColumnVector) batch.cols[1]; + BytesColumnVector f3 = (BytesColumnVector) batch.cols[2]; + LongColumnVector f4 = (LongColumnVector) batch.cols[3]; + DecimalColumnVector f5 = (DecimalColumnVector) batch.cols[4]; + BytesColumnVector f6 = (BytesColumnVector) batch.cols[5]; + LongColumnVector f7 = (LongColumnVector) batch.cols[6]; + BytesColumnVector f8 = (BytesColumnVector) batch.cols[7]; + LongColumnVector f9 = (LongColumnVector) batch.cols[8]; + DoubleColumnVector f10 = (DoubleColumnVector) batch.cols[9]; + BytesColumnVector f11 = (BytesColumnVector) batch.cols[10]; + DateColumnVector f12 = (DateColumnVector) batch.cols[11]; + TimestampColumnVector f13 = (TimestampColumnVector) batch.cols[12]; + TimestampColumnVector f14 = (TimestampColumnVector) batch.cols[13]; + ListColumnVector f15 = (ListColumnVector) batch.cols[14]; + MapColumnVector f16 = (MapColumnVector) batch.cols[15]; + StructColumnVector f17 = (StructColumnVector) batch.cols[16]; + UnionColumnVector f18 = (UnionColumnVector) batch.cols[17]; + + for (int index = 0; index < numRecords; index++) { + f1.vector[index] = index; + f2.vector[index] = Long.MAX_VALUE; + f3.setVal(index, String.format("%d_%s", index, UUID.randomUUID()).getBytes()); + f4.vector[index] = 1; + f5.vector[index] = new HiveDecimalWritable(HiveDecimal.create(Double.parseDouble(index + "." + index))); + f6.setVal(index, new byte[0]); + + f7.vector[index] = index; + f8.setVal(index, new byte[]{(byte) (index % 32)}); + f9.vector[index] = (short) index; + f10.vector[index] = Float.parseFloat(index + "." + index); + f11.setVal(index, String.format("%d_%s", index, UUID.randomUUID()).getBytes()); + f12.vector[index] = Calendar.getInstance().getTimeInMillis(); + f13.time[index] = Calendar.getInstance().getTimeInMillis(); + f14.time[index] = Calendar.getInstance().getTimeInMillis(); + + ((BytesColumnVector) f15.child).setVal(index, ("elm[" + index + "]").getBytes()); + f15.lengths[index] = 1; + f15.offsets[index] = f15.childCount++; + + ((BytesColumnVector) f16.keys).setVal(index, ("key[" + index + "]").getBytes()); + ((BytesColumnVector) f16.values).setVal(index, ("value[" + index + "]").getBytes()); + f16.lengths[index] = 1; + f16.offsets[index] = f16.childCount++; + + ((LongColumnVector) f17.fields[0]).vector[index] = index; + ((LongColumnVector) f17.fields[1]).vector[index] = Long.MAX_VALUE; + ((BytesColumnVector) f17.fields[2]).setVal(index, + String.format("%d_%s", index, UUID.randomUUID()).getBytes()); + ((LongColumnVector) f17.fields[3]).vector[index] = 1; + ((DecimalColumnVector) f17.fields[4]).vector[index] = + new HiveDecimalWritable(HiveDecimal.create(Double.parseDouble(index + "." + index))); + ((BytesColumnVector) f17.fields[5]).setVal(index, new byte[0]); + + ((LongColumnVector) f18.fields[0]).vector[index] = index; + + fsConfig.offsetsByIndex().put(index, (long) index); + } + batch.size = numRecords; + writer.addRowBatch(batch); + } + Path path = new Path(new Path(fsConfig.getFsUri()), orcFile.getName()); + fsConfig.getFs().moveFromLocalFile(new Path(orcFile.getAbsolutePath()), path); + return path; + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void emptyFile(ReaderFsTestConfig fsConfig) throws IOException { + File tmp = File.createTempFile("test-", "." 
+ getFileExtension()); + Path path = new Path(new Path(fsConfig.getFsUri()), tmp.getName()); + fsConfig.getFs().moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); + FileReader reader = getReader(fsConfig.getFs(), path, getReaderConfig()); + assertFalse(reader.hasNext()); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void useZeroCopyConfig(ReaderFsTestConfig fsConfig) { + Map readerConfig = getReaderConfig(); + readerConfig.put(OrcFileReader.FILE_READER_ORC_USE_ZEROCOPY, "true"); + fsConfig.setReader(getReader(fsConfig.getFs(), fsConfig.getDataFile(), readerConfig)); + readAllData(fsConfig); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void skipCorruptRecordsConfig(ReaderFsTestConfig fsConfig) { + Map readerConfig = getReaderConfig(); + readerConfig.put(OrcFileReader.FILE_READER_ORC_SKIP_CORRUPT_RECORDS, "true"); + fsConfig.setReader(getReader(fsConfig.getFs(), fsConfig.getDataFile(), readerConfig)); + readAllData(fsConfig); + } + + @Override + protected Class getReaderClass() { + return OrcFileReader.class; + } + + @Override + protected Map getReaderConfig() { + return new HashMap<>(); + } + + @Override + protected void checkData(Struct record, long index) { + Struct struct = record.getStruct(FIELD_STRUCT); + Struct union = record.getStruct(FIELD_UNION); + assertAll( + () -> assertEquals(index, (int) (Integer) record.get(FIELD_INTEGER)), + () -> assertEquals(Long.MAX_VALUE, (long) (Long) record.get(FIELD_LONG)), + () -> assertTrue(record.get(FIELD_STRING).toString().startsWith(index + "_")), + () -> assertTrue(Boolean.parseBoolean(record.get(FIELD_BOOLEAN).toString())), + () -> assertEquals(Double.parseDouble(index + "." + index), (Double) record.get(FIELD_DECIMAL), 0), + () -> assertEquals("", record.get(FIELD_EMPTY)), + () -> assertEquals(index, ((Byte) record.get(FIELD_BYTE)).intValue()), + () -> assertEquals((byte) (index % 32), record.get(FIELD_CHAR)), + () -> assertEquals((short) index, (short) record.get(FIELD_SHORT)), + () -> assertEquals(Float.parseFloat(index + "." + index), record.get(FIELD_FLOAT)), + () -> assertTrue(new String((byte[]) record.get(FIELD_BINARY)).startsWith(index + "_")), + () -> assertEquals(Collections.singletonMap("key[" + index + "]", "value[" + index + "]"), record.get(FIELD_MAP)), + () -> assertEquals(Collections.singletonList("elm[" + index + "]"), record.get(FIELD_ARRAY)), + () -> assertEquals(index, (int) (Integer) struct.get(FIELD_INTEGER)), + () -> assertEquals(Long.MAX_VALUE, (long) (Long) struct.get(FIELD_LONG)), + () -> assertTrue(struct.get(FIELD_STRING).toString().startsWith(index + "_")), + () -> assertTrue(Boolean.parseBoolean(struct.get(FIELD_BOOLEAN).toString())), + () -> assertEquals(Double.parseDouble(index + "." 
+ index), (Double) struct.get(FIELD_DECIMAL), 0), + () -> assertEquals("", struct.get(FIELD_EMPTY)), + () -> assertNotNull(struct.schema().field(FIELD_EMPTY)), + () -> assertEquals((int) index, union.get("field1")) + ); + } + + @Override + protected String getFileExtension() { + return FILE_EXTENSION; + } +} diff --git a/src/test/resources/log4j.properties b/src/test/resources/log4j.properties index bb7782f..b926aa8 100644 --- a/src/test/resources/log4j.properties +++ b/src/test/resources/log4j.properties @@ -11,5 +11,6 @@ log4j.logger.com.github.mmolimar.kafka.connect.fs=TRACE log4j.logger.org.apache.hadoop=ERROR log4j.logger.BlockStateChange=WARN log4j.logger.org.apache.parquet=WARN +log4j.logger.org.apache.orc=WARN log4j.logger.org.eclipse.jetty=WARN log4j.logger.io.confluent.connect.avro=WARN From 2c094e6a9044b790b7e6c5b6a6b0a8fb8b3b6bf4 Mon Sep 17 00:00:00 2001 From: Mario Molina Date: Sat, 27 Jun 2020 18:12:07 -0500 Subject: [PATCH 07/19] Updating docs --- docs/source/config_options.rst | 31 ++++++++++++++++++++++++++++++- docs/source/filereaders.rst | 19 ++++++++++++++++++- 2 files changed, 48 insertions(+), 2 deletions(-) diff --git a/docs/source/config_options.rst b/docs/source/config_options.rst index 47995e6..b160dda 100644 --- a/docs/source/config_options.rst +++ b/docs/source/config_options.rst @@ -250,7 +250,29 @@ In order to configure custom properties for this reader, the name you must use i * Type: string * Importance: medium -.. _config_options-filereaders-sequencefile: +.. _config_options-filereaders-orc: + +ORC +-------------------------------------------- + +In order to configure custom properties for this reader, the name you must use is ``orc``. + +``file_reader.orc.use_zerocopy`` + Use zero-copy when reading an ORC file. + + * Type: boolean + * Default: ``false`` + * Importance: medium + +``file_reader.orc.skip_corrupt_records`` + Whether the reader should skip corrupt records or not. If disabled, an exception will be thrown when there is + corrupted data in the file. + + * Type: boolean + * Default: ``false`` + * Importance: medium + +.. _config_options-filereaders-sequencefile: SequenceFile -------------------------------------------- @@ -882,6 +904,13 @@ To configure custom properties for this reader, the name you must use is ``agnos * Default: ``avro`` * Importance: medium +``file_reader.agnostic.extensions.orc`` + A comma-separated string list with the accepted extensions for ORC files. + + * Type: string + * Default: ``orc`` + * Importance: medium + ``file_reader.agnostic.extensions.sequence`` A comma-separated string list with the accepted extensions for Sequence files. diff --git a/docs/source/filereaders.rst b/docs/source/filereaders.rst index f887499..d38a5e9 100644 --- a/docs/source/filereaders.rst +++ b/docs/source/filereaders.rst @@ -26,6 +26,22 @@ way as the Avro file reader does. More information about properties of this file reader :ref:`here`. +ORC +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +`ORC files `__ use a self-describing, type-aware +columnar format designed for Hadoop workloads. + +This reader can process this file format, translating its schema and building +a Kafka message with the content. + +.. warning:: If you have ORC files with ``union`` data types, this sort of + data type will be transformed into a ``map`` object in the Kafka message. + Each key will be named ``fieldN``, where ``N`` represents + the index within the data type. + +More information about properties of this file reader :ref:`here`.
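As a quick illustration of the ORC options above, a minimal source connector configuration using this reader could look like the following sketch. The connector, policy and reader class names come from this repository and the ``file_reader.orc.*`` properties are the ones documented above, but the URIs, topic name and regular expression are only placeholder values:

    # Hypothetical example config; fs.uris, topic and policy.regexp are placeholder values.
    name=orc-source-example
    connector.class=com.github.mmolimar.kafka.connect.fs.FsSourceConnector
    tasks.max=1
    fs.uris=hdfs://localhost:9000/data
    topic=orc_topic
    policy.class=com.github.mmolimar.kafka.connect.fs.policy.SimplePolicy
    policy.regexp=.*\.orc$
    # ORC reader settings described above.
    file_reader.class=com.github.mmolimar.kafka.connect.fs.file.reader.OrcFileReader
    file_reader.orc.use_zerocopy=false
    file_reader.orc.skip_corrupt_records=true

With the Agnostic file reader instead, files whose extension matches ``file_reader.agnostic.extensions.orc`` (default ``orc``) would be routed to this same ORC reader.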
+ SequenceFile ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -100,13 +116,14 @@ Agnostic Actually, this reader is a wrapper of the readers listing above. It tries to read any kind of file format using an internal reader based on the file extension, -applying the proper one (Parquet, Avro, SequenceFile, CSV, TSV or Text). In case of no +applying the proper one (Parquet, Avro, ORC, SequenceFile, CSV, TSV or Text). In case of no extension has been matched, the Text file reader will be applied. Default extensions for each format (configurable): * Parquet: ``.parquet`` * Avro: ``.avro`` +* ORC: ``.orc`` * SequenceFile: ``.seq`` * JSON: ``.json`` * CSV: ``.csv`` From 4a360388932b72ca4134bbc9b81abf841b6fb936 Mon Sep 17 00:00:00 2001 From: Mario Molina Date: Sun, 28 Jun 2020 11:14:00 -0500 Subject: [PATCH 08/19] ORC files without nested fields --- .../connect/fs/file/reader/OrcFileReader.java | 12 +++- .../fs/file/reader/OrcFileReaderTest.java | 57 +++++++++++++++++-- 2 files changed, 64 insertions(+), 5 deletions(-) diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/OrcFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/OrcFileReader.java index d14d738..9db0edd 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/OrcFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/OrcFileReader.java @@ -48,7 +48,7 @@ public OrcFileReader(FileSystem fs, Path filePath, Map config) t this.reader = orcReader.rows(options); this.numberOfRows = orcReader.getNumberOfRows(); this.batch = orcReader.getSchema().createRowBatch(); - this.schema = hasNext() ? extractSchema(orcReader.getSchema()) : SchemaBuilder.struct().build(); + this.schema = hasNext() ? buildSchema(orcReader.getSchema()) : SchemaBuilder.struct().build(); this.vectorIndex = 0; this.closed = false; } @@ -59,6 +59,16 @@ protected void configure(Map config) { this.skipCorruptRecords = Boolean.parseBoolean(config.getOrDefault(FILE_READER_ORC_SKIP_CORRUPT_RECORDS, "false")); } + private Schema buildSchema(TypeDescription typeDescription) { + TypeDescription td; + if (typeDescription.getChildren() == null || typeDescription.getChildren().isEmpty()) { + td = TypeDescription.createStruct().addField(typeDescription.getCategory().getName(), typeDescription); + } else { + td = typeDescription; + } + return extractSchema(td); + } + private Schema extractSchema(TypeDescription typeDescription) { switch (typeDescription.getCategory()) { case BOOLEAN: diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/OrcFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/OrcFileReaderTest.java index 75e9f7a..a34a7de 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/OrcFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/OrcFileReaderTest.java @@ -145,6 +145,31 @@ protected Path createDataFile(ReaderFsTestConfig fsConfig, Object... args) throw return path; } + private Path createDataFileWithoutStruct(ReaderFsTestConfig fsConfig, Object... args) throws IOException { + int numRecords = args.length < 1 ? NUM_RECORDS : (int) args[0]; + File orcFile = File.createTempFile("test-", "." 
+ getFileExtension()); + + TypeDescription schema = TypeDescription.createLong(); + Properties props = new Properties(); + props.put(OrcConf.OVERWRITE_OUTPUT_FILE.getAttribute(), "true"); + OrcFile.WriterOptions opts = OrcFile.writerOptions(props, fsConfig.getFs().getConf()).setSchema(schema); + try (Writer writer = OrcFile.createWriter(new Path(orcFile.toURI()), opts)) { + VectorizedRowBatch batch = schema.createRowBatch(numRecords); + LongColumnVector longField = (LongColumnVector) batch.cols[0]; + + for (int index = 0; index < numRecords; index++) { + longField.vector[index] = index; + + fsConfig.offsetsByIndex().put(index, (long) index); + } + batch.size = numRecords; + writer.addRowBatch(batch); + } + Path path = new Path(new Path(fsConfig.getFsUri()), orcFile.getName()); + fsConfig.getFs().moveFromLocalFile(new Path(orcFile.getAbsolutePath()), path); + return path; + } + @ParameterizedTest @MethodSource("fileSystemConfigProvider") public void emptyFile(ReaderFsTestConfig fsConfig) throws IOException { @@ -173,6 +198,24 @@ public void skipCorruptRecordsConfig(ReaderFsTestConfig fsConfig) { readAllData(fsConfig); } + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void readDataWithoutStruct(ReaderFsTestConfig fsConfig) throws IOException { + Path file = createDataFileWithoutStruct(fsConfig, NUM_RECORDS); + FileReader reader = getReader(fsConfig.getFs(), file, getReaderConfig()); + + assertTrue(reader.hasNext()); + + int recordCount = 0; + while (reader.hasNext()) { + Struct record = reader.next(); + checkDataWithoutStruct(record, recordCount); + recordCount++; + } + reader.close(); + assertEquals(NUM_RECORDS, recordCount, "The number of records in the file does not match"); + } + @Override protected Class getReaderClass() { return OrcFileReader.class; @@ -188,8 +231,8 @@ protected void checkData(Struct record, long index) { Struct struct = record.getStruct(FIELD_STRUCT); Struct union = record.getStruct(FIELD_UNION); assertAll( - () -> assertEquals(index, (int) (Integer) record.get(FIELD_INTEGER)), - () -> assertEquals(Long.MAX_VALUE, (long) (Long) record.get(FIELD_LONG)), + () -> assertEquals(index, (int) record.get(FIELD_INTEGER)), + () -> assertEquals(Long.MAX_VALUE, (long) record.get(FIELD_LONG)), () -> assertTrue(record.get(FIELD_STRING).toString().startsWith(index + "_")), () -> assertTrue(Boolean.parseBoolean(record.get(FIELD_BOOLEAN).toString())), () -> assertEquals(Double.parseDouble(index + "." + index), (Double) record.get(FIELD_DECIMAL), 0), @@ -199,10 +242,12 @@ protected void checkData(Struct record, long index) { () -> assertEquals((short) index, (short) record.get(FIELD_SHORT)), () -> assertEquals(Float.parseFloat(index + "." 
+ index), record.get(FIELD_FLOAT)), () -> assertTrue(new String((byte[]) record.get(FIELD_BINARY)).startsWith(index + "_")), + () -> assertNotNull(record.get(FIELD_TIMESTAMP)), + () -> assertNotNull(record.get(FIELD_TIMESTAMP_INSTANT)), () -> assertEquals(Collections.singletonMap("key[" + index + "]", "value[" + index + "]"), record.get(FIELD_MAP)), () -> assertEquals(Collections.singletonList("elm[" + index + "]"), record.get(FIELD_ARRAY)), - () -> assertEquals(index, (int) (Integer) struct.get(FIELD_INTEGER)), - () -> assertEquals(Long.MAX_VALUE, (long) (Long) struct.get(FIELD_LONG)), + () -> assertEquals(index, (int) struct.get(FIELD_INTEGER)), + () -> assertEquals(Long.MAX_VALUE, (long) struct.get(FIELD_LONG)), () -> assertTrue(struct.get(FIELD_STRING).toString().startsWith(index + "_")), () -> assertTrue(Boolean.parseBoolean(struct.get(FIELD_BOOLEAN).toString())), () -> assertEquals(Double.parseDouble(index + "." + index), (Double) struct.get(FIELD_DECIMAL), 0), @@ -212,6 +257,10 @@ protected void checkData(Struct record, long index) { ); } + private void checkDataWithoutStruct(Struct record, long index) { + assertEquals(index, (long) record.get("bigint")); + } + @Override protected String getFileExtension() { return FILE_EXTENSION; From 43036c419129f9d9d630dfd60d376190ca4dc05d Mon Sep 17 00:00:00 2001 From: Mario Molina Date: Sun, 28 Jun 2020 11:16:17 -0500 Subject: [PATCH 09/19] Fix in schema generation for JSON records with arrays of objects --- .../fs/file/reader/JsonFileReader.java | 8 ++- .../fs/file/reader/JsonFileReaderTest.java | 57 ++++++++++++++----- 2 files changed, 49 insertions(+), 16 deletions(-) diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReader.java index 3fabc01..4036250 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReader.java @@ -157,8 +157,10 @@ private Struct toStruct(Schema schema, JsonNode jsonNode) { if (jsonNode.isNull()) return null; Struct struct = new Struct(schema); jsonNode.fields() - .forEachRemaining(field -> struct.put(field.getKey(), - mapValue(struct.schema().field(field.getKey()).schema(), field.getValue()))); + .forEachRemaining(field -> struct.put( + field.getKey(), + mapValue(struct.schema().field(field.getKey()).schema(), field.getValue()) + )); return struct; } @@ -204,7 +206,7 @@ private Object mapValue(Schema schema, JsonNode value) { case ARRAY: Iterable arrayElements = value::elements; return StreamSupport.stream(arrayElements.spliterator(), false) - .map(elm -> mapValue(schema, elm)) + .map(elm -> mapValue(schema.valueSchema(), elm)) .collect(Collectors.toList()); case NULL: case MISSING: diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReaderTest.java index 98e7e5b..446a3f8 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReaderTest.java @@ -4,6 +4,7 @@ import com.fasterxml.jackson.databind.DeserializationFeature; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectWriter; +import com.fasterxml.jackson.databind.node.ArrayNode; import com.fasterxml.jackson.databind.node.JsonNodeFactory; import 
com.fasterxml.jackson.databind.node.ObjectNode; import org.apache.hadoop.fs.Path; @@ -16,10 +17,7 @@ import java.io.IOException; import java.io.PrintWriter; import java.nio.charset.UnsupportedCharsetException; -import java.util.Arrays; -import java.util.HashMap; -import java.util.Map; -import java.util.UUID; +import java.util.*; import java.util.stream.IntStream; import static org.junit.jupiter.api.Assertions.*; @@ -31,7 +29,8 @@ public class JsonFileReaderTest extends FileReaderTestBase { private static final String FIELD_BOOLEAN = "booleanField"; private static final String FIELD_STRING = "stringField"; private static final String FIELD_DECIMAL = "decimalField"; - private static final String FIELD_ARRAY = "arrayField"; + private static final String FIELD_ARRAY_SIMPLE = "arraySimpleField"; + private static final String FIELD_ARRAY_COMPLEX = "arrayComplexField"; private static final String FIELD_STRUCT = "structField"; private static final String FIELD_NULL = "nullField"; private static final String FILE_EXTENSION = "jsn"; @@ -53,9 +52,24 @@ protected Path createDataFile(ReaderFsTestConfig fsConfig, Object... args) throw .put(FIELD_BOOLEAN, true) .put(FIELD_DECIMAL, Double.parseDouble(index + "." + index)) .put(FIELD_NULL, (String) null); - json.putArray(FIELD_ARRAY) + json.putArray(FIELD_ARRAY_SIMPLE) .add("elm[" + index + "]") .add("elm[" + (index + 1) + "]"); + ArrayNode array = json.putArray(FIELD_ARRAY_COMPLEX); + array.addObject() + .put(FIELD_INTEGER, index) + .put(FIELD_LONG, Long.MAX_VALUE) + .put(FIELD_STRING, String.format("%d_%s", index, UUID.randomUUID())) + .put(FIELD_BOOLEAN, true) + .put(FIELD_DECIMAL, Double.parseDouble(index + "." + index)) + .put(FIELD_NULL, (String) null); + array.addObject() + .put(FIELD_INTEGER, index + 1) + .put(FIELD_LONG, Long.MAX_VALUE) + .put(FIELD_STRING, String.format("%d_%s", index, UUID.randomUUID())) + .put(FIELD_BOOLEAN, true) + .put(FIELD_DECIMAL, Double.parseDouble(index + "." + index)) + .put(FIELD_NULL, (String) null); json.putObject(FIELD_STRUCT) .put(FIELD_INTEGER, (short) index) .put(FIELD_LONG, Long.MAX_VALUE) @@ -181,21 +195,38 @@ protected Map getReaderConfig() { @Override protected void checkData(Struct record, long index) { + List array = record.getArray(FIELD_ARRAY_COMPLEX); Struct subrecord = record.getStruct(FIELD_STRUCT); assertAll( - () -> assertEquals((int) (Integer) record.get(FIELD_INTEGER), index), - () -> assertEquals((long) (Long) record.get(FIELD_LONG), Long.MAX_VALUE), + () -> assertEquals(index, (int) record.get(FIELD_INTEGER)), + () -> assertEquals(Long.MAX_VALUE, (long) record.get(FIELD_LONG)), () -> assertTrue(record.get(FIELD_STRING).toString().startsWith(index + "_")), () -> assertTrue(Boolean.parseBoolean(record.get(FIELD_BOOLEAN).toString())), - () -> assertEquals((Double) record.get(FIELD_DECIMAL), Double.parseDouble(index + "." + index), 0), + () -> assertEquals(Double.parseDouble(index + "." 
+ index), (Double) record.get(FIELD_DECIMAL), 0), () -> assertNull(record.get(FIELD_NULL)), () -> assertNotNull(record.schema().field(FIELD_NULL)), - () -> assertEquals(record.get(FIELD_ARRAY), Arrays.asList("elm[" + index + "]", "elm[" + (index + 1) + "]")), - () -> assertEquals((int) (Integer) subrecord.get(FIELD_INTEGER), index), - () -> assertEquals((long) (Long) subrecord.get(FIELD_LONG), Long.MAX_VALUE), + () -> assertEquals(Arrays.asList("elm[" + index + "]", "elm[" + (index + 1) + "]"), record.get(FIELD_ARRAY_SIMPLE)), + + () -> assertEquals(index, (int) array.get(0).get(FIELD_INTEGER)), + () -> assertEquals(Long.MAX_VALUE, (long) array.get(0).get(FIELD_LONG)), + () -> assertTrue(array.get(0).get(FIELD_STRING).toString().startsWith(index + "_")), + () -> assertTrue(Boolean.parseBoolean(array.get(0).get(FIELD_BOOLEAN).toString())), + () -> assertEquals(Double.parseDouble(index + "." + index), (Double) array.get(0).get(FIELD_DECIMAL), 0), + () -> assertNull(array.get(0).get(FIELD_NULL)), + () -> assertNotNull(array.get(0).schema().field(FIELD_NULL)), + () -> assertEquals(index + 1, (int) array.get(1).get(FIELD_INTEGER)), + () -> assertEquals(Long.MAX_VALUE, (long) array.get(1).get(FIELD_LONG)), + () -> assertTrue(array.get(1).get(FIELD_STRING).toString().startsWith(index + "_")), + () -> assertTrue(Boolean.parseBoolean(array.get(1).get(FIELD_BOOLEAN).toString())), + () -> assertEquals(Double.parseDouble(index + "." + index), (Double) array.get(1).get(FIELD_DECIMAL), 0), + () -> assertNull(array.get(1).get(FIELD_NULL)), + () -> assertNotNull(array.get(1).schema().field(FIELD_NULL)), + + () -> assertEquals(index, (int) subrecord.get(FIELD_INTEGER)), + () -> assertEquals(Long.MAX_VALUE, (long) subrecord.get(FIELD_LONG)), () -> assertTrue(subrecord.get(FIELD_STRING).toString().startsWith(index + "_")), () -> assertTrue(Boolean.parseBoolean(subrecord.get(FIELD_BOOLEAN).toString())), - () -> assertEquals((Double) subrecord.get(FIELD_DECIMAL), Double.parseDouble(index + "." + index), 0), + () -> assertEquals(Double.parseDouble(index + "." 
+ index), (Double) subrecord.get(FIELD_DECIMAL), 0), () -> assertNull(subrecord.get(FIELD_NULL)), () -> assertNotNull(subrecord.schema().field(FIELD_NULL)) ); From 002dc7480c0cae0183bb68466bb9693518e1822c Mon Sep 17 00:00:00 2001 From: Mario Molina Date: Sun, 28 Jun 2020 11:23:15 -0500 Subject: [PATCH 10/19] Code style improvements --- .../kafka/connect/fs/FsSourceConnector.java | 2 +- .../kafka/connect/fs/FsSourceTask.java | 4 +- .../fs/file/reader/EmptyFileReader.java | 47 -------- .../connect/fs/file/reader/FileReader.java | 2 - .../fs/file/reader/JsonFileReader.java | 2 +- .../connect/fs/policy/AbstractPolicy.java | 107 +++++++++++------- .../kafka/connect/fs/util/Iterators.java | 31 +++++ .../fs/file/reader/AvroFileReaderTest.java | 2 +- .../fs/file/reader/FileReaderTestBase.java | 2 +- .../fs/file/reader/ParquetFileReaderTest.java | 3 +- .../file/reader/SequenceFileReaderTest.java | 2 +- .../file/reader/UnivocityFileReaderTest.java | 54 ++++----- .../fs/policy/HdfsFileWatcherPolicyTest.java | 4 +- .../connect/fs/policy/PolicyTestBase.java | 19 ++-- .../connect/fs/policy/SimplePolicyTest.java | 5 +- .../connect/fs/policy/SleepyPolicyTest.java | 3 +- .../connect/fs/task/FsSourceTaskTest.java | 30 ++--- 17 files changed, 157 insertions(+), 162 deletions(-) delete mode 100644 src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/EmptyFileReader.java create mode 100644 src/main/java/com/github/mmolimar/kafka/connect/fs/util/Iterators.java diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceConnector.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceConnector.java index 839477b..9482e2a 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceConnector.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceConnector.java @@ -67,7 +67,7 @@ public List> taskConfigs(int maxTasks) { @Override public void stop() { log.info("Stopping FsSourceConnector."); - //Nothing to do + // Nothing to do } @Override diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTask.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTask.java index e72206e..0332ab0 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTask.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTask.java @@ -91,7 +91,7 @@ public List poll() { records.add(convert(metadata, reader.currentOffset() + 1, reader.next())); } } catch (ConnectException | IOException e) { - //when an exception happens reading a file, the connector continues + // when an exception happens reading a file, the connector continues log.error("Error reading file [{}]. Keep going...", metadata.getPath(), e); } log.debug("Read [{}] records from file [{}].", records.size(), metadata.getPath()); @@ -116,7 +116,7 @@ private Stream filesToProcess() { return asStream(policy.execute()) .filter(metadata -> metadata.getLen() > 0); } catch (IOException | ConnectException e) { - //when an exception happens executing the policy, the connector continues + // when an exception happens executing the policy, the connector continues log.error("Cannot retrieve files to process from the FS: {}. 
" + "There was an error executing the policy but the task tolerates this and continues.", policy.getURIs(), e); diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/EmptyFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/EmptyFileReader.java deleted file mode 100644 index b6567c8..0000000 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/EmptyFileReader.java +++ /dev/null @@ -1,47 +0,0 @@ -package com.github.mmolimar.kafka.connect.fs.file.reader; - -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; - -import java.util.Map; - -public class EmptyFileReader extends AbstractFileReader { - /* - An empty file reader that will always return no records - Used as a null object instead of returning null - */ - private boolean closed; - - public EmptyFileReader(FileSystem fs, Path filePath, Map config) { - super(fs, filePath, (Void record) -> null, config); - this.closed = false; - } - - @Override - protected void configure(Map config) { - } - - @Override - protected Void nextRecord() { - return null; - } - - @Override - protected boolean hasNextRecord() { - return false; - } - - @Override - public void seekFile(long offset) { - } - - @Override - public boolean isClosed() { - return this.closed; - } - - @Override - public void close() { - closed = true; - } -} diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileReader.java index 518e9f8..a5fe758 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileReader.java @@ -11,8 +11,6 @@ public interface FileReader extends Iterator, Closeable { Path getFilePath(); - boolean hasNext(); - Struct next(); void seek(long offset); diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReader.java index 4036250..040feb6 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReader.java @@ -47,7 +47,7 @@ public JsonFileReader(FileSystem fs, Path filePath, Map config) if (hasNext()) { String line = inner.nextRecord().getValue(); this.schema = extractSchema(mapper.readTree(line)); - //back to the first line + // back to the first line inner.seek(0); } else { this.schema = SchemaBuilder.struct().build(); diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/AbstractPolicy.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/AbstractPolicy.java index a60416d..c90cd68 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/AbstractPolicy.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/AbstractPolicy.java @@ -2,20 +2,19 @@ import com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig; import com.github.mmolimar.kafka.connect.fs.file.FileMetadata; -import com.github.mmolimar.kafka.connect.fs.file.reader.EmptyFileReader; import com.github.mmolimar.kafka.connect.fs.file.reader.FileReader; +import com.github.mmolimar.kafka.connect.fs.util.Iterators; import com.github.mmolimar.kafka.connect.fs.util.ReflectionUtils; import com.github.mmolimar.kafka.connect.fs.util.TailCall; -import com.google.common.collect.Iterators; import org.apache.hadoop.conf.Configuration; import 
org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.LocatedFileStatus; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.RemoteIterator; import org.apache.kafka.common.utils.Utils; +import org.apache.kafka.connect.data.Struct; import org.apache.kafka.connect.errors.ConnectException; import org.apache.kafka.connect.errors.IllegalWorkerStateException; -import org.apache.kafka.connect.storage.OffsetStorageReader; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -40,7 +39,7 @@ abstract class AbstractPolicy implements Policy { private final AtomicLong executions; private final boolean recursive; private final int batchSize; - private Iterator> previous; + private Iterator> partitions; private boolean interrupted; public AbstractPolicy(FsSourceTaskConfig conf) throws IOException { @@ -51,7 +50,7 @@ public AbstractPolicy(FsSourceTaskConfig conf) throws IOException { this.fileRegexp = Pattern.compile(conf.getString(FsSourceTaskConfig.POLICY_REGEXP)); this.batchSize = conf.getInt(FsSourceTaskConfig.POLICY_BATCH_SIZE); this.interrupted = false; - this.previous = Collections.emptyIterator(); + this.partitions = Collections.emptyIterator(); Map customConfigs = customConfigs(); logAll(customConfigs); @@ -112,9 +111,8 @@ public final Iterator execute() throws IOException { if (hasEnded()) { throw new IllegalWorkerStateException("Policy has ended. Cannot be retried."); } - - if (batchSize > 0 && previous.hasNext()) { - return previous.next().iterator(); + if (partitions.hasNext()) { + return partitions.next(); } preCheck(); @@ -124,16 +122,11 @@ public final Iterator execute() throws IOException { for (FileSystem fs : fileSystems) { files = concat(files, listFiles(fs)); } - postCheck(); - if (batchSize > 0) { - previous = Iterators.partition(files, batchSize); - if (!previous.hasNext()) - return Collections.emptyIterator(); - return previous.next().iterator(); - } + postCheck(); - return files; + partitions = Iterators.partition(files, batchSize); + return partitions.hasNext() ? 
partitions.next() : Collections.emptyIterator(); } @Override @@ -193,8 +186,7 @@ public final boolean hasEnded() { if (interrupted) { return true; } - - return !previous.hasNext() && isPolicyCompleted(); + return !partitions.hasNext() && isPolicyCompleted(); } protected abstract boolean isPolicyCompleted(); @@ -213,7 +205,7 @@ FileMetadata toMetadata(LocatedFileStatus fileStatus) { } @Override - public FileReader offer(FileMetadata metadata, Map offset) { + public FileReader offer(FileMetadata metadata, Map offsetMap) { FileSystem current = fileSystems.stream() .filter(fs -> metadata.getPath().startsWith(fs.getWorkingDirectory().toString())) .findFirst() @@ -224,31 +216,33 @@ public FileReader offer(FileMetadata metadata, Map offset) { current, new Path(metadata.getPath()), conf.originals() ); try { - if (offset != null && offset.get("offset") != null) { - Object offsetObject = offset.get("offset"); - Object fileSizeBytesObject = offset.get("fileSizeBytes"); - - // Only new versions of kafka-connect-fs store the file size bytes - // If we have the byte offset, we can skip reading the entire file if the file size has not changed - if (fileSizeBytesObject != null) { - Long byteOffset = (Long) fileSizeBytesObject; - if (metadata.getLen() == byteOffset) { - log.info("Skipping file [{}] due to it was already processed.", metadata.getPath()); - return new EmptyFileReader(current, new Path(metadata.getPath()), conf.originals()); - } - } - - log.info("Seeking to offset [{}] for file [{}].", offset.get("offset"), metadata.getPath()); - FileReader reader = makeReader.get(); - reader.seek((Long) offsetObject); - return reader; - } - return makeReader.get(); + return Optional.ofNullable(offsetMap.get("offset")) + .map(offset -> Long.parseLong(offset.toString())) + .filter(offset -> offset > 0) + .map(offset -> { + long fileSizeBytes = Long.parseLong(offsetMap.getOrDefault("fileSizeBytes", "0").toString()); + if (metadata.getLen() == fileSizeBytes) { + log.info("Skipping file [{}] due to it was already processed.", metadata.getPath()); + return emptyFileReader(new Path(metadata.getPath())); + } else { + log.info("Seeking to offset [{}] for file [{}].", offsetMap.get("offset"), metadata.getPath()); + FileReader reader = makeReader.get(); + reader.seek(offset); + return reader; + } + }).orElse(makeReader.get()); } catch (Exception e) { throw new ConnectException("An error has occurred when creating reader for file: " + metadata.getPath(), e); } } + @Override + public void close() throws IOException { + for (FileSystem fs : fileSystems) { + fs.close(); + } + } + private Iterator concat(final Iterator it1, final Iterator it2) { return new Iterator() { @@ -264,11 +258,36 @@ public FileMetadata next() { }; } - @Override - public void close() throws IOException { - for (FileSystem fs : fileSystems) { - fs.close(); - } + private FileReader emptyFileReader(Path filePath) { + return new FileReader() { + @Override + public Path getFilePath() { + return filePath; + } + + @Override + public Struct next() { + throw new NoSuchElementException(); + } + + @Override + public void seek(long offset) { + } + + @Override + public long currentOffset() { + return 0; + } + + @Override + public void close() { + } + + @Override + public boolean hasNext() { + return false; + } + }; } private void logAll(Map conf) { diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/util/Iterators.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/util/Iterators.java new file mode 100644 index 0000000..3cbf32d --- /dev/null +++ 
b/src/main/java/com/github/mmolimar/kafka/connect/fs/util/Iterators.java @@ -0,0 +1,31 @@ +package com.github.mmolimar.kafka.connect.fs.util; + +import java.util.*; + +public class Iterators { + + public static Iterator> partition(Iterator it, int size) { + if (size <= 0) { + return Collections.singletonList(it).iterator(); + } + + return new Iterator>() { + @Override + public boolean hasNext() { + return it.hasNext(); + } + + @Override + public Iterator next() { + if (!hasNext()) { + throw new NoSuchElementException(); + } + List elements = new ArrayList<>(size); + while (it.hasNext() && elements.size() < size) { + elements.add(it.next()); + } + return elements.iterator(); + } + }; + } +} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReaderTest.java index 5e9d59e..261af40 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReaderTest.java @@ -123,7 +123,7 @@ protected Map getReaderConfig() { @Override protected void checkData(Struct record, long index) { assertAll( - () -> assertEquals((int) (Integer) record.get(FIELD_INDEX), index), + () -> assertEquals(index, (int) record.get(FIELD_INDEX)), () -> assertTrue(record.get(FIELD_NAME).toString().startsWith(index + "_")), () -> assertTrue(record.get(FIELD_SURNAME).toString().startsWith(index + "_")) ); diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileReaderTestBase.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileReaderTestBase.java index f21cf49..0560c6c 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileReaderTestBase.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileReaderTestBase.java @@ -60,7 +60,7 @@ public void closeReader() { try { fsConfig.getReader().close(); } catch (Exception e) { - //ignoring + // ignoring } } } diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/ParquetFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/ParquetFileReaderTest.java index 30dd425..4531808 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/ParquetFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/ParquetFileReaderTest.java @@ -192,7 +192,7 @@ protected Class getReaderClass() { @Override protected void checkData(Struct record, long index) { - assertEquals((int) (Integer) record.get(FIELD_INDEX), index); + assertEquals(index, (int) record.get(FIELD_INDEX)); assertTrue(record.get(FIELD_NAME).toString().startsWith(index + "_")); assertTrue(record.get(FIELD_SURNAME).toString().startsWith(index + "_")); } @@ -201,5 +201,4 @@ protected void checkData(Struct record, long index) { protected String getFileExtension() { return FILE_EXTENSION; } - } diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReaderTest.java index e70d3dd..2b23c99 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReaderTest.java @@ -149,7 +149,7 @@ protected void checkData(Struct record, long index) { private void checkData(String keyFieldName, String 
valueFieldName, Struct record, long index) { assertAll( - () -> assertEquals((int) (Integer) record.get(keyFieldName), index), + () -> assertEquals(index, (int) record.get(keyFieldName)), () -> assertTrue(record.get(valueFieldName).toString().startsWith(index + "_")) ); } diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/UnivocityFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/UnivocityFileReaderTest.java index 79663bc..27c0edf 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/UnivocityFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/UnivocityFileReaderTest.java @@ -249,43 +249,43 @@ protected Class getReaderClass() { @Override protected void checkData(Struct record, long index) { assertAll( - () -> assertEquals(record.get(FIELD_COLUMN1), (byte) 2), - () -> assertEquals(record.get(FIELD_COLUMN2), (short) 4), - () -> assertEquals(record.get(FIELD_COLUMN3), 8), - () -> assertEquals(record.get(FIELD_COLUMN4), 16L), - () -> assertEquals(record.get(FIELD_COLUMN5), 32.32f), - () -> assertEquals(record.get(FIELD_COLUMN6), 64.64d), - () -> assertEquals(record.get(FIELD_COLUMN7), true), - () -> assertEquals(new String((byte[]) record.get(FIELD_COLUMN8)), "test bytes"), - () -> assertEquals(record.get(FIELD_COLUMN9), "test string") + () -> assertEquals((byte) 2, record.get(FIELD_COLUMN1)), + () -> assertEquals((short) 4, record.get(FIELD_COLUMN2)), + () -> assertEquals(8, record.get(FIELD_COLUMN3)), + () -> assertEquals(16L, record.get(FIELD_COLUMN4)), + () -> assertEquals(32.32f, record.get(FIELD_COLUMN5)), + () -> assertEquals(64.64d, record.get(FIELD_COLUMN6)), + () -> assertEquals(true, record.get(FIELD_COLUMN7)), + () -> assertEquals("test bytes", new String((byte[]) record.get(FIELD_COLUMN8))), + () -> assertEquals("test string", record.get(FIELD_COLUMN9)) ); } protected void checkDataString(Struct record) { assertAll( - () -> assertEquals(record.get(FIELD_COLUMN1), "2"), - () -> assertEquals(record.get(FIELD_COLUMN2), "4"), - () -> assertEquals(record.get(FIELD_COLUMN3), "8"), - () -> assertEquals(record.get(FIELD_COLUMN4), "16"), - () -> assertEquals(record.get(FIELD_COLUMN5), "32.320000"), - () -> assertEquals(record.get(FIELD_COLUMN6), "64.640000"), - () -> assertEquals(record.get(FIELD_COLUMN7), "true"), - () -> assertEquals(record.get(FIELD_COLUMN8), "test bytes"), - () -> assertEquals(record.get(FIELD_COLUMN9), "test string") + () -> assertEquals("2", record.get(FIELD_COLUMN1)), + () -> assertEquals("4", record.get(FIELD_COLUMN2)), + () -> assertEquals("8", record.get(FIELD_COLUMN3)), + () -> assertEquals("16", record.get(FIELD_COLUMN4)), + () -> assertEquals("32.320000", record.get(FIELD_COLUMN5)), + () -> assertEquals("64.640000", record.get(FIELD_COLUMN6)), + () -> assertEquals("true", record.get(FIELD_COLUMN7)), + () -> assertEquals("test bytes", record.get(FIELD_COLUMN8)), + () -> assertEquals("test string", record.get(FIELD_COLUMN9)) ); } protected void checkDataNull(Struct record) { assertAll( - () -> assertEquals(record.get(FIELD_COLUMN1), null), - () -> assertEquals(record.get(FIELD_COLUMN2), null), - () -> assertEquals(record.get(FIELD_COLUMN3), null), - () -> assertEquals(record.get(FIELD_COLUMN4), null), - () -> assertEquals(record.get(FIELD_COLUMN5), null), - () -> assertEquals(record.get(FIELD_COLUMN6), null), - () -> assertEquals(record.get(FIELD_COLUMN7), null), - () -> assertEquals(record.get(FIELD_COLUMN8), null), - () -> 
assertEquals(record.get(FIELD_COLUMN9), null) + () -> assertNull(record.get(FIELD_COLUMN1)), + () -> assertNull(record.get(FIELD_COLUMN2)), + () -> assertNull(record.get(FIELD_COLUMN3)), + () -> assertNull(record.get(FIELD_COLUMN4)), + () -> assertNull(record.get(FIELD_COLUMN5)), + () -> assertNull(record.get(FIELD_COLUMN6)), + () -> assertNull(record.get(FIELD_COLUMN7)), + () -> assertNull(record.get(FIELD_COLUMN8)), + () -> assertNull(record.get(FIELD_COLUMN9)) ); } diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/HdfsFileWatcherPolicyTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/HdfsFileWatcherPolicyTest.java index 3cd9c75..f04db50 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/HdfsFileWatcherPolicyTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/HdfsFileWatcherPolicyTest.java @@ -50,7 +50,7 @@ protected FsSourceTaskConfig buildSourceTaskConfig(List directories) { return new FsSourceTaskConfig(cfg); } - //This policy does not throw any exception. Just stop watching those nonexistent dirs + // This policy does not throw any exception. Just stop watching those nonexistent dirs @ParameterizedTest @MethodSource("fileSystemConfigProvider") @Override @@ -67,7 +67,7 @@ public void invalidDirectory(PolicyFsTestConfig fsConfig) throws IOException { } } - //This policy never ends. We have to interrupt it + // This policy never ends. We have to interrupt it @ParameterizedTest @MethodSource("fileSystemConfigProvider") @Override diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/PolicyTestBase.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/PolicyTestBase.java index 35fb5e9..c5ed6f7 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/PolicyTestBase.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/PolicyTestBase.java @@ -128,11 +128,11 @@ public void oneFilePerFs(PolicyFsTestConfig fsConfig) throws IOException, Interr FileSystem fs = fsConfig.getFs(); for (Path dir : fsConfig.getDirectories()) { fs.createNewFile(new Path(dir, System.nanoTime() + ".txt")); - //this file does not match the regexp + // this file does not match the regexp fs.createNewFile(new Path(dir, System.nanoTime() + ".invalid")); - //we wait till FS has registered the files - Thread.sleep(3000); + // we wait till FS has registered the files + Thread.sleep(5000); } Iterator it = fsConfig.getPolicy().execute(); assertTrue(it.hasNext()); @@ -150,10 +150,10 @@ public void recursiveDirectory(PolicyFsTestConfig fsConfig) throws IOException, Path tmpDir = new Path(dir, String.valueOf(System.nanoTime())); fs.mkdirs(tmpDir); fs.createNewFile(new Path(tmpDir, System.nanoTime() + ".txt")); - //this file does not match the regexp + // this file does not match the regexp fs.createNewFile(new Path(tmpDir, System.nanoTime() + ".invalid")); - //we wait till FS has registered the files + // we wait till FS has registered the files Thread.sleep(3000); } Iterator it = fsConfig.getPolicy().execute(); @@ -236,10 +236,10 @@ public void execPolicyBatchesFiles(PolicyFsTestConfig fsConfig) throws IOExcepti FileSystem fs = fsConfig.getFs(); for (Path dir : fsConfig.getDirectories()) { fs.createNewFile(new Path(dir, System.nanoTime() + ".txt")); - //this file does not match the regexp + // this file does not match the regexp fs.createNewFile(new Path(dir, System.nanoTime() + ".invalid")); - //we wait till FS has registered the files + // we wait till FS has registered the files 
Thread.sleep(3000); } @@ -263,10 +263,7 @@ public void execPolicyBatchesFiles(PolicyFsTestConfig fsConfig) throws IOExcepti public void invalidBatchSize(PolicyFsTestConfig fsConfig) { Map originals = fsConfig.getSourceTaskConfig().originalsStrings(); originals.put(FsSourceTaskConfig.POLICY_BATCH_SIZE, "one"); - assertThrows(ConfigException.class, () -> { - new FsSourceTaskConfig(originals); - }); - + assertThrows(ConfigException.class, () -> new FsSourceTaskConfig(originals)); } protected abstract FsSourceTaskConfig buildSourceTaskConfig(List directories); diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/SimplePolicyTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/SimplePolicyTest.java index 42bfe7e..f70e184 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/SimplePolicyTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/SimplePolicyTest.java @@ -50,10 +50,10 @@ public void execPolicyEndsAfterBatching(PolicyFsTestConfig fsConfig) throws IOEx FileSystem fs = fsConfig.getFs(); for (Path dir : fsConfig.getDirectories()) { fs.createNewFile(new Path(dir, System.nanoTime() + ".txt")); - //this file does not match the regexp + // this file does not match the regexp fs.createNewFile(new Path(dir, System.nanoTime() + ".invalid")); - //we wait till FS has registered the files + // we wait till FS has registered the files Thread.sleep(3000); } @@ -74,5 +74,4 @@ public void execPolicyEndsAfterBatching(PolicyFsTestConfig fsConfig) throws IOEx assertTrue(policy.hasEnded()); } } - } diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/SleepyPolicyTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/SleepyPolicyTest.java index 09e592c..8f340b0 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/SleepyPolicyTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/SleepyPolicyTest.java @@ -121,7 +121,7 @@ public void defaultExecutions(PolicyFsTestConfig fsConfig) throws IOException { try (Policy policy = ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() .getClass(FsSourceTaskConfig.POLICY_CLASS), sleepConfig)) { - //it never ends + // it never ends for (int i = 0; i < 100; i++) { assertFalse(policy.hasEnded()); policy.execute(); @@ -130,5 +130,4 @@ public void defaultExecutions(PolicyFsTestConfig fsConfig) throws IOException { assertTrue(policy.hasEnded()); } } - } diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/task/FsSourceTaskTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/task/FsSourceTaskTest.java index 5ba6cd5..dd05ba8 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/task/FsSourceTaskTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/task/FsSourceTaskTest.java @@ -75,7 +75,7 @@ public void initTask() { put(FsSourceTaskConfig.POLICY_REGEXP, "^[0-9]*\\.txt$"); }}; - //Mock initialization + // Mock initialization SourceTaskContext taskContext = PowerMock.createMock(SourceTaskContext.class); OffsetStorageReader offsetStorageReader = PowerMock.createMock(OffsetStorageReader.class); @@ -84,7 +84,7 @@ public void initTask() { .times(2); // Every time the `offsetStorageReader.offsets(params)` method is called we want to capture the offsets params - // And return a different result based on the offset params passed in + // and return a different result based on the offset params passed in // In this case, returning a different result based on the file path of the params Capture>> 
captureOne = Capture.newInstance(CaptureType.ALL); AtomicInteger executionNumber = new AtomicInteger(); @@ -145,7 +145,7 @@ private static Stream fileSystemConfigProvider() { public void pollNoData(TaskFsTestConfig fsConfig) { fsConfig.getTask().start(fsConfig.getTaskConfig()); assertEquals(0, fsConfig.getTask().poll().size()); - //policy has ended + // policy has ended assertNull(fsConfig.getTask().poll()); } @@ -154,12 +154,12 @@ public void pollNoData(TaskFsTestConfig fsConfig) { public void emptyFilesToProcess(TaskFsTestConfig fsConfig) throws IOException { for (Path dir : fsConfig.getDirectories()) { fsConfig.getFs().createNewFile(new Path(dir, System.nanoTime() + ".txt")); - //this file does not match the regexp + // this file does not match the regexp fsConfig.getFs().createNewFile(new Path(dir, String.valueOf(System.nanoTime()))); } fsConfig.getTask().start(fsConfig.getTaskConfig()); assertEquals(0, fsConfig.getTask().poll().size()); - //policy has ended + // policy has ended assertNull(fsConfig.getTask().poll()); } @@ -169,7 +169,7 @@ public void oneFilePerFs(TaskFsTestConfig fsConfig) throws IOException { for (Path dir : fsConfig.getDirectories()) { Path dataFile = new Path(dir, System.nanoTime() + ".txt"); createDataFile(fsConfig.getFs(), dataFile); - //this file does not match the regexp + // this file does not match the regexp fsConfig.getFs().createNewFile(new Path(dir, String.valueOf(System.nanoTime()))); } @@ -177,7 +177,7 @@ public void oneFilePerFs(TaskFsTestConfig fsConfig) throws IOException { List records = fsConfig.getTask().poll(); assertEquals((NUM_RECORDS * fsConfig.getDirectories().size()) / 2, records.size()); checkRecords(records); - //policy has ended + // policy has ended assertNull(fsConfig.getTask().poll()); } @@ -185,7 +185,7 @@ public void oneFilePerFs(TaskFsTestConfig fsConfig) throws IOException { @MethodSource("fileSystemConfigProvider") public void skipsFetchingFileIfByteOffsetExistsAndMatchesFileLength(TaskFsTestConfig fsConfig) throws IOException { for (Path dir : fsConfig.getDirectories()) { - //this file will be skipped since the byte offset for the file is equal to the byte size of the file + // this file will be skipped since the byte offset for the file is equal to the byte size of the file Path dataFile = new Path(dir, FILE_ALREADY_PROCESSED); createDataFile(fsConfig.getFs(), dataFile); } @@ -317,7 +317,7 @@ public void pollNoDataWithBatch(TaskFsTestConfig fsConfig) { fsConfig.getTask().start(props); assertEquals(0, fsConfig.getTask().poll().size()); - //policy has ended + // policy has ended assertNull(fsConfig.getTask().poll()); } @@ -326,7 +326,7 @@ public void pollNoDataWithBatch(TaskFsTestConfig fsConfig) { public void emptyFilesToProcessWithBatch(TaskFsTestConfig fsConfig) throws IOException { for (Path dir : fsConfig.getDirectories()) { fsConfig.getFs().createNewFile(new Path(dir, System.nanoTime() + ".txt")); - //this file does not match the regexp + // this file does not match the regexp fsConfig.getFs().createNewFile(new Path(dir, String.valueOf(System.nanoTime()))); } Map props = new HashMap<>(fsConfig.getTaskConfig()); @@ -341,7 +341,7 @@ public void emptyFilesToProcessWithBatch(TaskFsTestConfig fsConfig) throws IOExc } assertEquals(0, records.size()); - //policy has ended + // policy has ended assertNull(fsConfig.getTask().poll()); } @@ -351,7 +351,7 @@ public void oneFilePerFsWithBatch(TaskFsTestConfig fsConfig) throws IOException for (Path dir : fsConfig.getDirectories()) { Path dataFile = new Path(dir, System.nanoTime() + 
".txt"); createDataFile(fsConfig.getFs(), dataFile); - //this file does not match the regexp + // this file does not match the regexp fsConfig.getFs().createNewFile(new Path(dir, String.valueOf(System.nanoTime()))); } @@ -368,7 +368,7 @@ public void oneFilePerFsWithBatch(TaskFsTestConfig fsConfig) throws IOException assertEquals((NUM_RECORDS * fsConfig.getDirectories().size()) / 2, records.size()); checkRecords(records); - //policy has ended + // policy has ended assertNull(fsConfig.getTask().poll()); } @@ -382,7 +382,7 @@ public void shouldNotSleepBetweenBatches(TaskFsTestConfig fsConfig) throws IOExc for (Path dir : fsConfig.getDirectories()) { Path dataFile = new Path(dir, System.nanoTime() + ".txt"); createDataFile(fsConfig.getFs(), dataFile); - //this file does not match the regexp + // this file does not match the regexp fsConfig.getFs().createNewFile(new Path(dir, String.valueOf(System.nanoTime()))); } @@ -396,7 +396,7 @@ public void shouldNotSleepBetweenBatches(TaskFsTestConfig fsConfig) throws IOExc assertEquals((NUM_RECORDS * fsConfig.getDirectories().size()) / 2, records.size()); checkRecords(records); - //policy has ended + // policy has ended assertNull(fsConfig.getTask().poll()); } From af962e8d979b9cf85f71ba7fa41fd58e02bdbf80 Mon Sep 17 00:00:00 2001 From: Mario Molina Date: Sun, 28 Jun 2020 14:19:20 -0500 Subject: [PATCH 11/19] Map float and double data types in JSON reader --- .../connect/fs/file/reader/JsonFileReader.java | 14 ++------------ .../fs/file/reader/JsonFileReaderTest.java | 7 +++++++ .../kafka/connect/fs/policy/PolicyTestBase.java | 16 ++++++++++++++-- 3 files changed, 23 insertions(+), 14 deletions(-) diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReader.java index 040feb6..c89c741 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReader.java @@ -115,14 +115,8 @@ private static Schema extractSchema(JsonNode jsonNode) { return Schema.OPTIONAL_INT32_SCHEMA; } else if (jsonNode.isLong()) { return Schema.OPTIONAL_INT64_SCHEMA; - } else if (jsonNode.isFloat()) { - return Schema.OPTIONAL_FLOAT32_SCHEMA; - } else if (jsonNode.isDouble()) { - return Schema.OPTIONAL_FLOAT64_SCHEMA; } else if (jsonNode.isBigInteger()) { return Schema.OPTIONAL_INT64_SCHEMA; - } else if (jsonNode.isBigDecimal()) { - return Schema.OPTIONAL_FLOAT64_SCHEMA; } else { return Schema.OPTIONAL_FLOAT64_SCHEMA; } @@ -177,14 +171,10 @@ private Object mapValue(Schema schema, JsonNode value) { return value.intValue(); } else if (value.isLong()) { return value.longValue(); - } else if (value.isFloat()) { - return value.floatValue(); - } else if (value.isDouble()) { - return value.doubleValue(); } else if (value.isBigInteger()) { - return value.bigIntegerValue(); + return value.bigIntegerValue().longValue(); } else { - return value.numberValue(); + return value.numberValue().doubleValue(); } case STRING: return value.asText(); diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReaderTest.java index 446a3f8..f7f6da0 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReaderTest.java @@ -16,6 +16,7 @@ import java.io.File; 
import java.io.IOException; import java.io.PrintWriter; +import java.math.BigInteger; import java.nio.charset.UnsupportedCharsetException; import java.util.*; import java.util.stream.IntStream; @@ -25,10 +26,12 @@ public class JsonFileReaderTest extends FileReaderTestBase { private static final String FIELD_INTEGER = "integerField"; + private static final String FIELD_BIG_INTEGER = "bigIntegerField"; private static final String FIELD_LONG = "longField"; private static final String FIELD_BOOLEAN = "booleanField"; private static final String FIELD_STRING = "stringField"; private static final String FIELD_DECIMAL = "decimalField"; + private static final String FIELD_BINARY = "binaryField"; private static final String FIELD_ARRAY_SIMPLE = "arraySimpleField"; private static final String FIELD_ARRAY_COMPLEX = "arrayComplexField"; private static final String FIELD_STRUCT = "structField"; @@ -47,10 +50,12 @@ protected Path createDataFile(ReaderFsTestConfig fsConfig, Object... args) throw IntStream.range(0, numRecords).forEach(index -> { ObjectNode json = JsonNodeFactory.instance.objectNode() .put(FIELD_INTEGER, index) + .put(FIELD_BIG_INTEGER, new BigInteger("9999999999999999999")) .put(FIELD_LONG, Long.MAX_VALUE) .put(FIELD_STRING, String.format("%d_%s", index, UUID.randomUUID())) .put(FIELD_BOOLEAN, true) .put(FIELD_DECIMAL, Double.parseDouble(index + "." + index)) + .put(FIELD_BINARY, "test".getBytes()) .put(FIELD_NULL, (String) null); json.putArray(FIELD_ARRAY_SIMPLE) .add("elm[" + index + "]") @@ -199,12 +204,14 @@ protected void checkData(Struct record, long index) { Struct subrecord = record.getStruct(FIELD_STRUCT); assertAll( () -> assertEquals(index, (int) record.get(FIELD_INTEGER)), + () -> assertEquals(new BigInteger("9999999999999999999").longValue(), record.get(FIELD_BIG_INTEGER)), () -> assertEquals(Long.MAX_VALUE, (long) record.get(FIELD_LONG)), () -> assertTrue(record.get(FIELD_STRING).toString().startsWith(index + "_")), () -> assertTrue(Boolean.parseBoolean(record.get(FIELD_BOOLEAN).toString())), () -> assertEquals(Double.parseDouble(index + "." 
+ index), (Double) record.get(FIELD_DECIMAL), 0), () -> assertNull(record.get(FIELD_NULL)), () -> assertNotNull(record.schema().field(FIELD_NULL)), + () -> assertEquals("dGVzdA==", record.get(FIELD_BINARY)), () -> assertEquals(Arrays.asList("elm[" + index + "]", "elm[" + (index + 1) + "]"), record.get(FIELD_ARRAY_SIMPLE)), () -> assertEquals(index, (int) array.get(0).get(FIELD_INTEGER)), diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/PolicyTestBase.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/PolicyTestBase.java index c5ed6f7..237a236 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/PolicyTestBase.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/PolicyTestBase.java @@ -2,6 +2,7 @@ import com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig; import com.github.mmolimar.kafka.connect.fs.file.FileMetadata; +import com.github.mmolimar.kafka.connect.fs.file.reader.FileReader; import com.github.mmolimar.kafka.connect.fs.util.ReflectionUtils; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -242,12 +243,23 @@ public void execPolicyBatchesFiles(PolicyFsTestConfig fsConfig) throws IOExcepti // we wait till FS has registered the files Thread.sleep(3000); } - + Iterator it = policy.execute(); // First batch of files (1 file) assertTrue(it.hasNext()); - String firstPath = it.next().getPath(); + FileMetadata metadata = it.next(); + String firstPath = metadata.getPath(); + FileReader reader = policy.offer(metadata, new HashMap() {{ + put("offset", "1"); + put("fileSizeBytes", "0"); + }}); + assertFalse(reader.hasNext()); + reader.seek(1000L); + reader.close(); + assertEquals(0L, reader.currentOffset()); + assertEquals(metadata.getPath(), reader.getFilePath().toString()); + assertThrows(NoSuchElementException.class, reader::next); assertFalse(it.hasNext()); // Second batch of files (1 file) From e0eb5332c611ff6089cd62eb801780bd8aa2e0ec Mon Sep 17 00:00:00 2001 From: Grant Nicholas <43971820+grantatspothero@users.noreply.github.com> Date: Fri, 3 Jul 2020 18:30:42 -0500 Subject: [PATCH 12/19] Make makeReader run only if reading the file is necessary (#69) * Make makeReader run only if reading the file is necessary This is really important because if we open up a filehandle for reading some hadoop filesystem implementations (like s3a) will open up a connection per file and these connections will get exhausted quickly * Handle 'null' from the offsets map * handle nulls better * Minor changes Co-authored-by: Mario Molina --- .../com/github/mmolimar/kafka/connect/fs/FsSourceTask.java | 3 ++- .../mmolimar/kafka/connect/fs/policy/AbstractPolicy.java | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTask.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTask.java index 0332ab0..9738c3e 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTask.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTask.java @@ -85,7 +85,8 @@ public List poll() { List totalRecords = filesToProcess.stream().map(metadata -> { List records = new ArrayList<>(); - try (FileReader reader = policy.offer(metadata, offsets.get(makePartitionKey.apply(metadata)))) { + Map partitionKey = makePartitionKey.apply(metadata); + try (FileReader reader = policy.offer(metadata, offsets.getOrDefault(partitionKey, new HashMap<>()))) { log.info("Processing records for file {}.", metadata); while (reader.hasNext()) 
{ records.add(convert(metadata, reader.currentOffset() + 1, reader.next())); diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/AbstractPolicy.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/AbstractPolicy.java index c90cd68..5ec4dba 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/AbstractPolicy.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/AbstractPolicy.java @@ -230,7 +230,7 @@ current, new Path(metadata.getPath()), conf.originals() reader.seek(offset); return reader; } - }).orElse(makeReader.get()); + }).orElseGet(makeReader); } catch (Exception e) { throw new ConnectException("An error has occurred when creating reader for file: " + metadata.getPath(), e); } From 330f2130f4a866056bfb832e86e57943bfe0faee Mon Sep 17 00:00:00 2001 From: Mario Molina Date: Sat, 4 Jul 2020 08:53:15 -0500 Subject: [PATCH 13/19] Batching support in file readers --- .../kafka/connect/fs/FsSourceTask.java | 21 +++-- .../kafka/connect/fs/FsSourceTaskConfig.java | 17 +++- .../fs/file/reader/AbstractFileReader.java | 72 +++++++++++------ .../fs/file/reader/AvroFileReader.java | 11 ++- .../fs/file/reader/SequenceFileReader.java | 24 +++--- .../fs/file/reader/TextFileReader.java | 3 +- .../connect/fs/policy/AbstractPolicy.java | 5 +- .../kafka/connect/fs/policy/SleepyPolicy.java | 32 +++----- .../fs/file/reader/AvroFileReaderTest.java | 2 +- .../fs/file/reader/FileReaderTestBase.java | 26 +++++- .../file/reader/SequenceFileReaderTest.java | 12 +-- .../connect/fs/policy/PolicyTestBase.java | 5 +- .../connect/fs/task/FsSourceTaskTest.java | 79 ++++++++++++++++++- 13 files changed, 226 insertions(+), 83 deletions(-) diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTask.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTask.java index 9738c3e..f150cba 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTask.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTask.java @@ -1,6 +1,7 @@ package com.github.mmolimar.kafka.connect.fs; import com.github.mmolimar.kafka.connect.fs.file.FileMetadata; +import com.github.mmolimar.kafka.connect.fs.file.reader.AbstractFileReader; import com.github.mmolimar.kafka.connect.fs.file.reader.FileReader; import com.github.mmolimar.kafka.connect.fs.policy.Policy; import com.github.mmolimar.kafka.connect.fs.util.ReflectionUtils; @@ -86,10 +87,15 @@ public List poll() { List totalRecords = filesToProcess.stream().map(metadata -> { List records = new ArrayList<>(); Map partitionKey = makePartitionKey.apply(metadata); - try (FileReader reader = policy.offer(metadata, offsets.getOrDefault(partitionKey, new HashMap<>()))) { + Map offset = Optional.ofNullable(offsets.get(partitionKey)).orElse(new HashMap<>()); + try (FileReader reader = policy.offer(metadata, offset)) { log.info("Processing records for file {}.", metadata); while (reader.hasNext()) { - records.add(convert(metadata, reader.currentOffset() + 1, reader.next())); + Struct record = reader.next(); + // TODO change FileReader interface in the next major version + boolean hasNext = (reader instanceof AbstractFileReader) ? 
+ ((AbstractFileReader) reader).hasNextBatch() || reader.hasNext() : reader.hasNext(); + records.add(convert(metadata, reader.currentOffset(), !hasNext, record)); } } catch (ConnectException | IOException e) { // when an exception happens reading a file, the connector continues @@ -130,13 +136,14 @@ private Stream asStream(Iterator src) { return StreamSupport.stream(iterable.spliterator(), false); } - private SourceRecord convert(FileMetadata metadata, long offset, Struct struct) { - Map offsetMap = new HashMap<>(); - offsetMap.put("offset", offset); - offsetMap.put("fileSizeBytes", metadata.getLen()); + private SourceRecord convert(FileMetadata metadata, long offset, boolean eof, Struct struct) { return new SourceRecord( Collections.singletonMap("path", metadata.getPath()), - offsetMap, + new HashMap() {{ + put("offset", offset); + put("file-size", metadata.getLen()); + put("eof", eof); + }}, config.getTopic(), struct.schema(), struct diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTaskConfig.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTaskConfig.java index 39278c3..f3b56ed 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTaskConfig.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTaskConfig.java @@ -31,12 +31,17 @@ public class FsSourceTaskConfig extends FsSourceConnectorConfig { private static final String FILE_READER_CLASS_DOC = "File reader class to read files from the FS."; private static final String FILE_READER_CLASS_DISPLAY = "File reader class"; + public static final String FILE_READER_BATCH_SIZE = FILE_READER_PREFIX + "batch_size"; + private static final String FILE_READER_BATCH_SIZE_DOC = "Number of records to process at a time. Non-positive values disable batching."; + private static final String FILE_READER_BATCH_SIZE_DISPLAY = "Records per batch"; + public static final String POLL_INTERVAL_MS = "poll.interval.ms"; private static final String POLL_INTERVAL_MS_DOC = "Frequency in ms to poll for new data."; public static final int POLL_INTERVAL_MS_DEFAULT = 10000; private static final String POLL_INTERVAL_MS_DISPLAY = "Poll Interval (ms)"; private static final String POLICY_GROUP = "Policy"; + private static final String FILE_READER_GROUP = "FileReader"; private static final String CONNECTOR_GROUP = "Connector"; public FsSourceTaskConfig(ConfigDef config, Map parsedConfig) { @@ -96,10 +101,20 @@ public static ConfigDef conf() { ConfigDef.NO_DEFAULT_VALUE, ConfigDef.Importance.HIGH, FILE_READER_CLASS_DOC, - POLICY_GROUP, + FILE_READER_GROUP, ++order, ConfigDef.Width.MEDIUM, FILE_READER_CLASS_DISPLAY + ).define( + FILE_READER_BATCH_SIZE, + ConfigDef.Type.INT, + 0, + ConfigDef.Importance.MEDIUM, + FILE_READER_BATCH_SIZE_DOC, + FILE_READER_GROUP, + ++order, + ConfigDef.Width.MEDIUM, + FILE_READER_BATCH_SIZE_DISPLAY ).define( POLL_INTERVAL_MS, ConfigDef.Type.INT, diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AbstractFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AbstractFileReader.java index d63283f..e7ca7da 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AbstractFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AbstractFileReader.java @@ -1,5 +1,6 @@ package com.github.mmolimar.kafka.connect.fs.file.reader; +import com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import 
org.apache.kafka.connect.data.Struct; @@ -21,6 +22,8 @@ public abstract class AbstractFileReader implements FileReader { private final FileSystem fs; private final Path filePath; private final ReaderAdapter adapter; + private final int batchSize; + private boolean seeked; private long offset; public AbstractFileReader(FileSystem fs, Path filePath, ReaderAdapter adapter, Map config) { @@ -30,6 +33,8 @@ public AbstractFileReader(FileSystem fs, Path filePath, ReaderAdapter adapter this.fs = fs; this.filePath = filePath; this.adapter = adapter; + this.batchSize = Integer.parseInt(config.getOrDefault(FsSourceTaskConfig.FILE_READER_BATCH_SIZE, "0").toString()); + this.seeked = false; this.offset = 0; configure(readerConfig(config)); @@ -54,13 +59,41 @@ public Path getFilePath() { return filePath; } + @Override + public long currentOffset() { + return offset; + } + + protected void incrementOffset() { + offset++; + } + + protected void setOffset(long offset) { + this.offset = offset; + } + + @Override + public final boolean hasNext() { + checkClosed(); + try { + return (batchSize <= 0 || offset == 0 || offset % batchSize != 0 || (offset % batchSize == 0 && seeked)) && + hasNextRecord(); + } catch (ConnectException ce) { + throw ce; + } catch (Exception e) { + throw new ConnectException("Error when checking if the reader has more records.", e); + } + } + @Override public final Struct next() { if (!hasNext()) { throw new NoSuchElementException("There are no more records in file: " + getFilePath()); } try { - return adapter.apply(nextRecord()); + Struct struct = adapter.apply(nextRecord()); + seeked = false; + return struct; } catch (ConnectException ce) { throw ce; } catch (Exception e) { @@ -68,17 +101,23 @@ public final Struct next() { } } - @Override - public long currentOffset() { - return offset; - } - - protected void incrementOffset() { - this.offset++; + public final boolean hasNextBatch() { + checkClosed(); + try { + return batchSize > 0 && hasNextRecord(); + } catch (ConnectException ce) { + throw ce; + } catch (Exception e) { + throw new ConnectException("Error when checking if the reader has more batches.", e); + } } - protected void setOffset(long offset) { - this.offset = offset; + public final void nextBatch() { + if (!hasNextBatch()) { + throw new NoSuchElementException("There are no more batches in file: " + getFilePath()); + } + long batchOffset = offset + (offset % batchSize); + seek(batchOffset); } @Override @@ -89,23 +128,12 @@ public final void seek(long offset) { checkClosed(); try { seekFile(offset); + seeked = true; } catch (IOException ioe) { throw new ConnectException("Error seeking file: " + getFilePath(), ioe); } } - @Override - public final boolean hasNext() { - checkClosed(); - try { - return hasNextRecord(); - } catch (ConnectException ce) { - throw ce; - } catch (Exception e) { - throw new ConnectException("Error when checking if the reader has more records.", e); - } - } - protected ReaderAdapter getAdapter() { return adapter; } diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReader.java index 3db8e3c..fee6324 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReader.java @@ -61,8 +61,15 @@ protected GenericRecord nextRecord() { @Override public void seekFile(long offset) throws IOException { - reader.sync(offset); - 
setOffset(reader.previousSync() - 16L); + if (offset == currentOffset()) { + return; + } else if (offset < currentOffset()) { + reader.sync(0L); + } + while (super.hasNext() && offset > currentOffset()) { + super.next(); + } + setOffset(offset); } @Override diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReader.java index 3740db9..ac71056 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReader.java @@ -31,7 +31,6 @@ public class SequenceFileReader extends AbstractFileReader conf .field(keyFieldName, getSchema(this.key)) .field(valueFieldName, getSchema(this.value)) .build(); - this.recordIndex = this.hasNextIndex = -1; this.hasNext = false; this.closed = false; } @@ -82,9 +80,7 @@ Schema getSchema(Writable writable) { @Override public boolean hasNextRecord() throws IOException { try { - if (hasNextIndex == -1 || hasNextIndex == recordIndex) { - hasNextIndex++; - incrementOffset(); + if (!hasNext) { hasNext = reader.next(key, value); } return hasNext; @@ -95,16 +91,24 @@ public boolean hasNextRecord() throws IOException { @Override protected SequenceRecord nextRecord() { - recordIndex++; + incrementOffset(); + hasNext = false; return new SequenceRecord<>(schema, keyFieldName, key, valueFieldName, value); } @Override public void seekFile(long offset) throws IOException { - reader.sync(offset); - hasNextIndex = recordIndex = offset; - hasNext = false; - setOffset(offset - 1); + if (offset == currentOffset()) { + return; + } else if (offset < currentOffset()) { + reader.sync(0L); + hasNext = false; + } + while (super.hasNext() && offset > currentOffset()) { + super.next(); + hasNext = false; + } + setOffset(offset); } @Override diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReader.java index 56f5581..3f27b35 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReader.java @@ -118,8 +118,7 @@ public void seekFile(long offset) throws IOException { reader.close(); reader = new LineNumberReader(getFileReader(getFs().open(getFilePath()))); } - while (reader.getLineNumber() < offset) { - reader.readLine(); + while (reader.getLineNumber() < offset && reader.readLine() != null) { } setOffset(reader.getLineNumber()); } diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/AbstractPolicy.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/AbstractPolicy.java index 5ec4dba..722889f 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/AbstractPolicy.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/AbstractPolicy.java @@ -220,8 +220,9 @@ current, new Path(metadata.getPath()), conf.originals() .map(offset -> Long.parseLong(offset.toString())) .filter(offset -> offset > 0) .map(offset -> { - long fileSizeBytes = Long.parseLong(offsetMap.getOrDefault("fileSizeBytes", "0").toString()); - if (metadata.getLen() == fileSizeBytes) { + long fileSize = Long.parseLong(offsetMap.getOrDefault("file-size", "0").toString()); + boolean eof = Boolean.parseBoolean(offsetMap.getOrDefault("eof", "false").toString()); + if (metadata.getLen() == fileSize && eof) { 
log.info("Skipping file [{}] due to it was already processed.", metadata.getPath()); return emptyFileReader(new Path(metadata.getPath())); } else { diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/SleepyPolicy.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/SleepyPolicy.java index 2a02884..9eea1d3 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/SleepyPolicy.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/SleepyPolicy.java @@ -33,28 +33,22 @@ protected void configPolicy(Map customConfigs) { try { this.sleep = Long.parseLong((String) customConfigs.get(SLEEPY_POLICY_SLEEP_MS)); } catch (NumberFormatException nfe) { - throw new ConfigException(SLEEPY_POLICY_SLEEP_MS + " property is required and must be a number(long). Got: " + + throw new ConfigException(SLEEPY_POLICY_SLEEP_MS + " property is required and must be a number (long). Got: " + customConfigs.get(SLEEPY_POLICY_SLEEP_MS)); } - if (customConfigs.get(SLEEPY_POLICY_MAX_EXECS) != null) { - try { - this.maxExecs = Long.parseLong((String) customConfigs.get(SLEEPY_POLICY_MAX_EXECS)); - } catch (NumberFormatException nfe) { - throw new ConfigException(SLEEPY_POLICY_MAX_EXECS + " property must be a number(long). Got: " + - customConfigs.get(SLEEPY_POLICY_MAX_EXECS)); - } - } else { - this.maxExecs = DEFAULT_MAX_EXECS; + try { + this.maxExecs = Long.parseLong((String) customConfigs.getOrDefault(SLEEPY_POLICY_MAX_EXECS, + String.valueOf(DEFAULT_MAX_EXECS))); + } catch (NumberFormatException nfe) { + throw new ConfigException(SLEEPY_POLICY_MAX_EXECS + " property must be a number (long). Got: " + + customConfigs.get(SLEEPY_POLICY_MAX_EXECS)); } - if (customConfigs.get(SLEEPY_POLICY_SLEEP_FRACTION) != null) { - try { - this.sleepFraction = Long.parseLong((String) customConfigs.get(SLEEPY_POLICY_SLEEP_FRACTION)); - } catch (NumberFormatException nfe) { - throw new ConfigException(SLEEPY_POLICY_SLEEP_FRACTION + " property must be a number(long). Got: " + - customConfigs.get(SLEEPY_POLICY_SLEEP_FRACTION)); - } - } else { - this.sleepFraction = DEFAULT_SLEEP_FRACTION; + try { + this.sleepFraction = Long.parseLong((String) customConfigs.getOrDefault(SLEEPY_POLICY_SLEEP_FRACTION, + String.valueOf(DEFAULT_SLEEP_FRACTION))); + } catch (NumberFormatException nfe) { + throw new ConfigException(SLEEPY_POLICY_SLEEP_FRACTION + " property must be a number (long). Got: " + + customConfigs.get(SLEEPY_POLICY_SLEEP_FRACTION)); } } diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReaderTest.java index 261af40..8c04de7 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReaderTest.java @@ -55,7 +55,7 @@ protected Path createDataFile(ReaderFsTestConfig fsConfig, Object... 
args) throw datum.put(FIELD_NAME, String.format("%d_name_%s", index, UUID.randomUUID())); datum.put(FIELD_SURNAME, String.format("%d_surname_%s", index, UUID.randomUUID())); try { - fsConfig.offsetsByIndex().put(index, dataFileWriter.sync() - 16L); + fsConfig.offsetsByIndex().put(index, (long) index); dataFileWriter.append(datum); } catch (IOException ioe) { throw new RuntimeException(ioe); diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileReaderTestBase.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileReaderTestBase.java index 0560c6c..7b6e725 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileReaderTestBase.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileReaderTestBase.java @@ -1,5 +1,6 @@ package com.github.mmolimar.kafka.connect.fs.file.reader; +import com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig; import com.github.mmolimar.kafka.connect.fs.util.ReflectionUtils; import org.apache.commons.compress.compressors.bzip2.BZip2CompressorOutputStream; import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream; @@ -146,10 +147,33 @@ public void readAllData(ReaderFsTestConfig fsConfig) { assertEquals(NUM_RECORDS, recordCount, "The number of records in the file does not match"); } + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void readAllDataInBatches(ReaderFsTestConfig fsConfig) { + Map config = getReaderConfig(); + int batchSize = 5; + config.put(FsSourceTaskConfig.FILE_READER_BATCH_SIZE, batchSize); + AbstractFileReader reader = (AbstractFileReader) getReader(fsConfig.getFs(), fsConfig.getDataFile(), config); + assertTrue(reader.hasNext()); + + int recordCount = 0; + while (reader.hasNextBatch()) { + reader.nextBatch(); + while (reader.hasNext()) { + Struct record = reader.next(); + checkData(record, recordCount); + recordCount++; + } + assertEquals(0, recordCount % batchSize); + } + assertThrows(NoSuchElementException.class, reader::nextBatch); + assertEquals(NUM_RECORDS, recordCount, "The number of records in the file does not match"); + } + @ParameterizedTest @MethodSource("fileSystemConfigProvider") public void seekFile(ReaderFsTestConfig fsConfig) { - FileReader reader = fsConfig.getReader(); + FileReader reader = getReader(fsConfig.getFs(), fsConfig.getDataFile(), getReaderConfig()); int recordIndex = NUM_RECORDS / 2; reader.seek(fsConfig.offsetsByIndex().get(recordIndex)); assertTrue(reader.hasNext()); diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReaderTest.java index 2b23c99..0580ec2 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReaderTest.java @@ -39,22 +39,12 @@ protected Path createDataFile(ReaderFsTestConfig fsConfig, Object... 
args) throw try { writer.append(key, value); writer.sync(); + fsConfig.offsetsByIndex().put(index, (long) index); } catch (IOException ioe) { throw new RuntimeException(ioe); } }); } - try (SequenceFile.Reader reader = new SequenceFile.Reader(fs.getConf(), - SequenceFile.Reader.file(new Path(seqFile.getAbsolutePath())))) { - Writable key = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), fs.getConf()); - Writable value = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), fs.getConf()); - int index = 0; - long pos = reader.getPosition() - 1; - while (reader.next(key, value)) { - fsConfig.offsetsByIndex().put(index++, pos); - pos = reader.getPosition(); - } - } Path path = new Path(new Path(fsConfig.getFsUri()), seqFile.getName()); fs.moveFromLocalFile(new Path(seqFile.getAbsolutePath()), path); return path; diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/PolicyTestBase.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/PolicyTestBase.java index 237a236..0614283 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/PolicyTestBase.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/PolicyTestBase.java @@ -252,14 +252,15 @@ public void execPolicyBatchesFiles(PolicyFsTestConfig fsConfig) throws IOExcepti String firstPath = metadata.getPath(); FileReader reader = policy.offer(metadata, new HashMap() {{ put("offset", "1"); - put("fileSizeBytes", "0"); + put("file-size", "0"); }}); assertFalse(reader.hasNext()); reader.seek(1000L); + assertThrows(NoSuchElementException.class, reader::next); reader.close(); assertEquals(0L, reader.currentOffset()); assertEquals(metadata.getPath(), reader.getFilePath().toString()); - assertThrows(NoSuchElementException.class, reader::next); + assertThrows(ConnectException.class, reader::next); assertFalse(it.hasNext()); // Second batch of files (1 file) diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/task/FsSourceTaskTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/task/FsSourceTaskTest.java index dd05ba8..d76f82f 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/task/FsSourceTaskTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/task/FsSourceTaskTest.java @@ -4,6 +4,7 @@ import com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig; import com.github.mmolimar.kafka.connect.fs.file.reader.AvroFileReader; import com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader; +import com.github.mmolimar.kafka.connect.fs.policy.CronPolicy; import com.github.mmolimar.kafka.connect.fs.policy.Policy; import com.github.mmolimar.kafka.connect.fs.policy.SimplePolicy; import org.apache.hadoop.fs.FileSystem; @@ -31,6 +32,7 @@ import java.io.IOException; import java.io.PrintWriter; import java.time.Duration; +import java.time.LocalDateTime; import java.util.*; import java.util.concurrent.atomic.AtomicInteger; import java.util.stream.IntStream; @@ -96,11 +98,12 @@ public void initTask() { executionNumber.addAndGet(1); Map, Map> map = new HashMap<>(); - captured.forEach(part ->{ + captured.forEach(part -> { if (((String) (part.get("path"))).endsWith(FILE_ALREADY_PROCESSED)) { map.put(part, new HashMap() {{ put("offset", (long) NUM_RECORDS); - put("fileSizeBytes", NUM_BYTES_PER_FILE); + put("eof", true); + put("file-size", NUM_BYTES_PER_FILE); }}); } else { map.put(part, new HashMap() {{ @@ -181,6 +184,77 @@ public void oneFilePerFs(TaskFsTestConfig fsConfig) throws IOException { assertNull(fsConfig.getTask().poll()); } + 
@ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void oneFilePerFsWithFileReaderInBatch(TaskFsTestConfig fsConfig) throws IOException { + for (Path dir : fsConfig.getDirectories()) { + Path dataFile = new Path(dir, System.nanoTime() + ".txt"); + createDataFile(fsConfig.getFs(), dataFile); + // this file does not match the regexp + fsConfig.getFs().createNewFile(new Path(dir, String.valueOf(System.nanoTime()))); + } + + int readerBatchSize = 8; + Map taskConfig = fsConfig.getTaskConfig(); + taskConfig.put(FsSourceTaskConfig.POLICY_CLASS, CronPolicy.class.getName()); + taskConfig.put(CronPolicy.CRON_POLICY_EXPRESSION, "0/2 * * * * ?"); + taskConfig.put(CronPolicy.CRON_POLICY_END_DATE, LocalDateTime.now().plusDays(1).toString()); + taskConfig.put(FsSourceTaskConfig.FILE_READER_BATCH_SIZE, String.valueOf(readerBatchSize)); + taskConfig.put(FsSourceTaskConfig.POLL_INTERVAL_MS, "1000"); + fsConfig.getTask().start(taskConfig); + + List records = fsConfig.getTask().poll(); + assertEquals(((readerBatchSize % (NUM_RECORDS / 2)) * fsConfig.getDirectories().size()), records.size()); + checkRecords(records); + + records = fsConfig.getTask().poll(); + assertEquals(((readerBatchSize % (NUM_RECORDS / 2)) * fsConfig.getDirectories().size()), records.size()); + checkRecords(records); + + // policy has ended + fsConfig.getTask().stop(); + assertNull(fsConfig.getTask().poll()); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void oneFilePerFsWithBatchAndFileReaderInBatch(TaskFsTestConfig fsConfig) throws IOException { + for (Path dir : fsConfig.getDirectories()) { + Path dataFile = new Path(dir, System.nanoTime() + ".txt"); + createDataFile(fsConfig.getFs(), dataFile); + // this file does not match the regexp + fsConfig.getFs().createNewFile(new Path(dir, String.valueOf(System.nanoTime()))); + } + + int readerBatchSize = 8; + Map taskConfig = fsConfig.getTaskConfig(); + taskConfig.put(FsSourceTaskConfig.POLICY_CLASS, CronPolicy.class.getName()); + taskConfig.put(FsSourceTaskConfig.POLICY_BATCH_SIZE, "1"); + taskConfig.put(CronPolicy.CRON_POLICY_EXPRESSION, "0/2 * * * * ?"); + taskConfig.put(CronPolicy.CRON_POLICY_END_DATE, LocalDateTime.now().plusDays(1).toString()); + taskConfig.put(FsSourceTaskConfig.FILE_READER_BATCH_SIZE, String.valueOf(readerBatchSize)); + taskConfig.put(FsSourceTaskConfig.POLL_INTERVAL_MS, "1000"); + fsConfig.getTask().start(taskConfig); + + List firstBatch = fsConfig.getTask().poll(); + assertEquals(readerBatchSize % (NUM_RECORDS / 2), firstBatch.size()); + checkRecords(firstBatch); + + List secondBatch = fsConfig.getTask().poll(); + assertEquals(readerBatchSize % (NUM_RECORDS / 2), secondBatch.size()); + checkRecords(secondBatch); + assertEquals(firstBatch.size(), secondBatch.size()); + IntStream.range(0, firstBatch.size()) + .forEach(index -> assertNotEquals( + firstBatch.get(index).sourcePartition().get("path"), + secondBatch.get(index).sourcePartition().get("path") + )); + + // policy has ended + fsConfig.getTask().stop(); + assertNull(fsConfig.getTask().poll()); + } + @ParameterizedTest @MethodSource("fileSystemConfigProvider") public void skipsFetchingFileIfByteOffsetExistsAndMatchesFileLength(TaskFsTestConfig fsConfig) throws IOException { @@ -432,5 +506,4 @@ private File fillDataFile() throws IOException { } return txtFile; } - } From 6b5dd4ed59d6dac1d3e5fe7e412531a98aa364c0 Mon Sep 17 00:00:00 2001 From: Mario Molina Date: Sat, 4 Jul 2020 15:19:17 -0500 Subject: [PATCH 14/19] Logging improvements --- 
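Note on the pattern applied across the classes below: every log message is prefixed with the owning component (passed as the first "{}" argument) and toString() is overridden to return the simple class name. A minimal sketch of that pattern, assuming an SLF4J logger as used in the rest of the project; the class name and message here are illustrative only, not taken from this patch:

    import org.slf4j.Logger;
    import org.slf4j.LoggerFactory;

    public class ExampleComponent {
        private static final Logger log = LoggerFactory.getLogger(ExampleComponent.class);

        public void doWork() {
            // The leading "{}" is filled with this.toString(), so every log line
            // carries the component name without hard-coding it in each message.
            log.info("{} Doing some work...", this);
        }

        @Override
        public String toString() {
            return this.getClass().getSimpleName();
        }
    }
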
.../kafka/connect/fs/FsSourceConnector.java | 11 +++- .../kafka/connect/fs/FsSourceTask.java | 40 +++++++----- .../kafka/connect/fs/file/FileMetadata.java | 16 ++++- .../fs/file/reader/AbstractFileReader.java | 7 ++- .../fs/file/reader/JsonFileReader.java | 3 +- .../connect/fs/policy/AbstractPolicy.java | 13 +++- .../fs/policy/HdfsFileWatcherPolicy.java | 62 ++++++++++++------- .../kafka/connect/fs/policy/SleepyPolicy.java | 2 +- .../kafka/connect/fs/util/Version.java | 2 +- .../connect/fs/policy/PolicyTestBase.java | 17 +++-- src/test/resources/log4j.properties | 2 +- 11 files changed, 119 insertions(+), 56 deletions(-) diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceConnector.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceConnector.java index 9482e2a..3c7c115 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceConnector.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceConnector.java @@ -28,7 +28,7 @@ public String version() { @Override public void start(Map properties) { - log.info("Starting FsSourceConnector..."); + log.info("{} Starting connector...", this); try { config = new FsSourceConnectorConfig(properties); } catch (ConfigException ce) { @@ -59,14 +59,14 @@ public List> taskConfigs(int maxTasks) { taskConfigs.add(taskProps); }); - log.debug("Partitions grouped as: {}", taskConfigs); + log.debug("{} Partitions grouped as: {}", this, taskConfigs); return taskConfigs; } @Override public void stop() { - log.info("Stopping FsSourceConnector."); + log.info("{} Stopping FsSourceConnector.", this); // Nothing to do } @@ -74,4 +74,9 @@ public void stop() { public ConfigDef config() { return FsSourceConnectorConfig.conf(); } + + @Override + public String toString() { + return this.getClass().getSimpleName(); + } } diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTask.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTask.java index f150cba..bb74d5e 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTask.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTask.java @@ -47,7 +47,7 @@ public String version() { @Override public void start(Map properties) { - log.info("Starting FS source task..."); + log.info("{} Starting FS source task...", this); try { config = new FsSourceTaskConfig(properties); if (config.getClass(FsSourceTaskConfig.POLICY_CLASS).isAssignableFrom(Policy.class)) { @@ -63,19 +63,19 @@ public void start(Map properties) { policy = ReflectionUtils.makePolicy(policyClass, config); pollInterval = config.getInt(FsSourceTaskConfig.POLL_INTERVAL_MS); } catch (ConfigException ce) { - log.error("Couldn't start FsSourceTask.", ce); - throw new ConnectException("Couldn't start FsSourceTask due to configuration error: " + ce.getMessage(), ce); + log.error("{} Couldn't start FS source task: {}", this, ce.getMessage(), ce); + throw new ConnectException("Couldn't start FS source task due to configuration error: " + ce.getMessage(), ce); } catch (Exception e) { - log.error("Couldn't start FsSourceConnector.", e); + log.error("{} Couldn't start FS source task: {}", this, e.getMessage(), e); throw new ConnectException("A problem has occurred reading configuration: " + e.getMessage(), e); } - log.info("FS source task started with policy [{}].", policy.getClass().getName()); + log.info("{} FS source task started with policy [{}].", this, policy.getClass().getName()); } @Override public List poll() { while (!stop.get() && policy != null 
&& !policy.hasEnded()) { - log.trace("Polling for new data..."); + log.trace("{} Polling for new data...", this); Function> makePartitionKey = (FileMetadata metadata) -> Collections.singletonMap("path", metadata.getPath()); @@ -89,7 +89,7 @@ public List poll() { Map partitionKey = makePartitionKey.apply(metadata); Map offset = Optional.ofNullable(offsets.get(partitionKey)).orElse(new HashMap<>()); try (FileReader reader = policy.offer(metadata, offset)) { - log.info("Processing records for file {}.", metadata); + log.info("{} Processing records for file {}...", this, metadata); while (reader.hasNext()) { Struct record = reader.next(); // TODO change FileReader interface in the next major version @@ -99,20 +99,21 @@ public List poll() { } } catch (ConnectException | IOException e) { // when an exception happens reading a file, the connector continues - log.error("Error reading file [{}]. Keep going...", metadata.getPath(), e); + log.warn("{} Error reading file [{}]: {}. Keep going...", + this, metadata.getPath(), e.getMessage(), e); } - log.debug("Read [{}] records from file [{}].", records.size(), metadata.getPath()); + log.debug("{} Read [{}] records from file [{}].", this, records.size(), metadata.getPath()); return records; }).flatMap(Collection::stream).collect(Collectors.toList()); - log.debug("Returning [{}] records in execution number [{}] for policy [{}].", - totalRecords.size(), policy.getExecutions(), policy.getClass().getName()); + log.debug("{} Returning [{}] records in execution number [{}] for policy [{}].", + this, totalRecords.size(), policy.getExecutions(), policy.getClass().getName()); return totalRecords; } if (pollInterval > 0) { - log.trace("Waiting [{}] ms for next poll.", pollInterval); + log.trace("{} Waiting [{}] ms for next poll.", this, pollInterval); time.sleep(pollInterval); } return null; @@ -124,9 +125,9 @@ private Stream filesToProcess() { .filter(metadata -> metadata.getLen() > 0); } catch (IOException | ConnectException e) { // when an exception happens executing the policy, the connector continues - log.error("Cannot retrieve files to process from the FS: {}. " + - "There was an error executing the policy but the task tolerates this and continues.", - policy.getURIs(), e); + log.error("{} Cannot retrieve files to process from the FS: [{}]. 
" + + "There was an error executing the policy but the task tolerates this and continues: {}", + this, policy.getURIs(), e.getMessage(), e); return Stream.empty(); } } @@ -152,16 +153,21 @@ private SourceRecord convert(FileMetadata metadata, long offset, boolean eof, St @Override public void stop() { - log.info("Stopping FS source task..."); + log.info("{} Stopping FS source task...", this); stop.set(true); synchronized (this) { if (policy != null) { try { policy.close(); } catch (IOException ioe) { - log.warn("Error closing policy [{}].", policy.getClass().getName(), ioe); + log.warn("{} Error closing policy: {}", this, ioe.getMessage(), ioe); } } } } + + @Override + public String toString() { + return this.getClass().getSimpleName(); + } } diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/FileMetadata.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/FileMetadata.java index 669b681..6ea8996 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/FileMetadata.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/FileMetadata.java @@ -1,6 +1,7 @@ package com.github.mmolimar.kafka.connect.fs.file; import java.util.List; +import java.util.Optional; public class FileMetadata { private String path; @@ -8,7 +9,7 @@ public class FileMetadata { private List blocks; public FileMetadata(String path, long length, List blocks) { - this.path = path; + this.path = Optional.ofNullable(path).orElse(""); this.length = length; this.blocks = blocks; } @@ -42,7 +43,7 @@ public boolean equals(Object object) { } public int hashCode() { - return path == null ? 0 : path.hashCode(); + return path.hashCode(); } @@ -57,6 +58,17 @@ public BlockInfo(long offset, long length, boolean corrupt) { this.corrupt = corrupt; } + @Override + public boolean equals(Object object) { + if (this == object) return true; + if (!(object instanceof BlockInfo)) return false; + + BlockInfo blockInfo = (BlockInfo) object; + return this.offset == blockInfo.offset && + this.length == blockInfo.length && + this.corrupt == blockInfo.corrupt; + } + @Override public String toString() { return String.format("[offset = %s, length = %s, corrupt = %s]", offset, length, corrupt); diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AbstractFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AbstractFileReader.java index e7ca7da..c698913 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AbstractFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AbstractFileReader.java @@ -38,7 +38,7 @@ public AbstractFileReader(FileSystem fs, Path filePath, ReaderAdapter adapter this.offset = 0; configure(readerConfig(config)); - log.trace("Initialized file reader [{}] for file [{}].", getClass().getName(), filePath); + log.trace("{} Initialized file reader with batch size [{}] for file [{}].", this, this.batchSize, this.filePath); } protected final Map readerConfig(Map config) { @@ -134,6 +134,11 @@ public final void seek(long offset) { } } + @Override + public String toString() { + return this.getClass().getSimpleName(); + } + protected ReaderAdapter getAdapter() { return adapter; } diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReader.java index c89c741..f1440b0 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReader.java +++ 
b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReader.java @@ -68,7 +68,8 @@ protected void configure(Map config) { mapper.configure(DeserializationFeature.valueOf(feature), Boolean.parseBoolean(entry.getValue())); } else { - log.warn("Ignoring deserialization configuration '{}' due to it does not exist.", feature); + log.warn("{} Ignoring deserialization configuration [{}] due to it does not exist.", + this, feature); } }); } diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/AbstractPolicy.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/AbstractPolicy.java index 722889f..e951977 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/AbstractPolicy.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/AbstractPolicy.java @@ -56,6 +56,8 @@ public AbstractPolicy(FsSourceTaskConfig conf) throws IOException { logAll(customConfigs); configFs(customConfigs); configPolicy(customConfigs); + + log.info("{} Initialized policy with batch size [{}].", this, batchSize); } private Map customConfigs() { @@ -196,7 +198,6 @@ public final long getExecutions() { } FileMetadata toMetadata(LocatedFileStatus fileStatus) { - List blocks = Arrays.stream(fileStatus.getBlockLocations()) .map(block -> new FileMetadata.BlockInfo(block.getOffset(), block.getLength(), block.isCorrupt())) .collect(Collectors.toList()); @@ -223,10 +224,11 @@ current, new Path(metadata.getPath()), conf.originals() long fileSize = Long.parseLong(offsetMap.getOrDefault("file-size", "0").toString()); boolean eof = Boolean.parseBoolean(offsetMap.getOrDefault("eof", "false").toString()); if (metadata.getLen() == fileSize && eof) { - log.info("Skipping file [{}] due to it was already processed.", metadata.getPath()); + log.info("{} Skipping file [{}] due to it was already processed.", this, metadata.getPath()); return emptyFileReader(new Path(metadata.getPath())); } else { - log.info("Seeking to offset [{}] for file [{}].", offsetMap.get("offset"), metadata.getPath()); + log.info("{} Seeking to offset [{}] for file [{}].", + this, offsetMap.get("offset"), metadata.getPath()); FileReader reader = makeReader.get(); reader.seek(offset); return reader; @@ -244,6 +246,11 @@ public void close() throws IOException { } } + @Override + public String toString() { + return this.getClass().getSimpleName(); + } + private Iterator concat(final Iterator it1, final Iterator it2) { return new Iterator() { diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/HdfsFileWatcherPolicy.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/HdfsFileWatcherPolicy.java index 8d2f0d6..b89923c 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/HdfsFileWatcherPolicy.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/HdfsFileWatcherPolicy.java @@ -64,7 +64,7 @@ protected void configPolicy(Map customConfigs) { "number (long). 
Got: " + customConfigs.get(HDFS_FILE_WATCHER_POLICY_RETRY_MS)); } this.fsEvenStream = new HashMap<>(); - fileSystems.stream() + this.fileSystems.stream() .filter(fs -> fs.getWorkingDirectory().toString().startsWith(URI_PREFIX)) .forEach(fs -> { try { @@ -89,7 +89,9 @@ public Iterator listFiles(FileSystem fs) { Set files = new HashSet<>(); FileMetadata metadata; while ((metadata = fileQueue.poll()) != null) { - files.add(metadata); + FileMetadata fm = metadata; + files.removeIf(f -> f.getPath().equals(fm.getPath())); + files.add(fm); } return files.iterator(); } @@ -116,6 +118,11 @@ public void close() throws IOException { super.close(); } + @Override + public String toString() { + return this.getClass().getSimpleName(); + } + private class EventStreamThread extends Thread { private final FileSystem fs; private final HdfsAdmin admin; @@ -139,62 +146,75 @@ public void run() { EventBatch batch = eventStream.poll(); if (batch == null) continue; + Set files = new HashSet<>(); for (Event event : batch.getEvents()) { switch (event.getEventType()) { case CREATE: if (!((Event.CreateEvent) event).getPath().endsWith("._COPYING_")) { - enqueue(((Event.CreateEvent) event).getPath()); + files.add(((Event.CreateEvent) event).getPath()); } break; case APPEND: if (!((Event.AppendEvent) event).getPath().endsWith("._COPYING_")) { - enqueue(((Event.AppendEvent) event).getPath()); + files.add(((Event.AppendEvent) event).getPath()); } break; case RENAME: if (((Event.RenameEvent) event).getSrcPath().endsWith("._COPYING_")) { - enqueue(((Event.RenameEvent) event).getDstPath()); + files.add(((Event.RenameEvent) event).getDstPath()); } break; case CLOSE: if (!((Event.CloseEvent) event).getPath().endsWith("._COPYING_")) { - enqueue(((Event.CloseEvent) event).getPath()); + files.add(((Event.CloseEvent) event).getPath()); } break; default: break; } } + enqueue(files); } } catch (IOException ioe) { if (retrySleepMs > 0) { time.sleep(retrySleepMs); } else { - log.warn("Error watching path [{}]. Stopping it...", fs.getWorkingDirectory(), ioe); + log.warn("{} Error watching path [{}]: {}. 
Stopping it...", + this, fs.getWorkingDirectory(), ioe.getMessage(), ioe); throw new IllegalWorkerStateException(ioe); } } catch (Exception e) { - log.warn("Stopping watcher due to an unexpected exception when watching path [{}].", - fs.getWorkingDirectory(), e); + log.warn("{} Stopping watcher due to an unexpected exception when watching path [{}]: {}", + this, fs.getWorkingDirectory(), e.getMessage(), e); throw new IllegalWorkerStateException(e); } } } - private void enqueue(String path) throws IOException { - Path filePath = new Path(path); - if (!fs.exists(filePath) || fs.getFileStatus(filePath) == null) { - log.info("Cannot enqueue file [{}] because it does not exist but got an event from the FS", filePath); - return; - } + private void enqueue(Set paths) throws IOException { + for (String path : paths) { + Path filePath = new Path(path); + if (!fs.exists(filePath) || fs.getFileStatus(filePath) == null) { + log.info("{} Cannot enqueue file [{}] because it does not exist but got an event from the FS", + this, filePath); + return; + } - log.debug("Enqueuing file to process [{}]", filePath); - RemoteIterator it = fs.listFiles(filePath, false); - while (it.hasNext()) { - LocatedFileStatus status = it.next(); - if (!status.isFile() || !fileRegexp.matcher(status.getPath().getName()).find()) continue; - fileQueue.offer(toMetadata(status)); + RemoteIterator it = fs.listFiles(filePath, false); + while (it.hasNext()) { + LocatedFileStatus status = it.next(); + if (status.isFile() && fileRegexp.matcher(status.getPath().getName()).find()) { + FileMetadata metadata = toMetadata(status); + log.debug("{} Enqueuing file to process [{}].", this, metadata.getPath()); + fileQueue.offer(metadata); + } + } } } + + @Override + public String toString() { + return this.getClass().getSimpleName(); + } } } diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/SleepyPolicy.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/SleepyPolicy.java index 9eea1d3..32c6e12 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/SleepyPolicy.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/SleepyPolicy.java @@ -65,7 +65,7 @@ private void sleepIfApply() { Thread.sleep(sleep / sleepFraction); counter++; } catch (InterruptedException ie) { - log.warn("An interrupted exception has occurred.", ie); + log.warn("{} An interrupted exception has occurred when sleeping: {}", this, ie.getMessage(), ie); } } } diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/util/Version.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/util/Version.java index 7e94e04..1807428 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/util/Version.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/util/Version.java @@ -15,7 +15,7 @@ public class Version { props.load(Version.class.getResourceAsStream("/kafka-connect-fs-version.properties")); version = props.getProperty("version", version).trim(); } catch (Exception e) { - log.warn("Error while loading version:", e); + log.warn("Error while loading version: {}", e.getMessage(), e); } } diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/PolicyTestBase.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/PolicyTestBase.java index 0614283..c511311 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/PolicyTestBase.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/PolicyTestBase.java @@ -17,8 +17,7 @@ import 
org.junit.jupiter.params.provider.Arguments; import org.junit.jupiter.params.provider.MethodSource; -import java.io.FileNotFoundException; -import java.io.IOException; +import java.io.*; import java.time.LocalDateTime; import java.time.format.DateTimeFormatter; import java.util.*; @@ -236,7 +235,12 @@ public void execPolicyBatchesFiles(PolicyFsTestConfig fsConfig) throws IOExcepti FileSystem fs = fsConfig.getFs(); for (Path dir : fsConfig.getDirectories()) { - fs.createNewFile(new Path(dir, System.nanoTime() + ".txt")); + File tmp = File.createTempFile("test-", ".txt"); + try (PrintWriter writer = new PrintWriter(new FileOutputStream(tmp))) { + writer.append("test"); + } + fs.moveFromLocalFile(new Path(tmp.getAbsolutePath()), new Path(dir, System.nanoTime() + ".txt")); + // this file does not match the regexp fs.createNewFile(new Path(dir, System.nanoTime() + ".invalid")); @@ -249,10 +253,14 @@ public void execPolicyBatchesFiles(PolicyFsTestConfig fsConfig) throws IOExcepti // First batch of files (1 file) assertTrue(it.hasNext()); FileMetadata metadata = it.next(); + FileMetadata metadataFromFs = ((AbstractPolicy) policy) + .toMetadata(fs.listLocatedStatus(new Path(metadata.getPath())).next()); + assertEquals(metadata, metadataFromFs); String firstPath = metadata.getPath(); FileReader reader = policy.offer(metadata, new HashMap() {{ put("offset", "1"); - put("file-size", "0"); + put("file-size", "4"); + put("eof", "true"); }}); assertFalse(reader.hasNext()); reader.seek(1000L); @@ -260,7 +268,6 @@ public void execPolicyBatchesFiles(PolicyFsTestConfig fsConfig) throws IOExcepti reader.close(); assertEquals(0L, reader.currentOffset()); assertEquals(metadata.getPath(), reader.getFilePath().toString()); - assertThrows(ConnectException.class, reader::next); assertFalse(it.hasNext()); // Second batch of files (1 file) diff --git a/src/test/resources/log4j.properties b/src/test/resources/log4j.properties index b926aa8..c39cf11 100644 --- a/src/test/resources/log4j.properties +++ b/src/test/resources/log4j.properties @@ -5,7 +5,7 @@ log4j.rootLogger=INFO, stdout log4j.appender.stdout=org.apache.log4j.ConsoleAppender log4j.appender.stdout.Target=System.out log4j.appender.stdout.layout=org.apache.log4j.PatternLayout -log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c:%L - %m%n +log4j.appender.stdout.layout.ConversionPattern=[%d] %p %m (%c)%n log4j.logger.com.github.mmolimar.kafka.connect.fs=TRACE log4j.logger.org.apache.hadoop=ERROR From 4e951acd8ce940cf13fc12fd2c57a20f9e8a8734 Mon Sep 17 00:00:00 2001 From: Mario Molina Date: Sat, 4 Jul 2020 15:40:35 -0500 Subject: [PATCH 15/19] Fix in policy tests --- .../mmolimar/kafka/connect/fs/policy/PolicyTestBase.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/PolicyTestBase.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/PolicyTestBase.java index c511311..9fbb429 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/PolicyTestBase.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/PolicyTestBase.java @@ -154,7 +154,7 @@ public void recursiveDirectory(PolicyFsTestConfig fsConfig) throws IOException, fs.createNewFile(new Path(tmpDir, System.nanoTime() + ".invalid")); // we wait till FS has registered the files - Thread.sleep(3000); + Thread.sleep(5000); } Iterator it = fsConfig.getPolicy().execute(); assertTrue(it.hasNext()); @@ -245,7 +245,7 @@ public void 
execPolicyBatchesFiles(PolicyFsTestConfig fsConfig) throws IOExcepti fs.createNewFile(new Path(dir, System.nanoTime() + ".invalid")); // we wait till FS has registered the files - Thread.sleep(3000); + Thread.sleep(5000); } Iterator it = policy.execute(); From f137ea109acfaacf17ef02bc26198d23e1f7cff4 Mon Sep 17 00:00:00 2001 From: Mario Molina Date: Sun, 5 Jul 2020 10:09:19 -0500 Subject: [PATCH 16/19] SFTP support --- pom.xml | 8 +++++++- .../mmolimar/kafka/connect/fs/policy/AbstractPolicy.java | 7 +++++-- .../mmolimar/kafka/connect/fs/util/ReflectionUtils.java | 2 +- 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/pom.xml b/pom.xml index a3e1c0c..a51e700 100644 --- a/pom.xml +++ b/pom.xml @@ -19,6 +19,7 @@ 1.6.3 2.8.4 9.0.2 + 0.1.54 5.6.2 4.2 2.0.7 @@ -90,6 +91,11 @@ cron-utils ${cron-utils.version} + + com.jcraft + jsch + ${jsch.version} + @@ -260,7 +266,7 @@ true - true + false http://packages.confluent.io/maven/ diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/AbstractPolicy.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/AbstractPolicy.java index e951977..9d23c04 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/AbstractPolicy.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/AbstractPolicy.java @@ -69,6 +69,7 @@ private Map customConfigs() { private void configFs(Map customConfigs) throws IOException { for (String uri : this.conf.getFsUris()) { Configuration fsConfig = new Configuration(); + fsConfig.set("fs.sftp.impl", "org.apache.hadoop.fs.sftp.SFTPFileSystem"); customConfigs.entrySet().stream() .filter(entry -> entry.getKey().startsWith(FsSourceTaskConfig.POLICY_PREFIX_FS)) .forEach(entry -> fsConfig.set(entry.getKey().replace(FsSourceTaskConfig.POLICY_PREFIX_FS, ""), @@ -104,7 +105,9 @@ private String convert(String uri) { @Override public List getURIs() { List uris = new ArrayList<>(); - fileSystems.forEach(fs -> uris.add(fs.getWorkingDirectory().toString())); + fileSystems.forEach(fs -> + uris.add(Optional.ofNullable(fs.getWorkingDirectory()).orElse(new Path("./")).toString()) + ); return uris; } @@ -210,7 +213,7 @@ public FileReader offer(FileMetadata metadata, Map offsetMap) { FileSystem current = fileSystems.stream() .filter(fs -> metadata.getPath().startsWith(fs.getWorkingDirectory().toString())) .findFirst() - .orElse(null); + .orElse(fileSystems.stream().findFirst().orElseThrow(() -> new ConnectException(("Invalid FS.")))); Supplier makeReader = () -> ReflectionUtils.makeReader( (Class) conf.getClass(FsSourceTaskConfig.FILE_READER_CLASS), diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/util/ReflectionUtils.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/util/ReflectionUtils.java index 04fa75c..9473837 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/util/ReflectionUtils.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/util/ReflectionUtils.java @@ -27,8 +27,8 @@ public static Policy makePolicy(Class clazz, FsSourceTaskConfi private static T make(Class clazz, Object... 
args) { try { Class[] constClasses = Arrays.stream(args).map(Object::getClass).toArray(Class[]::new); - Constructor constructor = ConstructorUtils.getMatchingAccessibleConstructor(clazz, constClasses); + return constructor.newInstance(args); } catch (IllegalAccessException | InstantiationException | From 8a65ec4df66a29f91343ce8033eba986947ae6a7 Mon Sep 17 00:00:00 2001 From: Mario Molina Date: Sun, 5 Jul 2020 10:25:37 -0500 Subject: [PATCH 17/19] Upgrade dependencies --- pom.xml | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 53 insertions(+), 7 deletions(-) diff --git a/pom.xml b/pom.xml index a51e700..68fad3b 100644 --- a/pom.xml +++ b/pom.xml @@ -8,13 +8,48 @@ jar kafka-connect-fs + + Kafka Connect FileSystem Connector is a source connector for reading records from different + sort of file formats and from different file system types and load them into Kafka. + + https://github.com/mmolimar/kafka-connect-fs + + + + Apache License 2.0 + https://github.com/mmolimar/kafka-connect-fs/blob/master/LICENSE + repo + + + + + + Mario Molina + https://github.com/mmolimar + + Committer + + + + + + scm:git:https://github.com/mmolimar/kafka-connect-fs.git + scm:git:git@github.com:mmolimar/kafka-connect-fs.git + https://github.com/mmolimar/kafka-connect-fs + HEAD + + + + github + https://github.com/mmolimar/kafka-connect-fs/issues + UTF-8 2.5.0 5.5.0 3.2.1 - hadoop3-2.1.2 + hadoop3-2.1.3 1.11.0 1.6.3 2.8.4 @@ -27,10 +62,10 @@ ${maven-compiler.source} 3.2.0 3.8.1 - 3.2.0 + 3.3.0 0.8.5 4.3.0 - 3.0.0-M4 + 3.0.0-M5 0.11.3 @@ -204,25 +239,32 @@ kafka-connect-fs Kafka Connect FileSystem - https://kafka-connect-fs.readthedocs.io/ + https://kafka-connect-fs.readthedocs.io https://github.com/mmolimar/kafka-connect-fs - Kafka Connect FileSystem Connector is a source connector for reading records from files - in the file systems specified and load them into Kafka. + Kafka Connect FileSystem Connector is a source connector for reading records from + different sort of file formats and from different file system types and load them + into Kafka. + https://github.com/mmolimar/kafka-connect-fs + Mario Molina This connector is supported by the open source community. 
https://github.com/mmolimar/kafka-connect-fs/issues + mmolimar user Mario Molina https://github.com/mmolimar + mmolimar kafka-connect-fs ${project.version} + source + filesystem files @@ -233,18 +275,22 @@ google gcs azure + sftp + ftp txt csv tsv json avro parquet + orc sequence - + atLeastOnce + true From 44288dc80f6d3ea4924bc7b768964b391c449cda Mon Sep 17 00:00:00 2001 From: Mario Molina Date: Sun, 5 Jul 2020 15:44:05 -0500 Subject: [PATCH 18/19] Updating documentation --- config/kafka-connect-fs.properties | 2 ++ docs/source/config_options.rst | 7 +++++++ docs/source/connector.rst | 6 +++++- docs/source/faq.rst | 3 +++ 4 files changed, 17 insertions(+), 1 deletion(-) diff --git a/config/kafka-connect-fs.properties b/config/kafka-connect-fs.properties index aab1ae6..27b9eb9 100644 --- a/config/kafka-connect-fs.properties +++ b/config/kafka-connect-fs.properties @@ -6,4 +6,6 @@ topic=mytopic policy.class=com.github.mmolimar.kafka.connect.fs.policy.SimplePolicy policy.recursive=true policy.regexp=^.*\.txt$ +policy.batch_size=0 file_reader.class=com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader +file_reader.batch_size=0 diff --git a/docs/source/config_options.rst b/docs/source/config_options.rst index b160dda..0abb591 100644 --- a/docs/source/config_options.rst +++ b/docs/source/config_options.rst @@ -117,6 +117,13 @@ General config properties for this connector. * Type: string * Importance: high +``file_reader.batch_size`` + Number of records to process at a time. Non-positive values disable batching. + + * Type: int + * Default: ``0`` + * Importance: medium + ``file_reader..`` This represents custom properties you can include based on the file reader class specified. diff --git a/docs/source/connector.rst b/docs/source/connector.rst index 476aa7b..948dcb7 100644 --- a/docs/source/connector.rst +++ b/docs/source/connector.rst @@ -15,7 +15,7 @@ Among others, these are some file systems it supports: * S3. * Google Cloud Storage. * Azure Blob Storage & Azure Data Lake Store. -* FTP. +* FTP & SFTP. * WebHDFS. * Local File System. * Hadoop Archive File System. @@ -52,7 +52,9 @@ The ``kafka-connect-fs.properties`` file defines the following properties as req policy.class= policy.recursive=true policy.regexp=.* + policy.batch_size=0 file_reader.class= + file_reader.batch_size=0 #. The connector name. #. Class indicating the connector. @@ -65,8 +67,10 @@ The ``kafka-connect-fs.properties`` file defines the following properties as req ``com.github.mmolimar.kafka.connect.fs.policy.Policy`` interface). #. Flag to activate traversed recursion in subdirectories when listing files. #. Regular expression to filter files from the FS. +#. Number of files that should be handled at a time. Non-positive values disable batching. #. File reader class to read files from the FS (must implement ``com.github.mmolimar.kafka.connect.fs.file.reader.FileReader`` interface). +#. Number of records to process at a time. Non-positive values disable batching. A more detailed information about these properties can be found :ref:`here`. diff --git a/docs/source/faq.rst b/docs/source/faq.rst index 1041bc4..489cde9 100644 --- a/docs/source/faq.rst +++ b/docs/source/faq.rst @@ -42,6 +42,9 @@ Obviously, this depends of the files in the FS(s) but having several URIs in the connector might be a good idea to adjust the number of tasks to process those URIs in parallel ( ``tasks.max`` connector property). 
+Also, setting the properties ``policy.batch_size`` and/or ``file_reader.batch_size`` +might help when you have a huge number of files or very large files. + **I removed a file from the FS but the connector is still sending messages with the contents of that file.** From 25890da5088bdb00b13b8d5ce928a90d34c8d589 Mon Sep 17 00:00:00 2001 From: Mario Molina Date: Mon, 6 Jul 2020 08:56:27 -0500 Subject: [PATCH 19/19] Release version 1.1.0 --- docker-compose.yml | 2 +- docs/source/conf.py | 4 ++-- pom.xml | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index dd07fd5..d03f109 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -45,7 +45,7 @@ services: SCHEMA_REGISTRY_KAFKASTORE_CONNECTION_URL: 'zookeeper:2181' connect-fs: - image: mmolimar/kafka-connect-fs:1.1.0-SNAPSHOT + image: mmolimar/kafka-connect-fs:1.1.0 container_name: connect depends_on: - cp-kafka diff --git a/docs/source/conf.py b/docs/source/conf.py index f6edf0c..724450a 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -55,9 +55,9 @@ # built documents. # # The short X.Y version. -version = '1.0' +version = '1.1' # The full version, including alpha/beta/rc tags. -release = '1.0' +release = '1.1' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/pom.xml b/pom.xml index 68fad3b..8d1c2dc 100644 --- a/pom.xml +++ b/pom.xml @@ -4,7 +4,7 @@ com.github.mmolimar.kafka.connect kafka-connect-fs - 1.1.0-SNAPSHOT + 1.1.0 jar kafka-connect-fs
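Editor's note: to illustrate how the SFTP support and the two batching options introduced in this patch series fit together, a minimal connector configuration might look like the sketch below. It follows the layout of ``config/kafka-connect-fs.properties`` from PATCH 18; the SFTP host, credentials, directory and batch sizes are placeholders chosen for this example, not values taken from the patches above.

    name=FsSourceConnector
    connector.class=com.github.mmolimar.kafka.connect.fs.FsSourceConnector
    tasks.max=1
    # hypothetical SFTP endpoint; user, password, host and directory are placeholders
    fs.uris=sftp://user:password@sftp.example.com/remote/data
    topic=mytopic
    policy.class=com.github.mmolimar.kafka.connect.fs.policy.SimplePolicy
    policy.recursive=true
    policy.regexp=^.*\.txt$
    # handle 5 files at a time instead of listing everything in one go
    policy.batch_size=5
    file_reader.class=com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader
    # process up to 1000 records at a time from the current file
    file_reader.batch_size=1000

With the default value ``0`` (or any other non-positive value) both batching options stay disabled, which matches the connector's behaviour before the 1.1.0 release.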