CharsetDetector · adimosh · Jul 11, 2022 · Jul 11, 2022 · Jul 11, 2022 · Jul 12, 2022
diff --git a/appveyor.yml b/appveyor.yml
@@ -4,15 +4,15 @@ environment:
 version: 2.0.{build}
 
 clone_folder: c:\utfUnknown
-image: Visual Studio 2019
+image: Visual Studio 2022
 configuration: Release
 platform: Any CPU
 nuget:
   project_feed: true
 init:
 - git config --global core.autocrlf true
 build_script:
-- ps: dotnet build -c Release 
+- ps: dotnet build -c Release
 test_script:
 - ps: cd .\tests\
 - ps: dotnet test

diff --git a/example/ConsoleExample.csproj b/example/ConsoleExample.csproj
@@ -2,7 +2,7 @@
 
   <PropertyGroup>
     <OutputType>Exe</OutputType>
-    <TargetFramework>netcoreapp3.0</TargetFramework>
+    <TargetFramework>net6.0</TargetFramework>
   </PropertyGroup>
 
   <ItemGroup>

diff --git a/src/CharsetDetector.cs b/src/CharsetDetector.cs
@@ -41,16 +41,17 @@
 using System.Collections.Generic;
 using System.IO;
 using System.Linq;
-
+using System.Threading;
+using System.Threading.Tasks;
 using UtfUnknown.Core;
 using UtfUnknown.Core.Probers;
 
 namespace UtfUnknown
 {
     /// <summary>
-    /// Default implementation of charset detection interface. 
+    /// Default implementation of charset detection interface.
     /// The detector can be fed by a System.IO.Stream:
-    /// </summary>                
+    /// </summary>
     public class CharsetDetector
     {
         internal InputState InputState;
@@ -118,7 +119,7 @@ private CharsetDetector()
 
         /// <summary>
         /// Detect the character encoding form this byte array.
-        /// It searchs for BOM from bytes[0].
+        /// It searches for BOM from bytes[0].
         /// </summary>
         /// <param name="bytes">The byte array containing the text</param>
         /// <returns></returns>
@@ -135,8 +136,8 @@ public static DetectionResult DetectFromBytes(byte[] bytes)
         }
 
         /// <summary>
-        /// Detect the character encoding form this byte array. 
-        /// It searchs for BOM from bytes[offset].
+        /// Detect the character encoding from this byte array.
+        /// It searches for BOM from bytes[offset].
         /// </summary>
         /// <param name="bytes">The byte array containing the text</param>
         /// <param name="offset">The zero-based byte offset in buffer at which to begin reading the data from</param>
@@ -166,11 +167,9 @@ public static DetectionResult DetectFromBytes(byte[] bytes, int offset, int len)
             return detector.DataEnd();
         }
 
-#if !NETSTANDARD1_0
-
         /// <summary>
         /// Detect the character encoding by reading the stream.
-        /// 
+        ///
         /// Note: stream position is not reset before and after.
         /// </summary>
         /// <param name="stream">The steam. </param>
@@ -186,7 +185,7 @@ public static DetectionResult DetectFromStream(Stream stream)
 
         /// <summary>
         /// Detect the character encoding by reading the stream.
-        /// 
+        ///
         /// Note: stream position is not reset before and after.
         /// </summary>
         /// <param name="stream">The steam. </param>
@@ -210,6 +209,50 @@ public static DetectionResult DetectFromStream(Stream stream, long? maxBytesToRe
             return detector.DataEnd();
         }
 
+        /// <summary>
+        /// Detect the character encoding by asynchronously reading the stream.
+        /// Note: stream position is not reset before and after.
+        /// </summary>
+        /// <param name="stream">The stream.</param>
+        /// <param name="cancellationToken">The cancellation token for this operation.</param>
+        /// <returns>A task whose result is the detection result for the data read from <paramref name="stream"/>.</returns>
+        public static Task<DetectionResult> DetectFromStreamAsync(Stream stream, CancellationToken cancellationToken = default)
+        {
+            if (stream == null)
+            {
+                throw new ArgumentNullException(nameof(stream));
+            }
+
+            return DetectFromStreamAsync(stream, null, cancellationToken);
+        }
+
+        /// <summary>
+        /// Detect the character encoding by asynchronously reading the stream.
+        /// Note: stream position is not reset before and after.
+        /// </summary>
+        /// <param name="stream">The stream.</param>
+        /// <param name="maxBytesToRead">max bytes to read from <paramref name="stream"/>. If <c>null</c>, then no max</param>
+        /// <param name="cancellationToken">The cancellation token for this operation.</param>
+        /// <returns>A task whose result is the detection result for the data read from <paramref name="stream"/>.</returns>
+        /// <exception cref="ArgumentOutOfRangeException"><paramref name="maxBytesToRead"/> 0 or lower.</exception>
+        public static async Task<DetectionResult> DetectFromStreamAsync(Stream stream, long? maxBytesToRead, CancellationToken cancellationToken = default)
+        {
+            if (stream == null)
+            {
+                throw new ArgumentNullException(nameof(stream));
+            }
+
+            if (maxBytesToRead <= 0)
+            {
+                throw new ArgumentOutOfRangeException(nameof(maxBytesToRead));
+            }
+
+            var detector = new CharsetDetector();
+            // Library code: do not resume on the caller's synchronization context.
+            await ReadStreamAsync(stream, maxBytesToRead, detector, cancellationToken).ConfigureAwait(false);
+            return detector.DataEnd();
+        }
+
         private static void ReadStream(Stream stream, long? maxBytes, CharsetDetector detector)
         {
             const int bufferSize = 1024;
@@ -241,6 +284,37 @@ private static void ReadStream(Stream stream, long? maxBytes, CharsetDetector de
             }
         }
 
+        private static async Task ReadStreamAsync(Stream stream, long? maxBytes, CharsetDetector detector, CancellationToken cancellationToken = default)
+        {
+            const int bufferSize = 1024;
+            byte[] buff = new byte[bufferSize];
+            int read;
+            long readTotal = 0;
+            // Feeds the stream to the detector chunk by chunk until end of stream, the maxBytes cap, or detector._done.
+            var toRead = CalcToRead(maxBytes, readTotal, bufferSize);
+            // ConfigureAwait(false): library code must not resume on the caller's synchronization context.
+            while ((read = await stream.ReadAsync(buff, 0, toRead, cancellationToken).ConfigureAwait(false)) > 0)
+            {
+                detector.Feed(buff, 0, read);
+
+                if (maxBytes != null)
+                {
+                    readTotal += read;
+                    if (readTotal >= maxBytes)
+                    {
+                        return;
+                    }
+
+                    toRead = CalcToRead(maxBytes, readTotal, bufferSize);
+                }
+
+                if (detector._done)
+                {
+                    return;
+                }
+            }
+        }
+
         private static int CalcToRead(long? maxBytes, long readTotal, int bufferSize)
         {
             if (readTotal + bufferSize > maxBytes)
@@ -287,7 +361,42 @@ public static DetectionResult DetectFromFile(FileInfo file)
             }
         }
 
-#endif // !NETSTANDARD1_0
+        /// <summary>
+        /// Detect the character encoding of this file.
+        /// </summary>
+        /// <param name="filePath">Path to file</param>
+        /// <param name="cancellationToken">The cancellation token for this operation.</param>
+        /// <returns>A task whose result is the detection result for the file's content.</returns>
+        public static async Task<DetectionResult> DetectFromFileAsync(string filePath, CancellationToken cancellationToken = default)
+        {
+            if (filePath == null)
+            {
+                throw new ArgumentNullException(nameof(filePath));
+            }
+            // useAsync: true requests real asynchronous file I/O so ReadAsync is not emulated over blocking reads.
+            using (FileStream fs = new FileStream(filePath, FileMode.Open, FileAccess.Read, FileShare.ReadWrite, 4096, useAsync: true))
+            {
+                return await DetectFromStreamAsync(fs, cancellationToken).ConfigureAwait(false);
+            }
+        }
+        /// <summary>
+        /// Detect the character encoding of this file.
+        /// </summary>
+        /// <param name="file">The file</param>
+        /// <param name="cancellationToken">The cancellation token for this operation.</param>
+        /// <returns>A task whose result is the detection result for the file's content.</returns>
+        public static async Task<DetectionResult> DetectFromFileAsync(FileInfo file, CancellationToken cancellationToken = default)
+        {
+            if (file == null)
+            {
+                throw new ArgumentNullException(nameof(file));
+            }
+            // Await inside the using: returning the un-awaited task would dispose the stream while it is still being read.
+            using (FileStream fs = new FileStream(file.FullName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite, 4096, useAsync: true))
+            {
+                return await DetectFromStreamAsync(fs, cancellationToken).ConfigureAwait(false);
+            }
+        }
 
         protected virtual void Feed(byte[] buf, int offset, int len)
         {
@@ -403,10 +512,10 @@ private static string FindCharSetByBom(byte[] buf, int offset, int len)
 
             if (buf0 == 0xEF && buf1 == 0xBB && buf[offset + 2] == 0xBF)
                 return CodepageName.UTF8;
-            
+
             if (len < 4)
                 return null;
-            
+
             //Here, because anyway further more than 3 positions are checked.
             if (buf0 == 0x00 && buf1 == 0x00)
             {
@@ -422,24 +531,24 @@ private static string FindCharSetByBom(byte[] buf, int offset, int len)
             if (buf0 == 0x2B && buf1 == 0x2F && buf[offset + 2] == 0x76)
                 if (buf[offset + 3] == 0x38 || buf[offset + 3] == 0x39 || buf[offset + 3] == 0x2B || buf[offset + 3] == 0x2F)
                     return CodepageName.UTF7;
-            
+
             // Detect GB18030 with bom (see table in https://en.wikipedia.org/wiki/Byte_order_mark)
             // TODO: If you remove this check, GB18030Prober will still be defined as GB18030 -- It's feature or bug?
             if (buf0 == 0x84 && buf1 == 0x31 && buf[offset + 2] == 0x95 && buf[offset + 3] == 0x33)
                 return CodepageName.GB18030;
-            
+
             return null;
         }
 
         /// <summary>
-        /// Notify detector that no further data is available. 
+        /// Notify detector that no further data is available.
         /// </summary>
         private DetectionResult DataEnd()
         {
             if (!_gotData)
             {
-                // we haven't got any data yet, return immediately 
-                // caller program sometimes call DataEnd before anything has 
+                // we haven't got any data yet, return immediately
+                // caller program sometimes call DataEnd before anything has
                 // been sent to detector
                 return new DetectionResult();
             }
@@ -478,7 +587,7 @@ private DetectionResult DataEnd()
             return new DetectionResult();
         }
 
-        internal IList<CharsetProber> GetNewProbers()
+        private IList<CharsetProber> GetNewProbers()
         {
             switch (InputState)
             {
@@ -499,5 +608,4 @@ internal IList<CharsetProber> GetNewProbers()
             }
         }
     }
-}
-
+}
diff --git a/src/DetectionDetail.cs b/src/DetectionDetail.cs
@@ -57,7 +57,7 @@ public DetectionDetail(CharsetProber prober, TimeSpan? time = null)
         public string EncodingName { get; }
 
         /// <summary>
-        /// The detected encoding. 
+        /// The detected encoding.
         /// </summary>
         public Encoding Encoding { get; set; }
 
@@ -101,7 +101,7 @@ internal static Encoding GetEncoding(string encodingShortName)
                 (exception is ArgumentException || // unsupported name
                 exception is NotSupportedException)
             {
-#if NETSTANDARD && !NETSTANDARD1_0 || NETCOREAPP3_0
+#if NETSTANDARD || NET6_0
                 return CodePagesEncodingProvider.Instance.GetEncoding(encodingName);
 #else
                 return null;

diff --git a/src/UTF-unknown.csproj b/src/UTF-unknown.csproj
@@ -1,7 +1,7 @@
 <Project Sdk="Microsoft.NET.Sdk">
 
   <PropertyGroup>
-    <TargetFrameworks>net40;netstandard1.0;netstandard1.3;netstandard2.0;netcoreapp3.0</TargetFrameworks>
+    <TargetFrameworks>net462;net472;netstandard2.0;net6.0</TargetFrameworks>
   </PropertyGroup>
 
   <PropertyGroup>
@@ -16,9 +16,12 @@
     <OutputType>Library</OutputType>
   </PropertyGroup>
 
-  <ItemGroup Condition=" '$(TargetFramework)' == 'netstandard1.3' Or '$(TargetFramework)' == 'netstandard2.0' ">
+  <ItemGroup Condition=" '$(TargetFramework)' == 'netstandard2.0' ">
     <PackageReference Include="System.Text.Encoding.CodePages" Version="4.7.0" />
   </ItemGroup>
+  <ItemGroup Condition=" '$(TargetFramework)' == 'net6.0' ">
+    <PackageReference Include="System.Text.Encoding.CodePages" Version="6.0.0" />
+  </ItemGroup>
   <ItemGroup>
     <PackageReference Include="Microsoft.SourceLink.GitHub" Version="1.1.1" PrivateAssets="All" />
   </ItemGroup>
@@ -42,7 +45,7 @@ Features:
 - XML documentation included
 
  Compared to Ude:
- 
+
  - Refactor of API, namespaces and deadcode removal
  - Added some docs
  - Improve error handling