Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Async operations + updated frameworks #158

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions appveyor.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,15 @@ environment:
version: 2.0.{build}

clone_folder: c:\utfUnknown
image: Visual Studio 2019
image: Visual Studio 2022
configuration: Release
platform: Any CPU
nuget:
project_feed: true
init:
- git config --global core.autocrlf true
build_script:
- ps: dotnet build -c Release
- ps: dotnet build -c Release
test_script:
- ps: cd .\tests\
- ps: dotnet test
Expand Down
2 changes: 1 addition & 1 deletion example/ConsoleExample.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>netcoreapp3.0</TargetFramework>
<TargetFramework>net6.0</TargetFramework>
</PropertyGroup>

<ItemGroup>
Expand Down
150 changes: 129 additions & 21 deletions src/CharsetDetector.cs
Original file line number Diff line number Diff line change
Expand Up @@ -41,16 +41,17 @@
using System.Collections.Generic;
using System.IO;
using System.Linq;

using System.Threading;
using System.Threading.Tasks;
using UtfUnknown.Core;
using UtfUnknown.Core.Probers;

namespace UtfUnknown
{
/// <summary>
/// Default implementation of charset detection interface.
/// Default implementation of charset detection interface.
/// The detector can be fed by a System.IO.Stream:
/// </summary>
/// </summary>
public class CharsetDetector
{
internal InputState InputState;
Expand Down Expand Up @@ -118,7 +119,7 @@ private CharsetDetector()

/// <summary>
/// Detect the character encoding from this byte array.
/// It searchs for BOM from bytes[0].
/// It searches for BOM from bytes[0].
/// </summary>
/// <param name="bytes">The byte array containing the text</param>
/// <returns></returns>
Expand All @@ -135,8 +136,8 @@ public static DetectionResult DetectFromBytes(byte[] bytes)
}

/// <summary>
/// Detect the character encoding form this byte array.
/// It searchs for BOM from bytes[offset].
/// Detect the character encoding from this byte array.
/// It searches for BOM from bytes[offset].
/// </summary>
/// <param name="bytes">The byte array containing the text</param>
/// <param name="offset">The zero-based byte offset in buffer at which to begin reading the data from</param>
Expand Down Expand Up @@ -166,11 +167,9 @@ public static DetectionResult DetectFromBytes(byte[] bytes, int offset, int len)
return detector.DataEnd();
}

#if !NETSTANDARD1_0

/// <summary>
/// Detect the character encoding by reading the stream.
///
///
/// Note: stream position is not reset before and after.
/// </summary>
/// <param name="stream">The stream. </param>
Expand All @@ -186,7 +185,7 @@ public static DetectionResult DetectFromStream(Stream stream)

/// <summary>
/// Detect the character encoding by reading the stream.
///
///
/// Note: stream position is not reset before and after.
/// </summary>
/// <param name="stream">The stream. </param>
Expand All @@ -210,6 +209,50 @@ public static DetectionResult DetectFromStream(Stream stream, long? maxBytesToRe
return detector.DataEnd();
}

/// <summary>
/// Asynchronously detects the character encoding of the data in <paramref name="stream"/>,
/// without imposing a limit on the number of bytes read.
///
/// Note: the stream position is not reset before or after detection.
/// </summary>
/// <param name="stream">The stream to read from.</param>
/// <param name="cancellationToken">The cancellation token for this operation.</param>
/// <returns>A task that produces the detection result.</returns>
/// <exception cref="ArgumentNullException"><paramref name="stream"/> is <c>null</c> (thrown synchronously).</exception>
public static Task<DetectionResult> DetectFromStreamAsync(Stream stream, CancellationToken cancellationToken = default)
{
    if (stream != null)
    {
        // A null byte limit means "no maximum" for the three-argument overload.
        return DetectFromStreamAsync(stream, null, cancellationToken);
    }

    throw new ArgumentNullException(nameof(stream));
}

/// <summary>
/// Asynchronously detects the character encoding by reading the stream.
///
/// Note: stream position is not reset before and after.
/// </summary>
/// <param name="stream">The stream to read from.</param>
/// <param name="maxBytesToRead">max bytes to read from <paramref name="stream"/>. If <c>null</c>, then no max</param>
/// <param name="cancellationToken">The cancellation token for this operation.</param>
/// <returns>A task that produces the detection result.</returns>
/// <exception cref="ArgumentNullException"><paramref name="stream"/> is <c>null</c>.</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="maxBytesToRead"/> 0 or lower.</exception>
/// <remarks>
/// Because this method is <c>async</c>, argument exceptions are reported through the returned task.
/// </remarks>
public static async Task<DetectionResult> DetectFromStreamAsync(Stream stream, long? maxBytesToRead, CancellationToken cancellationToken = default)
{
    if (stream == null)
    {
        throw new ArgumentNullException(nameof(stream));
    }

    // Lifted comparison: evaluates to false when maxBytesToRead is null, so "no limit" passes.
    if (maxBytesToRead <= 0)
    {
        throw new ArgumentOutOfRangeException(nameof(maxBytesToRead));
    }

    var detector = new CharsetDetector();

    // Library code: do not capture the caller's synchronization context, so callers
    // that block on the returned task from a UI/ASP.NET context cannot deadlock.
    await ReadStreamAsync(stream, maxBytesToRead, detector, cancellationToken).ConfigureAwait(false);
    return detector.DataEnd();
}

private static void ReadStream(Stream stream, long? maxBytes, CharsetDetector detector)
{
const int bufferSize = 1024;
Expand Down Expand Up @@ -241,6 +284,37 @@ private static void ReadStream(Stream stream, long? maxBytes, CharsetDetector de
}
}

/// <summary>
/// Asynchronously reads <paramref name="stream"/> in chunks of up to 1024 bytes and feeds each
/// chunk to <paramref name="detector"/>. Reading stops when the stream returns no more data,
/// when <paramref name="maxBytes"/> bytes have been read, or when the detector reports it is done.
/// </summary>
/// <param name="stream">The stream to read from.</param>
/// <param name="maxBytes">Maximum number of bytes to read, or <c>null</c> for no limit.</param>
/// <param name="detector">The detector that receives the data.</param>
/// <param name="cancellationToken">The cancellation token passed to each read.</param>
private static async Task ReadStreamAsync(Stream stream, long? maxBytes, CharsetDetector detector, CancellationToken cancellationToken = default)
{
    const int bufferSize = 1024;
    byte[] buff = new byte[bufferSize];
    int read;
    long readTotal = 0;

    var toRead = CalcToRead(maxBytes, readTotal, bufferSize);

    // ConfigureAwait(false): library code has no need to resume on the caller's
    // synchronization context after each read.
    while ((read = await stream.ReadAsync(buff, 0, toRead, cancellationToken).ConfigureAwait(false)) > 0)
    {
        detector.Feed(buff, 0, read);

        if (maxBytes != null)
        {
            readTotal += read;
            if (readTotal >= maxBytes)
            {
                // Byte budget exhausted.
                return;
            }

            // Shrink the next request so the total never exceeds maxBytes.
            toRead = CalcToRead(maxBytes, readTotal, bufferSize);
        }

        if (detector._done)
        {
            // The detector needs no further input.
            return;
        }
    }
}

private static int CalcToRead(long? maxBytes, long readTotal, int bufferSize)
{
if (readTotal + bufferSize > maxBytes)
Expand Down Expand Up @@ -287,7 +361,42 @@ public static DetectionResult DetectFromFile(FileInfo file)
}
}

#endif // !NETSTANDARD1_0
/// <summary>
/// Asynchronously detects the character encoding of this file.
/// </summary>
/// <param name="filePath">Path to file</param>
/// <param name="cancellationToken">The cancellation token for this operation.</param>
/// <returns>A task that produces the detection result.</returns>
/// <exception cref="ArgumentNullException"><paramref name="filePath"/> is <c>null</c>.</exception>
public static async Task<DetectionResult> DetectFromFileAsync(string filePath, CancellationToken cancellationToken = default)
{
    if (filePath == null)
    {
        throw new ArgumentNullException(nameof(filePath));
    }

    // useAsync: true opens the handle for asynchronous I/O so ReadAsync does not fall back to
    // synchronous reads on a worker thread; 4096 is FileStream's default buffer size.
    using (FileStream fs = new FileStream(filePath, FileMode.Open, FileAccess.Read, FileShare.ReadWrite, bufferSize: 4096, useAsync: true))
    {
        // Awaiting inside the using block keeps the stream open until detection has finished.
        return await DetectFromStreamAsync(fs, cancellationToken).ConfigureAwait(false);
    }
}
/// <summary>
/// Asynchronously detects the character encoding of this file.
/// </summary>
/// <param name="file">The file</param>
/// <param name="cancellationToken">The cancellation token for this operation.</param>
/// <returns>A task that produces the detection result.</returns>
/// <exception cref="ArgumentNullException"><paramref name="file"/> is <c>null</c> (thrown synchronously).</exception>
public static Task<DetectionResult> DetectFromFileAsync(FileInfo file, CancellationToken cancellationToken = default)
{
    if (file == null)
    {
        throw new ArgumentNullException(nameof(file));
    }

    // Do not open the stream here and return the un-awaited task from inside a using block:
    // the stream would be disposed while the read is still pending. The path-based overload
    // awaits inside its own using block, so the file stays open until detection completes.
    return DetectFromFileAsync(file.FullName, cancellationToken);
}

protected virtual void Feed(byte[] buf, int offset, int len)
{
Expand Down Expand Up @@ -403,10 +512,10 @@ private static string FindCharSetByBom(byte[] buf, int offset, int len)

if (buf0 == 0xEF && buf1 == 0xBB && buf[offset + 2] == 0xBF)
return CodepageName.UTF8;

if (len < 4)
return null;

// The length check above is done here because the checks that follow read more than 3 bytes.
if (buf0 == 0x00 && buf1 == 0x00)
{
Expand All @@ -422,24 +531,24 @@ private static string FindCharSetByBom(byte[] buf, int offset, int len)
if (buf0 == 0x2B && buf1 == 0x2F && buf[offset + 2] == 0x76)
if (buf[offset + 3] == 0x38 || buf[offset + 3] == 0x39 || buf[offset + 3] == 0x2B || buf[offset + 3] == 0x2F)
return CodepageName.UTF7;

// Detect GB18030 with bom (see table in https://en.wikipedia.org/wiki/Byte_order_mark)
// TODO: If you remove this check, GB18030Prober will still be defined as GB18030 -- It's feature or bug?
if (buf0 == 0x84 && buf1 == 0x31 && buf[offset + 2] == 0x95 && buf[offset + 3] == 0x33)
return CodepageName.GB18030;

return null;
}

/// <summary>
/// Notify detector that no further data is available.
/// Notify detector that no further data is available.
/// </summary>
private DetectionResult DataEnd()
{
if (!_gotData)
{
// we haven't got any data yet, return immediately
// caller program sometimes call DataEnd before anything has
// we haven't got any data yet, return immediately
// caller program sometimes call DataEnd before anything has
// been sent to detector
return new DetectionResult();
}
Expand Down Expand Up @@ -478,7 +587,7 @@ private DetectionResult DataEnd()
return new DetectionResult();
}

internal IList<CharsetProber> GetNewProbers()
private IList<CharsetProber> GetNewProbers()
{
switch (InputState)
{
Expand All @@ -499,5 +608,4 @@ internal IList<CharsetProber> GetNewProbers()
}
}
}
}

}
4 changes: 2 additions & 2 deletions src/DetectionDetail.cs
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ public DetectionDetail(CharsetProber prober, TimeSpan? time = null)
public string EncodingName { get; }

/// <summary>
/// The detected encoding.
/// The detected encoding.
/// </summary>
public Encoding Encoding { get; set; }

Expand Down Expand Up @@ -101,7 +101,7 @@ internal static Encoding GetEncoding(string encodingShortName)
(exception is ArgumentException || // unsupported name
exception is NotSupportedException)
{
#if NETSTANDARD && !NETSTANDARD1_0 || NETCOREAPP3_0
#if NETSTANDARD || NET6_0
return CodePagesEncodingProvider.Instance.GetEncoding(encodingName);
#else
return null;
Expand Down
9 changes: 6 additions & 3 deletions src/UTF-unknown.csproj
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<TargetFrameworks>net40;netstandard1.0;netstandard1.3;netstandard2.0;netcoreapp3.0</TargetFrameworks>
<TargetFrameworks>net462;net472;netstandard2.0;net6.0</TargetFrameworks>
adimosh marked this conversation as resolved.
Show resolved Hide resolved
</PropertyGroup>

<PropertyGroup>
Expand All @@ -16,9 +16,12 @@
<OutputType>Library</OutputType>
</PropertyGroup>

<ItemGroup Condition=" '$(TargetFramework)' == 'netstandard1.3' Or '$(TargetFramework)' == 'netstandard2.0' ">
<ItemGroup Condition=" '$(TargetFramework)' == 'netstandard2.0' ">
<PackageReference Include="System.Text.Encoding.CodePages" Version="4.7.0" />
</ItemGroup>
<ItemGroup Condition=" '$(TargetFramework)' == 'net6.0' ">
<PackageReference Include="System.Text.Encoding.CodePages" Version="6.0.0" />
</ItemGroup>
<ItemGroup>
<PackageReference Include="Microsoft.SourceLink.GitHub" Version="1.1.1" PrivateAssets="All" />
</ItemGroup>
Expand All @@ -42,7 +45,7 @@ Features:
- XML documentation included

Compared to Ude:

- Refactor of API, namespaces and deadcode removal
- Added some docs
- Improve error handling
Expand Down
Loading