From 4f898ef50c2c767cf45406d9884d62206fa2564d Mon Sep 17 00:00:00 2001 From: Ivandro Jao Date: Sun, 3 Nov 2024 09:36:11 +0000 Subject: [PATCH 1/3] Refactor comma correction logic Consolidate Arabic comma regex replacements into a lazy-initialized dictionary to enhance maintainability and readability. Simplify redundant regex instantiation by using a loop-based approach. Also, optimize the method for removing extra commas before sentence-ending characters by using a character array. Signed-off-by: Ivandro Jao --- src/libse/Forms/FixCommonErrors/FixCommas.cs | 69 +++++++++++--------- 1 file changed, 38 insertions(+), 31 deletions(-) diff --git a/src/libse/Forms/FixCommonErrors/FixCommas.cs b/src/libse/Forms/FixCommonErrors/FixCommas.cs index 53a369cfbf..8bdec8b7dd 100644 --- a/src/libse/Forms/FixCommonErrors/FixCommas.cs +++ b/src/libse/Forms/FixCommonErrors/FixCommas.cs @@ -1,4 +1,5 @@ using System; +using System.Collections.Generic; using Nikse.SubtitleEdit.Core.Interfaces; using System.Text.RegularExpressions; using Nikse.SubtitleEdit.Core.Common; @@ -12,6 +13,15 @@ public static class Language public static string FixCommas { get; set; } = "Fix commas"; } + private static readonly Lazy> LazyCommaRegex = new Lazy>(() => new Dictionary + { + { new Regex(@"([\p{L}\d\s]) *،،([\p{L}\d\s])", RegexOptions.Compiled), "$1،$2" }, + { new Regex(@"([\p{L}\d\s]) *، *، *،([\p{L}\d\s])", RegexOptions.Compiled), "$1...$2" }, + { new Regex(@"([\p{L}\d\s]) *، *، *،$", RegexOptions.Compiled), "$1..." 
}, + { new Regex(@"([\p{L}\d\s]) *،\s+،([\p{L}\d\s])", RegexOptions.Compiled), "$1،$2" }, + { new Regex(@"،(\p{L})", RegexOptions.Compiled), "، $1" }, + }); + public void Fix(Subtitle subtitle, IFixCallbacks callbacks) { var commaDouble = new Regex(@"([\p{L}\d\s]),,([\p{L}\d\s])"); @@ -30,7 +40,7 @@ public void Fix(Subtitle subtitle, IFixCallbacks callbacks) var s = p.Text; var oldText = s; - if (p.Text.IndexOf(',') >= 0) + if (p.Text.Contains(',')) { s = commaDouble.Replace(s, "$1,$2"); s = commaTriple.Replace(s, "$1...$2"); @@ -38,7 +48,7 @@ public void Fix(Subtitle subtitle, IFixCallbacks callbacks) s = commaWhiteSpaceBetween.Replace(s, "$1,$2"); var match = commaFollowedByLetter.Match(s); - if (match.Success && (!(match.Index > 0 && s[match.Index-1] == 'ό' && s.Substring(match.Index).StartsWith(",τι", StringComparison.OrdinalIgnoreCase)) || callbacks.Language != "el")) + if (match.Success && (!(match.Index > 0 && s[match.Index - 1] == 'ό' && s.Substring(match.Index).StartsWith(",τι", StringComparison.OrdinalIgnoreCase)) || callbacks.Language != "el")) { s = commaFollowedByLetter.Replace(s, ", $1"); } @@ -46,18 +56,12 @@ public void Fix(Subtitle subtitle, IFixCallbacks callbacks) s = RemoveCommaBeforeSentenceEndingChar(s, ','); } - if (p.Text.IndexOf('،') >= 0) + if (p.Text.Contains('،')) { - var commaDoubleAr = new Regex(@"([\p{L}\d\s]) *،،([\p{L}\d\s])"); - var commaTripleAr = new Regex(@"([\p{L}\d\s]) *، *، *،([\p{L}\d\s])"); - var commaTripleEndOfLineAr = new Regex(@"([\p{L}\d\s]) *، *، *،$"); - var commaWhiteSpaceBetweenAr = new Regex(@"([\p{L}\d\s]) *،\s+،([\p{L}\d\s])"); - var commaFollowedByLetterAr = new Regex(@"،(\p{L})"); - s = commaDoubleAr.Replace(s, "$1،$2"); - s = commaTripleAr.Replace(s, "$1...$2"); - s = commaTripleEndOfLineAr.Replace(s, "$1..."); - s = commaWhiteSpaceBetweenAr.Replace(s, "$1،$2"); - s = commaFollowedByLetterAr.Replace(s, "، $1"); + foreach (var keyValue in LazyCommaRegex.Value) + { + s = keyValue.Key.Replace(s, keyValue.Value); 
+ } s = RemoveCommaBeforeSentenceEndingChar(s, '،'); } @@ -75,30 +79,33 @@ public void Fix(Subtitle subtitle, IFixCallbacks callbacks) private static string RemoveCommaBeforeSentenceEndingChar(string input, char comma) { - var s = input; - for (var i = s.Length - 1; i >= 0; i--) + // foo,,,,! bar => foo! bar + var len = input.Length; + var chars = input.ToCharArray(); + int pos = 0; + for (int i = 0; i < len; i++) { - var ch = s[i]; - if (i - 1 >= 0 && s[i - 1] == comma && IsSentenceEndingChar(ch)) + var ch = input[i]; + if (ch == comma && IsNextCharSentenceClosingSymbol(i, chars)) { - var k = i; + continue; + } - do - { - i--; - } while (i - 1 >= 0 && s[i - 1] == comma); + chars[pos++] = ch; + } - // remove commas - if (k - i > 0) - { - s = s.Remove(i, k - i); - } + return new string(chars, 0, pos); + + bool IsNextCharSentenceClosingSymbol(int index, char[] characters) + { + if (index + 1 < characters.Length) + { + var nextChar = characters[index + 1]; + return nextChar == '.' || nextChar == '?' || nextChar == '!' || nextChar == '('; } - } - return s; + return false; + } } - - private static bool IsSentenceEndingChar(char ch) => ch == '.' || ch == '!' || ch == '?' || ch == ')' || ch == ']' || ch == '؟'; } } From dedbe0b878e917a42c3d11d6ab8e7346202009e5 Mon Sep 17 00:00:00 2001 From: Ivandro Jao Date: Sun, 3 Nov 2024 09:40:17 +0000 Subject: [PATCH 2/3] Fix comma handling near parentheses and brackets Replace the check for an opening parenthesis after a comma with checks for a closing parenthesis, a closing bracket, and the Arabic question mark. This stops commas from being removed before an opening parenthesis and restores their removal before these closing characters. 
Signed-off-by: Ivandro Jao --- src/libse/Forms/FixCommonErrors/FixCommas.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/libse/Forms/FixCommonErrors/FixCommas.cs b/src/libse/Forms/FixCommonErrors/FixCommas.cs index 8bdec8b7dd..454877aec3 100644 --- a/src/libse/Forms/FixCommonErrors/FixCommas.cs +++ b/src/libse/Forms/FixCommonErrors/FixCommas.cs @@ -101,7 +101,7 @@ bool IsNextCharSentenceClosingSymbol(int index, char[] characters) if (index + 1 < characters.Length) { var nextChar = characters[index + 1]; - return nextChar == '.' || nextChar == '?' || nextChar == '!' || nextChar == '('; + return nextChar == '.' || nextChar == '?' || nextChar == '!' || nextChar == ')' || nextChar == ']' || nextChar == '؟'; } return false; From b91a281e8517beeee4ce0dc83392b93f78f9258a Mon Sep 17 00:00:00 2001 From: Ivandro Jao Date: Sun, 3 Nov 2024 12:10:26 +0000 Subject: [PATCH 3/3] Expose NeutralSentenceEndingChars and optimize method in FixCommas Make NeutralSentenceEndingChars publicly accessible by changing its access modifier to public. Also, refactor the IsNextCharSentenceClosingSymbol method in FixCommas to utilize the NeutralSentenceEndingChars for more concise and efficient logic. Signed-off-by: Ivandro Jao --- src/libse/Common/StringExtensions.cs | 2 +- src/libse/Forms/FixCommonErrors/FixCommas.cs | 9 ++------- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/src/libse/Common/StringExtensions.cs b/src/libse/Common/StringExtensions.cs index d7c8c2cec9..fc563be64e 100644 --- a/src/libse/Common/StringExtensions.cs +++ b/src/libse/Common/StringExtensions.cs @@ -771,7 +771,7 @@ public static bool HasSentenceEnding(this string value) return value.HasSentenceEnding(string.Empty); } - private static readonly HashSet NeutralSentenceEndingChars = new HashSet + public static readonly IReadOnlyCollection NeutralSentenceEndingChars = new HashSet { '.', '!', '?', ']', ')', '…', '♪', '؟', '。', '?' 
}; diff --git a/src/libse/Forms/FixCommonErrors/FixCommas.cs b/src/libse/Forms/FixCommonErrors/FixCommas.cs index 454877aec3..6cc1427e8d 100644 --- a/src/libse/Forms/FixCommonErrors/FixCommas.cs +++ b/src/libse/Forms/FixCommonErrors/FixCommas.cs @@ -1,5 +1,6 @@ using System; using System.Collections.Generic; +using System.Linq; using Nikse.SubtitleEdit.Core.Interfaces; using System.Text.RegularExpressions; using Nikse.SubtitleEdit.Core.Common; @@ -98,13 +99,7 @@ private static string RemoveCommaBeforeSentenceEndingChar(string input, char com bool IsNextCharSentenceClosingSymbol(int index, char[] characters) { - if (index + 1 < characters.Length) - { - var nextChar = characters[index + 1]; - return nextChar == '.' || nextChar == '?' || nextChar == '!' || nextChar == ')' || nextChar == ']' || nextChar == '؟'; - } - - return false; + return index + 1 < characters.Length && StringExtensions.NeutralSentenceEndingChars.Contains(characters[index + 1]); } } }