Skip to content

Commit 072c116

Browse files
committed
encoding detector
1 parent cd438cf commit 072c116

File tree

4 files changed

+97
-29
lines changed

4 files changed

+97
-29
lines changed

EncodingConverter/Converter.cs

+89-24
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
using System.IO;
33
using System.Linq;
44
using System.Text;
5-
using System.Threading.Tasks;
65

76
namespace EncodingConverter;
87

@@ -19,7 +18,7 @@ public static async Task ConvertEncodingAsync(IEnumerable<FileInfo> files, Encod
1918

2019
string fullText;
2120

22-
using (StreamReader sr = new(file.FullName, await getEncodingAsync(file.FullName))) {
21+
using (StreamReader sr = new(file.FullName, detectTextEncoding(file.FullName, out _, 0))) {
2322
fullText = await sr.ReadToEndAsync();
2423
}
2524

@@ -31,29 +30,95 @@ public static async Task ConvertEncodingAsync(IEnumerable<FileInfo> files, Encod
3130
}
3231
}
3332

34-
/// <summary>
35-
/// Determines a text file's encoding by analyzing its byte order mark (BOM).
36-
/// Defaults to ASCII when detection of the text file's endianness fails.
37-
/// </summary>
38-
/// <param name="filename">The text file to analyze.</param>
39-
/// <returns>The detected encoding.</returns>
40-
private static async Task<Encoding> getEncodingAsync(string filename) {
41-
// Read the BOM
42-
var bom = new byte[4];
43-
44-
using (FileStream file = new(filename, FileMode.Open, FileAccess.Read)) {
45-
await file.ReadAsync(bom, 0, 4);
33+
// From https://stackoverflow.com/questions/1025332/determine-a-strings-encoding-in-c-sharp
34+
// Function to detect the encoding for UTF-7, UTF-8/16/32 (bom, no bom, little
35+
// & big endian), and local default codepage, and potentially other codepages.
36+
// 'taster' = number of bytes to check of the file (to save processing). Higher
37+
// value is slower, but more reliable (especially UTF-8 with special characters
38+
// later on may appear to be ASCII initially). If taster = 0, then taster
39+
// becomes the length of the file (for maximum reliability). 'text' is simply
40+
// the string with the discovered encoding applied to the file.
41+
private static Encoding detectTextEncoding(string filename, out string text, int taster = 1000) {
42+
var b = File.ReadAllBytes(filename);
43+
44+
//////////////// First check the low hanging fruit by checking if a
45+
//////////////// BOM/signature exists (sourced from http://www.unicode.org/faq/utf_bom.html#bom4)
46+
if (b.Length >= 4 && b[0] == 0x00 && b[1] == 0x00 && b[2] == 0xFE && b[3] == 0xFF) { text = Encoding.GetEncoding("utf-32BE").GetString(b, 4, b.Length - 4); return Encoding.GetEncoding("utf-32BE"); } // UTF-32, big-endian
47+
else if (b.Length >= 4 && b[0] == 0xFF && b[1] == 0xFE && b[2] == 0x00 && b[3] == 0x00) { text = Encoding.UTF32.GetString(b, 4, b.Length - 4); return Encoding.UTF32; } // UTF-32, little-endian
48+
else if (b.Length >= 2 && b[0] == 0xFE && b[1] == 0xFF) { text = Encoding.BigEndianUnicode.GetString(b, 2, b.Length - 2); return Encoding.BigEndianUnicode; } // UTF-16, big-endian
49+
else if (b.Length >= 2 && b[0] == 0xFF && b[1] == 0xFE) { text = Encoding.Unicode.GetString(b, 2, b.Length - 2); return Encoding.Unicode; } // UTF-16, little-endian
50+
else if (b.Length >= 3 && b[0] == 0xEF && b[1] == 0xBB && b[2] == 0xBF) { text = Encoding.UTF8.GetString(b, 3, b.Length - 3); return Encoding.UTF8; } // UTF-8
51+
else if (b.Length >= 3 && b[0] == 0x2b && b[1] == 0x2f && b[2] == 0x76) { text = Encoding.UTF7.GetString(b, 3, b.Length - 3); return Encoding.UTF7; } // UTF-7
52+
53+
54+
//////////// If the code reaches here, no BOM/signature was found, so now
55+
//////////// we need to 'taste' the file to see if we can manually discover
56+
//////////// the encoding. A high taster value is desired for UTF-8
57+
if (taster == 0 || taster > b.Length) taster = b.Length; // Taster size can't be bigger than the filesize obviously.
58+
59+
60+
// Some text files are encoded in UTF8, but have no BOM/signature. Hence
61+
// the below manually checks for a UTF8 pattern. This code is based off
62+
// the top answer at: https://stackoverflow.com/questions/6555015/check-for-invalid-utf8
63+
// For our purposes, an unnecessarily strict (and terser/slower)
64+
// implementation is shown at: https://stackoverflow.com/questions/1031645/how-to-detect-utf-8-in-plain-c
65+
// For the below, false positives should be exceedingly rare (and would
66+
// be either slightly malformed UTF-8 (which would suit our purposes
67+
// anyway) or 8-bit extended ASCII/UTF-16/32 at a vanishingly long shot).
68+
int i = 0;
69+
bool utf8 = false;
70+
while (i < taster - 4) {
71+
if (b[i] <= 0x7F) { i += 1; continue; } // If all characters are below 0x80, then it is valid UTF8, but UTF8 is not 'required' (and therefore the text is more desirable to be treated as the default codepage of the computer). Hence, there's no "utf8 = true;" code unlike the next three checks.
72+
if (b[i] >= 0xC2 && b[i] < 0xE0 && b[i + 1] >= 0x80 && b[i + 1] < 0xC0) { i += 2; utf8 = true; continue; }
73+
if (b[i] >= 0xE0 && b[i] < 0xF0 && b[i + 1] >= 0x80 && b[i + 1] < 0xC0 && b[i + 2] >= 0x80 && b[i + 2] < 0xC0) { i += 3; utf8 = true; continue; }
74+
if (b[i] >= 0xF0 && b[i] < 0xF5 && b[i + 1] >= 0x80 && b[i + 1] < 0xC0 && b[i + 2] >= 0x80 && b[i + 2] < 0xC0 && b[i + 3] >= 0x80 && b[i + 3] < 0xC0) { i += 4; utf8 = true; continue; }
75+
utf8 = false; break;
76+
}
77+
if (utf8) {
78+
text = Encoding.UTF8.GetString(b);
79+
return Encoding.UTF8;
4680
}
4781

48-
// Analyze the BOM
49-
return bom[0] switch {
50-
0x2b when bom[1] == 0x2f && bom[2] == 0x76 => Encoding.UTF7,
51-
0xef when bom[1] == 0xbb && bom[2] == 0xbf => Encoding.UTF8,
52-
0xff when bom[1] == 0xfe && bom[2] == 0 && bom[3] == 0 => Encoding.UTF32, // UTF-32LE
53-
0xff when bom[1] == 0xfe => Encoding.Unicode, // UTF-16LE
54-
0xfe when bom[1] == 0xff => Encoding.BigEndianUnicode, // UTF-16BE
55-
0 when bom[1] == 0 && bom[2] == 0xfe && bom[3] == 0xff => new UTF32Encoding(true, true), // UTF-32BE
56-
_ => Encoding.Default, // We actually have no idea what the encoding is if we reach this point, so
57-
}; // you may wish to return null instead of defaulting to ASCII
82+
83+
// The next check is a heuristic attempt to detect UTF-16 without a BOM.
84+
// We simply look for zeroes in odd or even byte places, and if a certain
85+
// threshold is reached, the code is 'probably' UTF-16.
86+
double threshold = 0.1; // proportion of chars step 2 which must be zeroed to be diagnosed as utf-16. 0.1 = 10%
87+
int count = 0;
88+
for (int n = 0; n < taster; n += 2) if (b[n] == 0) count++;
89+
if (((double)count) / taster > threshold) { text = Encoding.BigEndianUnicode.GetString(b); return Encoding.BigEndianUnicode; }
90+
count = 0;
91+
for (int n = 1; n < taster; n += 2) if (b[n] == 0) count++;
92+
if (((double)count) / taster > threshold) { text = Encoding.Unicode.GetString(b); return Encoding.Unicode; } // (little-endian)
93+
94+
95+
// Finally, a long shot - let's see if we can find "charset=xyz" or
96+
// "encoding=xyz" to identify the encoding:
97+
for (int n = 0; n < taster - 9; n++) {
98+
if (
99+
((b[n + 0] == 'c' || b[n + 0] == 'C') && (b[n + 1] == 'h' || b[n + 1] == 'H') && (b[n + 2] == 'a' || b[n + 2] == 'A') && (b[n + 3] == 'r' || b[n + 3] == 'R') && (b[n + 4] == 's' || b[n + 4] == 'S') && (b[n + 5] == 'e' || b[n + 5] == 'E') && (b[n + 6] == 't' || b[n + 6] == 'T') && (b[n + 7] == '=')) ||
100+
((b[n + 0] == 'e' || b[n + 0] == 'E') && (b[n + 1] == 'n' || b[n + 1] == 'N') && (b[n + 2] == 'c' || b[n + 2] == 'C') && (b[n + 3] == 'o' || b[n + 3] == 'O') && (b[n + 4] == 'd' || b[n + 4] == 'D') && (b[n + 5] == 'i' || b[n + 5] == 'I') && (b[n + 6] == 'n' || b[n + 6] == 'N') && (b[n + 7] == 'g' || b[n + 7] == 'G') && (b[n + 8] == '='))
101+
) {
102+
if (b[n + 0] == 'c' || b[n + 0] == 'C') n += 8; else n += 9;
103+
if (b[n] == '"' || b[n] == '\'') n++;
104+
int oldn = n;
105+
while (n < taster && (b[n] == '_' || b[n] == '-' || (b[n] >= '0' && b[n] <= '9') || (b[n] >= 'a' && b[n] <= 'z') || (b[n] >= 'A' && b[n] <= 'Z'))) { n++; }
106+
byte[] nb = new byte[n - oldn];
107+
Array.Copy(b, oldn, nb, 0, n - oldn);
108+
try {
109+
string internalEnc = Encoding.ASCII.GetString(nb);
110+
text = Encoding.GetEncoding(internalEnc).GetString(b);
111+
return Encoding.GetEncoding(internalEnc);
112+
} catch { break; } // If C# doesn't recognize the name of the encoding, break.
113+
}
114+
}
115+
116+
117+
// If all else fails, the encoding is probably (though certainly not
118+
// definitely) the user's local codepage! One might present to the user a
119+
// list of alternative encodings as shown here: https://stackoverflow.com/questions/8509339/what-is-the-most-common-encoding-of-each-language
120+
// A full list can be found using Encoding.GetEncodings
121+
text = Encoding.Default.GetString(b);
122+
return Encoding.Default;
58123
}
59124
}

EncodingConverter/Dialogs/ChooseEncodingDialog.xaml.cs

+6-3
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,15 @@ namespace EncodingConverter;
77
/// ChooseEncodingDialog.xaml에 대한 상호 작용 논리
88
/// </summary>
99
public partial class ChooseEncodingDialog {
10+
private static readonly string[] encodings = ["System Encoding", "UTF-8 with BOM", "UTF-8 without BOM"];
11+
private static readonly UTF8Encoding utf8withBom = new(encoderShouldEmitUTF8Identifier: true);
12+
private static readonly UTF8Encoding utf8withoutBom = new(encoderShouldEmitUTF8Identifier: false);
1013
//private const int utf8CodePage = 65001;
1114

1215
public ChooseEncodingDialog() {
1316
InitializeComponent();
1417
//encodingsCombo.ItemsSource = Encoding.GetEncodings().Select(ei => $"{ei.DisplayName} ({ei.CodePage})");
15-
encodingsCombo.ItemsSource = new string[] { "System Encoding", "UTF-8 with BOM", "UTF-8 without BOM" };
18+
encodingsCombo.ItemsSource = encodings;
1619
encodingsCombo.SelectedIndex = encodingsCombo.Items.Count - 1;
1720
}
1821

@@ -24,8 +27,8 @@ public Encoding ChosenEncoding {
2427

2528
return encodingsCombo.SelectedIndex switch {
2629
0 => Encoding.Default,
27-
1 => new UTF8Encoding(encoderShouldEmitUTF8Identifier: true),
28-
2 => new UTF8Encoding(encoderShouldEmitUTF8Identifier: false),
30+
1 => utf8withBom,
31+
2 => utf8withoutBom,
2932
_ => throw new InvalidOperationException(),
3033
};
3134
}

EncodingConverter/source.extension.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ internal sealed partial class Vsix
99
{
1010
public const string Id = "EncodingConverter.625f3018-1e5f-4b92-ad9c-08d2534b6021";
1111
public const string Name = "Encoding Converter";
12-
public const string Description = @"An extension that allows you to easily convert the encoding of multiple files.";
12+
public const string Description = @"An extension that allows you to easily convert the encoding of multiple files. (System Encoding / UTF-8 with / without BOM)";
1313
public const string Language = "en-US";
1414
public const string Version = "1.0.1";
1515
public const string Author = "Bluehill";

EncodingConverter/source.extension.vsixmanifest

+1-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
<Metadata>
44
<Identity Id="EncodingConverter.625f3018-1e5f-4b92-ad9c-08d2534b6021" Version="1.0.1" Language="en-US" Publisher="Bluehill" />
55
<DisplayName>Encoding Converter</DisplayName>
6-
<Description xml:space="preserve">An extension that allows you to easily convert the encoding of multiple files.</Description>
6+
<Description xml:space="preserve">An extension that allows you to easily convert the encoding of multiple files. (System Encoding / UTF-8 with / without BOM)</Description>
77
<License>..\LICENSE.txt</License>
88
<Icon>Resources\Icon.png</Icon>
99
<PreviewImage>Resources\Icon.png</PreviewImage>

0 commit comments

Comments
 (0)