-
Notifications
You must be signed in to change notification settings - Fork 206
/
Copy pathEncodingDetector.cs
111 lines (93 loc) · 3.75 KB
/
EncodingDetector.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
using System;
using System.IO;
using System.Linq;
using System.Text;
namespace FikaAmazonAPI.Utils
{
/// <summary>
/// Guesses the text encoding of a byte sequence by looking for a byte order mark (BOM),
/// then validating the bytes as UTF-8, and finally falling back to ISO-8859-1.
/// </summary>
public class EncodingDetector
{
    // Encoding.UTF32 is little-endian; the big-endian variant has to be constructed explicitly.
    private static readonly Encoding Utf32BigEndian = new UTF32Encoding(bigEndian: true, byteOrderMark: true);

    // Single-byte fallback for input that carries no BOM and is not valid UTF-8.
    private static readonly Encoding Latin1 = Encoding.GetEncoding("ISO-8859-1");

    /// <summary>
    /// Reads the whole file at <paramref name="filePath"/> and detects its encoding.
    /// </summary>
    /// <param name="filePath">Path of the file to inspect.</param>
    /// <param name="sampleSize">
    /// Currently ignored: the complete file content is always examined. The parameter is kept
    /// so that existing callers continue to compile.
    /// </param>
    /// <returns>The detected encoding (never <c>null</c>).</returns>
    /// <exception cref="ArgumentException">The file is empty.</exception>
    public static Encoding DetectEncoding(string filePath, int sampleSize = 4096)
    {
        byte[] buffer = File.ReadAllBytes(filePath);
        return DetectEncoding(buffer, sampleSize);
    }

    /// <summary>
    /// Detects the encoding of <paramref name="buffer"/>.
    /// </summary>
    /// <param name="buffer">Raw bytes to inspect; must be non-null and non-empty.</param>
    /// <param name="sampleSize">
    /// Currently ignored: the complete buffer is always examined. The parameter is kept
    /// so that existing callers continue to compile.
    /// </param>
    /// <returns>
    /// The encoding announced by a BOM if one is present; otherwise <see cref="Encoding.UTF8"/>
    /// when the bytes are well-formed UTF-8 (this includes pure ASCII input); otherwise ISO-8859-1.
    /// </returns>
    /// <exception cref="ArgumentException"><paramref name="buffer"/> is null or empty.</exception>
    public static Encoding DetectEncoding(byte[] buffer, int sampleSize = 4096)
    {
        if (buffer == null || buffer.Length == 0)
        {
            throw new ArgumentException("Buffer is empty");
        }

        // Step 1: a BOM, if present, is authoritative.
        Encoding bomEncoding = CheckBOM(buffer);
        if (bomEncoding != null)
        {
            return bomEncoding;
        }

        // Step 2: well-formed UTF-8 (pure ASCII also passes this check).
        if (IsUTF8(buffer))
        {
            return Encoding.UTF8;
        }

        // Step 3: fall back to a heuristic over the remaining known encodings.
        return HeuristicDetection(buffer);
    }

    /// <summary>
    /// Returns the encoding indicated by a leading byte order mark, or <c>null</c> if there is none.
    /// </summary>
    private static Encoding CheckBOM(byte[] buffer)
    {
        // The 4-byte UTF-32 marks must be tested before the 2-byte UTF-16 marks:
        // the UTF-32 LE BOM (FF FE 00 00) begins with the UTF-16 LE BOM (FF FE).
        if (buffer.Length >= 4)
        {
            if (buffer[0] == 0xFF && buffer[1] == 0xFE && buffer[2] == 0x00 && buffer[3] == 0x00)
            {
                return Encoding.UTF32; // UTF-32 LE
            }

            if (buffer[0] == 0x00 && buffer[1] == 0x00 && buffer[2] == 0xFE && buffer[3] == 0xFF)
            {
                return Utf32BigEndian; // UTF-32 BE
            }
        }

        if (buffer.Length >= 3 && buffer[0] == 0xEF && buffer[1] == 0xBB && buffer[2] == 0xBF)
        {
            return Encoding.UTF8; // UTF-8 with BOM
        }

        if (buffer.Length >= 2)
        {
            if (buffer[0] == 0xFF && buffer[1] == 0xFE)
            {
                return Encoding.Unicode; // UTF-16 LE
            }

            if (buffer[0] == 0xFE && buffer[1] == 0xFF)
            {
                return Encoding.BigEndianUnicode; // UTF-16 BE
            }
        }

        return null;
    }

    /// <summary>
    /// Returns <c>true</c> when every byte of <paramref name="buffer"/> belongs to a well-formed
    /// UTF-8 sequence. Overlong forms, UTF-16 surrogates, code points above U+10FFFF and a
    /// sequence truncated at the end of the buffer are all rejected.
    /// </summary>
    private static bool IsUTF8(byte[] buffer)
    {
        int i = 0;
        while (i < buffer.Length)
        {
            byte lead = buffer[i];
            if (lead <= 0x7F)
            {
                i++;
                continue;
            }

            int trailing;
            // Allowed range of the first continuation byte; narrowed for certain lead bytes.
            int secondMin = 0x80;
            int secondMax = 0xBF;

            if (lead >= 0xC2 && lead <= 0xDF)
            {
                trailing = 1;
            }
            else if (lead >= 0xE0 && lead <= 0xEF)
            {
                trailing = 2;
                if (lead == 0xE0)
                {
                    secondMin = 0xA0; // excludes overlong 3-byte forms
                }
                else if (lead == 0xED)
                {
                    secondMax = 0x9F; // excludes surrogates U+D800..U+DFFF
                }
            }
            else if (lead >= 0xF0 && lead <= 0xF4)
            {
                trailing = 3;
                if (lead == 0xF0)
                {
                    secondMin = 0x90; // excludes overlong 4-byte forms
                }
                else if (lead == 0xF4)
                {
                    secondMax = 0x8F; // excludes code points above U+10FFFF
                }
            }
            else
            {
                // Stray continuation byte (80..BF) or a byte that never occurs in UTF-8.
                return false;
            }

            if (i + trailing >= buffer.Length)
            {
                return false; // sequence runs past the end of the buffer
            }

            if (buffer[i + 1] < secondMin || buffer[i + 1] > secondMax)
            {
                return false;
            }

            for (int k = 2; k <= trailing; k++)
            {
                if ((buffer[i + k] & 0xC0) != 0x80)
                {
                    return false;
                }
            }

            i += trailing + 1;
        }

        return true;
    }

    /// <summary>
    /// Last-resort guess: ASCII when no byte is above 0x7F, otherwise ISO-8859-1.
    /// </summary>
    private static Encoding HeuristicDetection(byte[] buffer)
    {
        bool hasExtendedBytes = Array.Exists(buffer, b => b > 0x7F);
        if (!hasExtendedBytes)
        {
            return Encoding.ASCII; // ASCII content contains no extended characters
        }

        return Latin1; // Latin-1 as the fallback for "ANSI" content
    }
}
}