Skip to content
This repository was archived by the owner on Nov 19, 2019. It is now read-only.

You guys should consider using this branch #1

Open
wants to merge 13 commits into
base: mozilla-hg-148372
Choose a base branch
from
61 changes: 58 additions & 3 deletions DetectorTest.m
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
#import <Foundation/Foundation.h>
#import <AppKit/AppKit.h>

#import <UniversalDetector/UniversalDetector.h>

int main(int argc,char **argv)
{
NSAutoreleasePool *pool = [[NSAutoreleasePool alloc] init];

[[NSUserDefaults standardUserDefaults] setBool:YES
forKey:UniversalDetectorUseMacRomanHeuristic];

NSError *error = nil;

for (int i = 1; i < argc; i++)
Expand All @@ -20,6 +24,11 @@ int main(int argc,char **argv)
options:0
error:&error];

if (data == nil) {
NSLog(@"%@", error);
continue;
}

NSString *str = nil;

if (data == nil) {
Expand All @@ -38,19 +47,65 @@ int main(int argc,char **argv)
[detector analyzeData:data];
NSString *MIMECharsetName = [detector MIMECharset];
NSStringEncoding encoding = [detector encoding];
NSStringEncoding appKitEncoding = 0;

//if (encoding == NSWindowsCP1252StringEncoding || encoding == NSShiftJISStringEncoding)
{
NSDictionary *documentAttributes = nil;

// UniversalDetector does not differentiate between Windows Latin 1 and Mac Roman
// while AppKit has an apparent Mac Roman bias.
NSAttributedString *text = [[NSAttributedString alloc] initWithData:data
options:nil
documentAttributes:&documentAttributes
error:&error];

if (text == nil) {
NSLog(@"%@", error);
continue;
}
else {
[text release];

NSNumber *encodingNumber = documentAttributes[NSCharacterEncodingDocumentAttribute];
appKitEncoding = [encodingNumber intValue];
}
}

NSString *appKitResultString = nil;
if (appKitEncoding != 0) {
if (appKitEncoding != encoding) {
appKitResultString = [NSString stringWithFormat:@"\"%@\"",
[NSString localizedNameOfStringEncoding:appKitEncoding]
];
}
else {
appKitResultString = @"(same result)";
}
}

str = [NSString stringWithFormat:@"%@\n\t\"%@\" (%@) confidence: %.1f%%",
str = [NSString stringWithFormat:
@"%@\n"
"\t" "\"%@\" (%@)\n"
"\t" "confidence:% 6.1f%%"
@"\n"
"\t" "AppKit: %@",
fileName,
(encoding != 0) ? [NSString localizedNameOfStringEncoding:encoding] : @"UNKNOWN",
(MIMECharsetName != nil) ? MIMECharsetName : @"UNKNOWN",
([detector confidence] * 100.0f)
([detector confidence] * 100.0f),
(appKitResultString != nil) ? appKitResultString : @"UNDEFINED"
];



printf("%s\n\n", [str UTF8String]);


[detector release];
}

[pool release];
return 0;

return EXIT_SUCCESS;
}
4 changes: 2 additions & 2 deletions Info.plist
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple Computer//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>CFBundleDevelopmentRegion</key>
Expand All @@ -9,7 +9,7 @@
<key>CFBundleIconFile</key>
<string></string>
<key>CFBundleIdentifier</key>
<string>org.mozilla.universalchardet</string>
<string>$(PRODUCT_BUNDLE_IDENTIFIER)</string>
<key>CFBundleInfoDictionaryVersion</key>
<string>6.0</string>
<key>CFBundleName</key>
Expand Down
5 changes: 5 additions & 0 deletions UniversalDetector.h
Original file line number Diff line number Diff line change
@@ -1,10 +1,15 @@
#import <Foundation/Foundation.h>

// You can enable this heuristic by setting the BOOL with that key in NSUserDefaults -standardUserDefaults to YES.
// In this case, only -encoding will be valid and -MIMECharset will be invalid.
extern NSString * const UniversalDetectorUseMacRomanHeuristic;

@interface UniversalDetector:NSObject
{
void *detectorPtr;
NSString *charsetName;
float confidence;
BOOL possiblyMacRoman;
}

-(void)analyzeContentsOfFile:(NSString *)path;
Expand Down
49 changes: 48 additions & 1 deletion UniversalDetector.m
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
#import "UniversalDetector.h"
#import "WrappedUniversalDetector.h"


// NSUserDefaults key for a BOOL value. -analyzeBytes:length: reads it from
// +[NSUserDefaults standardUserDefaults] to decide whether the Mac Roman
// heuristic is run on the analyzed data.
NSString * const UniversalDetectorUseMacRomanHeuristic = @"UniversalDetectorUseMacRomanHeuristic";


@implementation UniversalDetector

-(id)init
Expand Down Expand Up @@ -41,6 +45,39 @@ -(void)analyzeData:(NSData *)data
// Feeds a chunk of bytes to the wrapped detector and, when the
// UniversalDetectorUseMacRomanHeuristic user default is YES, records in
// possiblyMacRoman whether the chunk could be Mac Roman text.
//
// The heuristic looks at the first carriage return found within the first
// 4096 bytes: the data is considered "possibly Mac Roman" when that CR is not
// immediately followed by a line feed (including a CR that is the last byte).
// If the heuristic is disabled, the chunk is empty, or no CR is found, the
// flag is set to NO.
//
// data: the bytes to analyze.
// len:  the number of bytes available at data.
-(void)analyzeBytes:(const char *)data length:(int)len
{
	UniversalDetectorHandleData(detectorPtr, data, len);

	BOOL useMacRomanHeuristic = [[NSUserDefaults standardUserDefaults] boolForKey:UniversalDetectorUseMacRomanHeuristic];
	BOOL couldBeMacRoman = NO;

	// Guard against empty/negative lengths before doing any size_t arithmetic:
	// a negative int converted to size_t would become a huge scan length.
	if (useMacRomanHeuristic && data != NULL && len > 0) {
		const size_t searchWindowSize = 4096;
		const size_t byteCount = (size_t)len;
		const size_t scanLength = (byteCount < searchWindowSize) ? byteCount : searchWindowSize;

		// Only the first carriage return inside the search window is examined.
		const char *crPtr = memchr(data, '\r', scanLength);
		if (crPtr != NULL) {
			const size_t followingIndex = (size_t)(crPtr - data) + 1;

			// A CR at the very end of the data, or one not followed by LF,
			// keeps Mac Roman in the running; CR+LF rules it out.
			couldBeMacRoman = (followingIndex >= byteCount) || (data[followingIndex] != '\n');
		}
	}

	possiblyMacRoman = couldBeMacRoman;

	// Drop the stored charset name after new data has been handled.
	// NOTE(review): presumably a cached value recomputed on demand — confirm
	// against -MIMECharset.
	[charsetName release];
	charsetName=nil;
}
Expand Down Expand Up @@ -77,8 +114,18 @@ -(NSStringEncoding)encoding
// UniversalDetector detects CP949 but returns "EUC-KR" because CP949 lacks an IANA name.
// Kludge to make strings decode properly anyway.
if(cfenc==kCFStringEncodingEUC_KR) cfenc=kCFStringEncodingDOSKorean;
// Something similar happens with "Shift_JIS".
if(cfenc==kCFStringEncodingShiftJIS) cfenc=kCFStringEncodingDOSJapanese;

return CFStringConvertEncodingToNSStringEncoding(cfenc);
NSStringEncoding encoding = CFStringConvertEncodingToNSStringEncoding(cfenc);

if (possiblyMacRoman &&
(encoding == NSWindowsCP1252StringEncoding ||
encoding == NSShiftJISStringEncoding)) {
encoding = NSMacOSRomanStringEncoding;
}

return encoding;
}

-(float)confidence
Expand Down
27 changes: 22 additions & 5 deletions UniversalDetector.xcodeproj/project.pbxproj
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
3D4F0C3D11F42E3C00603CE3 /* WrappedUniversalDetector.h in Headers */ = {isa = PBXBuildFile; fileRef = 3D4F0C3B11F42E3C00603CE3 /* WrappedUniversalDetector.h */; };
3D724E301174182900CD3CBD /* Foundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 3D724E2F1174182900CD3CBD /* Foundation.framework */; };
3D724E4E11741AEE00CD3CBD /* Foundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 3D724E2F1174182900CD3CBD /* Foundation.framework */; };
3D81E2B318C5F92C00834BCA /* AppKit.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 3D81E2B218C5F92C00834BCA /* AppKit.framework */; };
3D8275CD16B943C100061FD9 /* nsMemory.h in Headers */ = {isa = PBXBuildFile; fileRef = 3D8275CB16B943C100061FD9 /* nsMemory.h */; };
3D8275D616B946F100061FD9 /* nsDebug.h in Headers */ = {isa = PBXBuildFile; fileRef = 3D8275D516B946F100061FD9 /* nsDebug.h */; };
/* End PBXBuildFile section */
Expand Down Expand Up @@ -122,6 +123,7 @@
3D4F0C3A11F42E3C00603CE3 /* WrappedUniversalDetector.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = WrappedUniversalDetector.cpp; sourceTree = "<group>"; };
3D4F0C3B11F42E3C00603CE3 /* WrappedUniversalDetector.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = WrappedUniversalDetector.h; sourceTree = "<group>"; };
3D724E2F1174182900CD3CBD /* Foundation.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Foundation.framework; path = System/Library/Frameworks/Foundation.framework; sourceTree = SDKROOT; };
3D81E2B218C5F92C00834BCA /* AppKit.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = AppKit.framework; path = System/Library/Frameworks/AppKit.framework; sourceTree = SDKROOT; };
3D8275CB16B943C100061FD9 /* nsMemory.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = nsMemory.h; sourceTree = "<group>"; };
3D8275D516B946F100061FD9 /* nsDebug.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = nsDebug.h; sourceTree = "<group>"; };
8DC2EF5A0486A6940098B216 /* Info.plist */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.plist; path = Info.plist; sourceTree = "<group>"; };
Expand All @@ -133,6 +135,7 @@
isa = PBXFrameworksBuildPhase;
buildActionMask = 2147483647;
files = (
3D81E2B318C5F92C00834BCA /* AppKit.framework in Frameworks */,
3D724E4E11741AEE00CD3CBD /* Foundation.framework in Frameworks */,
1B0DDCB30A2D0B2E0009B697 /* UniversalDetector.framework in Frameworks */,
);
Expand Down Expand Up @@ -161,6 +164,7 @@
0867D691FE84028FC02AAC07 /* UniversalCharDet */ = {
isa = PBXGroup;
children = (
3D81E2B218C5F92C00834BCA /* AppKit.framework */,
08FB77AEFE84172EC02AAC07 /* Classes */,
3D8275C716B943C100061FD9 /* mozilla-release */,
1B0DDB110A2CB1B20009B697 /* universalchardet */,
Expand Down Expand Up @@ -417,7 +421,7 @@
0867D690FE84028FC02AAC07 /* Project object */ = {
isa = PBXProject;
attributes = {
LastUpgradeCheck = 0500;
LastUpgradeCheck = 0720;
};
buildConfigurationList = 1DEB91B108733DA50010E9CD /* Build configuration list for PBXProject "UniversalDetector" */;
compatibilityVersion = "Xcode 3.2";
Expand Down Expand Up @@ -558,8 +562,10 @@
GCC_PRECOMPILE_PREFIX_HEADER = YES;
GCC_PREFIX_HEADER = UniversalDetector_Prefix.pch;
INFOPLIST_FILE = Info.plist;
INSTALL_PATH = "@executable_path/../Frameworks";
INSTALL_PATH = "@rpath";
PRODUCT_BUNDLE_IDENTIFIER = org.mozilla.universalchardet;
PRODUCT_NAME = UniversalDetector;
SKIP_INSTALL = YES;
WRAPPER_EXTENSION = framework;
};
name = Debug;
Expand All @@ -574,8 +580,10 @@
GCC_PRECOMPILE_PREFIX_HEADER = YES;
GCC_PREFIX_HEADER = UniversalDetector_Prefix.pch;
INFOPLIST_FILE = Info.plist;
INSTALL_PATH = "@executable_path/../Frameworks";
INSTALL_PATH = "@rpath";
PRODUCT_BUNDLE_IDENTIFIER = org.mozilla.universalchardet;
PRODUCT_NAME = UniversalDetector;
SKIP_INSTALL = YES;
WRAPPER_EXTENSION = framework;
};
name = Release;
Expand All @@ -588,18 +596,23 @@
CLANG_WARN_EMPTY_BODY = YES;
CLANG_WARN_ENUM_CONVERSION = YES;
CLANG_WARN_INT_CONVERSION = YES;
CLANG_WARN_UNREACHABLE_CODE = YES;
CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
COMBINE_HIDPI_IMAGES = YES;
DEBUG_INFORMATION_FORMAT = dwarf;
ENABLE_STRICT_OBJC_MSGSEND = YES;
ENABLE_TESTABILITY = YES;
GCC_C_LANGUAGE_STANDARD = c99;
GCC_INLINES_ARE_PRIVATE_EXTERN = YES;
GCC_NO_COMMON_BLOCKS = YES;
GCC_VERSION = com.apple.compilers.llvm.clang.1_0;
GCC_WARN_64_TO_32_BIT_CONVERSION = NO;
GCC_WARN_ABOUT_RETURN_TYPE = YES;
GCC_WARN_UNDECLARED_SELECTOR = YES;
GCC_WARN_UNINITIALIZED_AUTOS = YES;
GCC_WARN_UNUSED_FUNCTION = YES;
GCC_WARN_UNUSED_VARIABLE = YES;
MACOSX_DEPLOYMENT_TARGET = 10.6;
MACOSX_DEPLOYMENT_TARGET = 10.9;
ONLY_ACTIVE_ARCH = YES;
SKIP_INSTALL = YES;
};
Expand All @@ -613,18 +626,22 @@
CLANG_WARN_EMPTY_BODY = YES;
CLANG_WARN_ENUM_CONVERSION = YES;
CLANG_WARN_INT_CONVERSION = YES;
CLANG_WARN_UNREACHABLE_CODE = YES;
CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
COMBINE_HIDPI_IMAGES = YES;
DEBUG_INFORMATION_FORMAT = dwarf;
ENABLE_STRICT_OBJC_MSGSEND = YES;
GCC_C_LANGUAGE_STANDARD = c99;
GCC_INLINES_ARE_PRIVATE_EXTERN = YES;
GCC_NO_COMMON_BLOCKS = YES;
GCC_VERSION = com.apple.compilers.llvm.clang.1_0;
GCC_WARN_64_TO_32_BIT_CONVERSION = NO;
GCC_WARN_ABOUT_RETURN_TYPE = YES;
GCC_WARN_UNDECLARED_SELECTOR = YES;
GCC_WARN_UNINITIALIZED_AUTOS = YES;
GCC_WARN_UNUSED_FUNCTION = YES;
GCC_WARN_UNUSED_VARIABLE = YES;
MACOSX_DEPLOYMENT_TARGET = 10.6;
MACOSX_DEPLOYMENT_TARGET = 10.9;
SKIP_INSTALL = YES;
};
name = Release;
Expand Down