-
Notifications
You must be signed in to change notification settings - Fork 9.9k
Modernize datadir #4372
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Modernize datadir #4372
Changes from all commits
c61f8f2
9251830
ddc32a4
1ea0ef2
bb9604f
b43fc23
b04ebd4
391972f
2a296fa
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -29,6 +29,7 @@ | |||||
#include "params.h" | ||||||
#include "stopper.h" | ||||||
#include "tesseractclass.h" | ||||||
#include "tesserrstream.h" // for tesserr | ||||||
#include "tessvars.h" | ||||||
#include "tprintf.h" | ||||||
#ifndef DISABLED_LEGACY_ENGINE | ||||||
|
@@ -43,24 +44,25 @@ namespace tesseract { | |||||
// Read a "config" file containing a set of variable, value pairs. | ||||||
// Searches the standard places: tessdata/configs, tessdata/tessconfigs | ||||||
// and also accepts a relative or absolute path name. | ||||||
void Tesseract::read_config_file(const char *filename, SetParamConstraint constraint) { | ||||||
std::string path = datadir; | ||||||
path += "configs/"; | ||||||
path += filename; | ||||||
FILE *fp; | ||||||
if ((fp = fopen(path.c_str(), "rb")) != nullptr) { | ||||||
fclose(fp); | ||||||
} else { | ||||||
path = datadir; | ||||||
path += "tessconfigs/"; | ||||||
path += filename; | ||||||
if ((fp = fopen(path.c_str(), "rb")) != nullptr) { | ||||||
fclose(fp); | ||||||
} else { | ||||||
path = filename; | ||||||
} | ||||||
} | ||||||
ParamUtils::ReadParamsFile(path.c_str(), constraint, this->params()); | ||||||
void Tesseract::read_config_file(const char *filename, | ||||||
SetParamConstraint constraint) { | ||||||
// Construct potential config file paths | ||||||
std::vector<std::filesystem::path> config_paths = { | ||||||
datadir / "configs" / filename, | ||||||
datadir / "tessconfigs" / filename, | ||||||
std::filesystem::path(filename)}; | ||||||
|
||||||
// Use the first existing file or fallback to the last (filename) | ||||||
auto config_file = std::find_if(config_paths.begin(), config_paths.end(), | ||||||
[](const std::filesystem::path &path) { | ||||||
std::error_code ec; | ||||||
return std::filesystem::exists(path, ec); | ||||||
}); | ||||||
const std::filesystem::path &selected_path = | ||||||
(config_file != config_paths.end()) ? *config_file : config_paths.back(); | ||||||
|
||||||
ParamUtils::ReadParamsFile(selected_path.string().c_str(), constraint, | ||||||
this->params()); | ||||||
} | ||||||
|
||||||
// Returns false if a unicharset file for the specified language was not found | ||||||
|
@@ -81,17 +83,14 @@ bool Tesseract::init_tesseract_lang_data(const std::string &arg0, | |||||
bool set_only_non_debug_params, TessdataManager *mgr) { | ||||||
// Set the language data path prefix | ||||||
lang = !language.empty() ? language : "eng"; | ||||||
language_data_path_prefix = datadir; | ||||||
language_data_path_prefix += lang; | ||||||
language_data_path_prefix += "."; | ||||||
language_data_path_prefix = datadir.string(); | ||||||
std::filesystem::path tessdata_path = datadir / (lang + "." + kTrainedDataSuffix); | ||||||
|
||||||
// Initialize TessdataManager. | ||||||
std::string tessdata_path = language_data_path_prefix + kTrainedDataSuffix; | ||||||
if (!mgr->is_loaded() && !mgr->Init(tessdata_path.c_str())) { | ||||||
tprintf("Error opening data file %s\n", tessdata_path.c_str()); | ||||||
tprintf( | ||||||
if (!mgr->is_loaded() && !mgr->Init(tessdata_path.string().c_str())) { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This change would break the MSVC build: |
||||||
tesserr << "Error opening data file " << tessdata_path.string() << '\n' << | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Regarding the difference in path formatting between using
I believe it’s more user-friendly and intuitive to display the native OS path. Is there a specific reason we should use the escaped version instead? If not, I recommend sticking with the native path for error messages. |
||||||
"Please make sure the TESSDATA_PREFIX environment variable is set" | ||||||
" to your \"tessdata\" directory.\n"); | ||||||
" to your \"tessdata\" directory.\n"; | ||||||
return false; | ||||||
} | ||||||
#ifdef DISABLED_LEGACY_ENGINE | ||||||
|
@@ -184,10 +183,8 @@ bool Tesseract::init_tesseract_lang_data(const std::string &arg0, | |||||
} | ||||||
#ifndef DISABLED_LEGACY_ENGINE | ||||||
else if (!mgr->GetComponent(TESSDATA_UNICHARSET, &fp) || !unicharset.load_from_file(&fp, false)) { | ||||||
tprintf( | ||||||
"Error: Tesseract (legacy) engine requested, but components are " | ||||||
"not present in %s!!\n", | ||||||
tessdata_path.c_str()); | ||||||
tesserr << "Error: Tesseract (legacy) engine requested, but components are " | ||||||
"not present in " << tessdata_path.string() << "!!\n"; | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The same as above: I recommend sticking with the native path for error messages |
||||||
return false; | ||||||
} | ||||||
#endif // ndef DISABLED_LEGACY_ENGINE | ||||||
|
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -11,11 +11,10 @@ | |||||
// limitations under the License. | ||||||
|
||||||
#include "ccutil.h" | ||||||
#include "tesserrstream.h" // for tesserr | ||||||
#include "tprintf.h" // for tprintf | ||||||
|
||||||
#include <cstdlib> | ||||||
#include <cstring> // for std::strrchr | ||||||
#include <filesystem> // for std::filesystem | ||||||
|
||||||
namespace tesseract { | ||||||
|
||||||
|
@@ -33,68 +32,72 @@ CCUtil::CCUtil() | |||||
CCUtil::~CCUtil() = default; | ||||||
|
||||||
/** | ||||||
* @brief CCUtil::main_setup - set location of tessdata and name of image | ||||||
* @brief Finds the path to the tessdata directory. | ||||||
* | ||||||
* @param argv0 - paths to the directory with language files and config files. | ||||||
* An actual value of argv0 is used if not nullptr, otherwise TESSDATA_PREFIX is | ||||||
* used if not nullptr, next try to use compiled in -DTESSDATA_PREFIX. If | ||||||
* previous is not successful - use current directory. | ||||||
* @param basename - name of image | ||||||
* This function determines the location of the tessdata directory based on the | ||||||
* following order of precedence: | ||||||
* 1. If `argv0` is provided, use it. | ||||||
* 2. If `TESSDATA_PREFIX` environment variable is set and the path exists, use | ||||||
* it. | ||||||
* 3. On Windows, check for a "tessdata" directory in the executable's directory | ||||||
* and use it. | ||||||
* 4. If `TESSDATA_PREFIX` is defined at compile time, use it. | ||||||
* 5. Otherwise, use the current working directory. | ||||||
* | ||||||
* @param argv0 argument to be considered as the data directory path. | ||||||
* @return The path to the tessdata directory or current directory. | ||||||
*/ | ||||||
void CCUtil::main_setup(const std::string &argv0, const std::string &basename) { | ||||||
imagebasename = basename; /**< name of image */ | ||||||
|
||||||
const char *tessdata_prefix = getenv("TESSDATA_PREFIX"); | ||||||
|
||||||
// Ignore TESSDATA_PREFIX if there is no matching filesystem entry. | ||||||
if (tessdata_prefix != nullptr && !std::filesystem::exists(tessdata_prefix)) { | ||||||
tprintf("Warning: TESSDATA_PREFIX %s does not exist, ignore it\n", tessdata_prefix); | ||||||
tessdata_prefix = nullptr; | ||||||
} | ||||||
|
||||||
static std::filesystem::path find_data_path(const std::string &argv0) { | ||||||
// If argv0 is set, always use it even if it is not a valid directory | ||||||
if (!argv0.empty()) { | ||||||
/* Use tessdata prefix from the command line. */ | ||||||
datadir = argv0; | ||||||
} else if (tessdata_prefix) { | ||||||
/* Use tessdata prefix from the environment. */ | ||||||
datadir = tessdata_prefix; | ||||||
#if defined(_WIN32) | ||||||
} else if (datadir.empty() || !std::filesystem::exists(datadir)) { | ||||||
/* Look for tessdata in directory of executable. */ | ||||||
char path[_MAX_PATH]; | ||||||
DWORD length = GetModuleFileName(nullptr, path, sizeof(path)); | ||||||
if (length > 0 && length < sizeof(path)) { | ||||||
char *separator = std::strrchr(path, '\\'); | ||||||
if (separator != nullptr) { | ||||||
*separator = '\0'; | ||||||
std::string subdir = path; | ||||||
subdir += "/tessdata"; | ||||||
if (std::filesystem::exists(subdir)) { | ||||||
datadir = subdir; | ||||||
} | ||||||
} | ||||||
std::filesystem::path path(argv0); | ||||||
if (!std::filesystem::is_directory(path)) { | ||||||
tesserr << "Warning (tessdata): '" << argv0 << "' is not a valid directory.\n"; | ||||||
} | ||||||
#endif /* _WIN32 */ | ||||||
return path; | ||||||
} | ||||||
|
||||||
// datadir may still be empty: | ||||||
if (datadir.empty()) { | ||||||
#if defined(TESSDATA_PREFIX) | ||||||
// Use tessdata prefix which was compiled in. | ||||||
datadir = TESSDATA_PREFIX "/tessdata/"; | ||||||
// Note that some software (for example conda) patches TESSDATA_PREFIX | ||||||
// in the binary, so it might be shorter. Recalculate its length. | ||||||
datadir.resize(std::strlen(datadir.c_str())); | ||||||
#else | ||||||
datadir = "./"; | ||||||
#endif /* TESSDATA_PREFIX */ | ||||||
// Check environment variable if argv0 is not specified | ||||||
if (const char *tessdata_prefix = std::getenv("TESSDATA_PREFIX")) { | ||||||
std::filesystem::path path(tessdata_prefix); | ||||||
if (std::filesystem::exists(path)) { | ||||||
return path; | ||||||
} else { | ||||||
tprintf("Warning: TESSDATA_PREFIX %s does not exist, ignoring.\n", | ||||||
tessdata_prefix); | ||||||
} | ||||||
} | ||||||
|
||||||
// check for missing directory separator | ||||||
const char lastchar = datadir.back(); | ||||||
if (lastchar != '/' && lastchar != '\\') { | ||||||
datadir += '/'; | ||||||
#ifdef _WIN32 | ||||||
// Windows-specific: look for a 'tessdata' directory in the executable's | ||||||
// directory | ||||||
wchar_t path[MAX_PATH]; | ||||||
if (DWORD length = GetModuleFileNameW(nullptr, path, MAX_PATH); | ||||||
length > 0 && length < MAX_PATH) { | ||||||
std::filesystem::path exe_path(path); | ||||||
auto tessdata_subdir = exe_path.parent_path() / "tessdata"; | ||||||
if (std::filesystem::exists(tessdata_subdir)) { | ||||||
return tessdata_subdir; | ||||||
} | ||||||
} | ||||||
#endif | ||||||
|
||||||
// Fallback to compile-time or current directory | ||||||
#ifdef TESSDATA_PREFIX | ||||||
return std::filesystem::path(TESSDATA_PREFIX) / "tessdata"; | ||||||
#else | ||||||
return std::filesystem::current_path(); | ||||||
#endif | ||||||
} | ||||||
|
||||||
|
||||||
/** | ||||||
* @brief CCUtil::main_setup - set location of tessdata and name of image | ||||||
* | ||||||
* @param argv0 - path to the directory with language files and config files. | ||||||
* @param basename - name of image | ||||||
*/ | ||||||
void CCUtil::main_setup(const std::string &argv0, const std::string &basename) { | ||||||
imagebasename = basename; /**< name of image */ | ||||||
datadir = find_data_path(argv0); | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The old There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Previously, |
||||||
} | ||||||
} // namespace tesseract |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
language_data_path_prefix
is a class variable which is no longer set in the new code. This causes the CI failures. There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks. Fixed 2a296fa