summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'tesseract/src/dict/dict.h')
-rw-r--r--tesseract/src/dict/dict.h651
1 files changed, 651 insertions, 0 deletions
diff --git a/tesseract/src/dict/dict.h b/tesseract/src/dict/dict.h
new file mode 100644
index 00000000..e8ec2e37
--- /dev/null
+++ b/tesseract/src/dict/dict.h
@@ -0,0 +1,651 @@
+///////////////////////////////////////////////////////////////////////
+// File: dict.h
+// Description: dict class.
+// Author: Samuel Charron
+//
+// (C) Copyright 2006, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_DICT_DICT_H_
+#define TESSERACT_DICT_DICT_H_
+
+#ifdef HAVE_CONFIG_H
+#include "config_auto.h" // DISABLED_LEGACY_ENGINE
+#endif
+
+#ifndef DISABLED_LEGACY_ENGINE
+#include "ambigs.h"
+#endif
+#include "dawg.h"
+#include "dawg_cache.h"
+#include "ratngs.h"
+#include "stopper.h"
+#include "trie.h"
+#include "unicharset.h"
+#ifndef DISABLED_LEGACY_ENGINE
+#include "params_training_featdef.h"
+#endif // ndef DISABLED_LEGACY_ENGINE
+
+namespace tesseract {
+
+class MATRIX;
+class WERD_RES;
+
+#define CHARS_PER_LINE 500
+#define MAX_WERD_LENGTH (static_cast<int64_t>(128))  // parenthesized: expands as one operand
+#define NO_RATING (-1)  // parenthesized: the minus cannot bind to a neighbouring token
+
+/** Temporary fragment info; Dict's permuter helpers take it as prev_char_frag_info. */
+struct CHAR_FRAGMENT_INFO {
+ UNICHAR_ID unichar_id;
+ const CHAR_FRAGMENT *fragment;  // NOTE(review): ownership is not visible in this header - confirm
+ int num_fragments;  // presumably the number of fragments combined so far - TODO confirm
+ float rating;
+ float certainty;
+};
+
+using DawgVector = GenericVector<Dawg *>;
+
+//
+// Constants (constexpr: compile-time values, one internal copy per translation unit)
+//
+static constexpr int kRatingPad = 4;
+static constexpr int kDictMaxWildcards = 2;  // max wildcards for a word
+// TODO(daria): If hyphens are different in different languages and can be
+// inferred from training data we should load their values dynamically.
+static constexpr char kHyphenSymbol[] = "-";
+static constexpr char kSlashSymbol[] = "/";
+static constexpr char kQuestionSymbol[] = "?";
+static constexpr char kApostropheSymbol[] = "'";
+static constexpr float kSimCertaintyScale = -10.0f;   // similarity matcher scaling
+static constexpr float kSimCertaintyOffset = -10.0f;  // similarity matcher offset
+static constexpr float kSimilarityFloor = 100.0f;     // worst E*L product to stop on
+static constexpr int kDocDictMaxRepChars = 4;
+
+// Consistency of the x-height within a word, from best (0) to worst (2).
+enum XHeightConsistencyEnum {
+  XH_GOOD,          // Everything is good.
+  XH_SUBNORMAL,     // One or two secondary (but consistent) baselines [think
+                    // subscript and superscript], or an oversized first char.
+  XH_INCONSISTENT   // The word is inconsistent.
+};
+
+// Dawg-search state bundle: stores the two position-vector pointers and the
+// permuter it is constructed with; valid_end always starts out false.
+struct DawgArgs {
+  DawgArgs(DawgPositionVector *d, DawgPositionVector *up, PermuterType p)
+      : active_dawgs(d), updated_dawgs(up), permuter(p) {}
+  DawgPositionVector *active_dawgs;
+  DawgPositionVector *updated_dawgs;
+  PermuterType permuter;
+  bool valid_end = false;  // True if the current position is a valid word end.
+};
+
+class TESS_API Dict {
+ public:
+ Dict(CCUtil* image_ptr);
+ ~Dict();
+ /// Returns the CCUtil pointer held in ccutil_, const-qualified for use on a
+ /// const Dict.
+ const CCUtil* getCCUtil() const { return ccutil_; }
+ /// Returns the CCUtil pointer held in ccutil_; the pointee may be modified
+ /// through it.
+ CCUtil* getCCUtil() { return ccutil_; }
+ /// Returns the unicharset member of the CCUtil object as a const reference.
+ const UNICHARSET& getUnicharset() const { return ccutil_->unicharset; }
+ /// Returns the unicharset member of the CCUtil object as a mutable
+ /// reference. As in the const overload, ccutil_ is dereferenced without a
+ /// null check.
+ UNICHARSET& getUnicharset() { return ccutil_->unicharset; }
+#ifndef DISABLED_LEGACY_ENGINE
+ const UnicharAmbigs &getUnicharAmbigs() const {
+ return getCCUtil()->unichar_ambigs;
+ }
+#endif
+ // Returns true if unichar_id is a word compounding character like - or /.
+ // const (it only reads Dict state), consistent with has_hyphen_end().
+ inline bool compound_marker(UNICHAR_ID unichar_id) const {
+   const UNICHARSET& unicharset = getUnicharset();
+   ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));
+   const auto &normed_ids = unicharset.normed_ids(unichar_id);
+   if (normed_ids.size() != 1) return false;  // must normalize to a single id
+   const auto first = normed_ids[0];
+   return first == hyphen_unichar_id_ || first == slash_unichar_id_;
+ }
+ // Returns true if unichar_id is an apostrophe-like character that may
+ // separate prefix/suffix words from a main body word.
+ // const (it only reads Dict state), consistent with has_hyphen_end().
+ inline bool is_apostrophe(UNICHAR_ID unichar_id) const {
+   const UNICHARSET& unicharset = getUnicharset();
+   ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));
+   const auto &normed_ids = unicharset.normed_ids(unichar_id);
+   return normed_ids.size() == 1 && normed_ids[0] == apostrophe_unichar_id_;
+ }
+
+ /* hyphen.cpp ************************************************************/
+
+ /// True if a hyphen base word is stored and last_word_on_line_ is not set.
+ inline bool hyphenated() const {
+   return hyphen_word_ != nullptr && !last_word_on_line_;
+ }
+ /// Length of the hyphen base word (the part on the line before), 0 if none.
+ inline int hyphen_base_size() const {
+   return !hyphenated() ? 0 : hyphen_word_->length();
+ }
+ /// When hyphenated() holds, overwrites *word with the stored base word (the
+ /// part on the line before) and prints it if hyphen_debug_level is non-zero.
+ /// Otherwise *word is left untouched. word must not be nullptr.
+ inline void copy_hyphen_info(WERD_CHOICE *word) const {
+   if (!hyphenated()) return;
+   *word = *hyphen_word_;
+   const bool debug = hyphen_debug_level;
+   if (debug) word->print("copy_hyphen_info: ");
+ }
+ /// Checks for a hyphen at the end of a word: false unless this is the last
+ /// word on the line and not the first position; otherwise true only if
+ /// unichar_id normalizes to exactly the cached hyphen id.
+ inline bool has_hyphen_end(const UNICHARSET* unicharset,
+                            UNICHAR_ID unichar_id, bool first_pos) const {
+   if (first_pos || !last_word_on_line_) return false;
+   ASSERT_HOST(unicharset->contains_unichar_id(unichar_id));
+   const auto &ids = unicharset->normed_ids(unichar_id);
+   return ids.size() == 1 && ids[0] == hyphen_unichar_id_;
+ }
+ /// Overload that tests the last unichar of word; an empty word yields false.
+ inline bool has_hyphen_end(const WERD_CHOICE &word) const {
+   const int last = word.length() - 1;
+   if (last < 0) return false;  // empty word: never index position -1
+   return has_hyphen_end(word.unicharset(), word.unichar_id(last), last == 0);
+ }
+ /// Unless the previous word was the last one on the line, and the current
+ /// one is not (thus it is the first one on the line), erase hyphen_word_,
+ /// clear hyphen_active_dawgs_, update last_word_on_line_.
+ void reset_hyphen_vars(bool last_word_on_line);
+ /// Update hyphen_word_, and copy the given DawgPositionVectors into
+ /// hyphen_active_dawgs_ .
+ void set_hyphen_word(const WERD_CHOICE &word,
+ const DawgPositionVector &active_dawgs);
+
+ /* permdawg.cpp ************************************************************/
+ // Note: Functions in permdawg.cpp are only used by NoDangerousAmbig().
+ // When this function is refactored, permdawg.cpp can be removed.
+
+ /// Replaces *best_choice with word when word has the strictly smaller
+ /// rating; on a tie or a larger rating *best_choice is kept.
+ inline void update_best_choice(const WERD_CHOICE &word,
+                                WERD_CHOICE *best_choice) {
+   const bool improves = word.rating() < best_choice->rating();
+   if (!improves) return;
+   *best_choice = word;
+ }
+ /// Fill the given active_dawgs vector with dawgs that could contain the
+ /// beginning of the word. If hyphenated() returns true, copy the entries
+ /// from hyphen_active_dawgs_ instead.
+ void init_active_dawgs(DawgPositionVector *active_dawgs,
+ bool ambigs_mode) const;
+ // Fill the given vector with the default collection of any-length dawgs
+ void default_dawgs(DawgPositionVector *anylength_dawgs,
+ bool suppress_patterns) const;
+
+
+ /// Recursively explore all the possible character combinations in
+ /// the given char_choices. Use go_deeper_dawg_fxn() to explore all the
+ /// dawgs in the dawgs_ vector in parallel and discard invalid words.
+ ///
+ /// Allocate and return a WERD_CHOICE with the best valid word found.
+ WERD_CHOICE *dawg_permute_and_select(
+ const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit);
+ /// If the choice being composed so far could be a dictionary word
+ /// and we have not reached the end of the word keep exploring the
+ /// char_choices further.
+ void go_deeper_dawg_fxn(
+ const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
+ int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
+ bool word_ending, WERD_CHOICE *word, float certainties[],
+ float *limit, WERD_CHOICE *best_choice, int *attempts_left,
+ void *void_more_args);
+
+ /// Pointer to go_deeper function.
+ void (Dict::*go_deeper_fxn_)(const char *debug,
+ const BLOB_CHOICE_LIST_VECTOR &char_choices,
+ int char_choice_index,
+ const CHAR_FRAGMENT_INFO *prev_char_frag_info,
+ bool word_ending, WERD_CHOICE *word,
+ float certainties[], float *limit,
+ WERD_CHOICE *best_choice, int *attempts_left,
+ void *void_more_args);
+ //
+ // Helper functions for dawg_permute_and_select().
+ //
+ void permute_choices(
+ const char *debug,
+ const BLOB_CHOICE_LIST_VECTOR &char_choices,
+ int char_choice_index,
+ const CHAR_FRAGMENT_INFO *prev_char_frag_info,
+ WERD_CHOICE *word,
+ float certainties[],
+ float *limit,
+ WERD_CHOICE *best_choice,
+ int *attempts_left,
+ void *more_args);
+
+ void append_choices(
+ const char *debug,
+ const BLOB_CHOICE_LIST_VECTOR &char_choices,
+ const BLOB_CHOICE &blob_choice,
+ int char_choice_index,
+ const CHAR_FRAGMENT_INFO *prev_char_frag_info,
+ WERD_CHOICE *word,
+ float certainties[],
+ float *limit,
+ WERD_CHOICE *best_choice,
+ int *attempts_left,
+ void *more_args);
+
+ bool fragment_state_okay(UNICHAR_ID curr_unichar_id,
+ float curr_rating, float curr_certainty,
+ const CHAR_FRAGMENT_INFO *prev_char_frag_info,
+ const char *debug, int word_ending,
+ CHAR_FRAGMENT_INFO *char_frag_info);
+
+ /* stopper.cpp *************************************************************/
+#if !defined(DISABLED_LEGACY_ENGINE)
+ bool NoDangerousAmbig(WERD_CHOICE *BestChoice,
+ DANGERR *fixpt,
+ bool fix_replaceable,
+ MATRIX* ratings);
+#endif // !defined(DISABLED_LEGACY_ENGINE)
+ // Replaces the corresponding wrong ngram in werd_choice with the correct
+ // one. The whole correct n-gram is inserted into the ratings matrix and
+ // the werd_choice: no more fragments!. Rating and certainty of new entries
+ // in matrix and werd_choice are the sum and mean of the wrong ngram
+ // respectively.
+ // E.g. for werd_choice mystring'' and ambiguity ''->": werd_choice becomes
+ // mystring", with a new entry in the ratings matrix for ".
+ void ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size,
+ UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice,
+ MATRIX *ratings);
+
+ /// Returns the length of the shortest alpha run in WordChoice.
+ int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) const;
+ /// Returns true if the certainty of the BestChoice word is within a
+ /// reasonable range of the average certainties for the best choices for
+ /// each character in the segmentation. This test is used to catch words
+ /// in which one character is much worse than the other characters in the
+ /// word (i.e. false will be returned in that case). The algorithm computes
+ /// the mean and std deviation of the certainties in the word with the worst
+ /// certainty thrown out.
+ int UniformCertainties(const WERD_CHOICE& word);
+ /// Returns true if the given best_choice is good enough to stop.
+ bool AcceptableChoice(const WERD_CHOICE& best_choice,
+ XHeightConsistencyEnum xheight_consistency);
+ /// Returns false if the best choice for the current word is questionable
+ /// and should be tried again on the second pass or should be flagged to
+ /// the user.
+ bool AcceptableResult(WERD_RES *word) const;
+#if !defined(DISABLED_LEGACY_ENGINE)
+ void EndDangerousAmbigs();
+#endif // !defined(DISABLED_LEGACY_ENGINE)
+ /// Prints the current choices for this word to stdout.
+ void DebugWordChoices();
+ /// Sets up stopper variables in preparation for the first pass.
+ void SettupStopperPass1();
+ /// Sets up stopper variables in preparation for the second pass.
+ void SettupStopperPass2();
+ /* context.cpp *************************************************************/
+ /// Check a string to see if it matches a set of lexical rules.
+ int case_ok(const WERD_CHOICE& word) const;
+ /// Returns true if the word looks like an absolute garbage
+ /// (e.g. image mistakenly recognized as text).
+ bool absolute_garbage(const WERD_CHOICE &word, const UNICHARSET &unicharset);
+
+ /* dict.cpp ****************************************************************/
+
+ /// Initialize Dict class - load dawgs from [lang].traineddata and
+ /// user-specified wordlist and pattern list.
+ static DawgCache *GlobalDawgCache();
+ // Sets up ready for a Load or LoadLSTM.
+ void SetupForLoad(DawgCache *dawg_cache);
+ // Loads the dawgs needed by Tesseract. Call FinishLoad() after.
+ void Load(const STRING &lang, TessdataManager *data_file);
+ // Loads the dawgs needed by the LSTM model. Call FinishLoad() after.
+ void LoadLSTM(const STRING &lang, TessdataManager *data_file);
+ // Completes the loading process after Load() and/or LoadLSTM().
+ // Returns false if no dictionaries were loaded.
+ bool FinishLoad();
+ void End();
+
+ // Resets the document dictionary analogous to ResetAdaptiveClassifier:
+ // clears pending_words_ and document_words_, skipping whichever of the two
+ // is null.
+ void ResetDocumentDictionary() {
+   if (pending_words_) { pending_words_->clear(); }
+   if (document_words_) { document_words_->clear(); }
+ }
+
+ /**
+ * Returns the maximal permuter code (from ccstruct/ratngs.h) if in light
+ * of the current state the letter at word_index in the given word
+ * is allowed according to at least one of the dawgs in dawgs_,
+ * otherwise returns NO_PERM.
+ *
+ * The state is described by void_dawg_args, which are interpreted as
+ * DawgArgs and contain relevant active dawg positions.
+ * Each entry in the active_dawgs vector contains an index
+ * into the dawgs_ vector and an EDGE_REF that indicates the last edge
+ * followed in the dawg. It also may contain a position in the punctuation
+ * dawg which describes surrounding punctuation (see struct DawgPosition).
+ *
+ * Input:
+ * At word_index 0 dawg_args->active_dawgs should contain an entry for each
+ * dawg that may start at the beginning of a word, with punc_ref and edge_ref
+ * initialized to NO_EDGE. Since the punctuation dawg includes the empty
+ * pattern " " (meaning anything without surrounding punctuation), having a
+ * single entry for the punctuation dawg will cover all dawgs reachable
+ * therefrom -- that includes all number and word dawgs. The only dawg
+ * non-reachable from the punctuation_dawg is the pattern dawg.
+ * If hyphen state needs to be applied, initial dawg_args->active_dawgs can
+ * be copied from the saved hyphen state (maintained by Dict).
+ * For word_index > 0 the corresponding state (active_dawgs and punc position)
+ * can be obtained from dawg_args->updated_dawgs passed to
+ * def_letter_is_okay for word_index-1.
+ * Note: the function assumes that active_dawgs, and updated_dawgs
+ * member variables of dawg_args are not nullptr.
+ *
+ * Output:
+ * The function fills in dawg_args->updated_dawgs vector with the
+ * entries for dawgs that contain the word up to the letter at word_index.
+ *
+ */
+
+ //
+ int def_letter_is_okay(void* void_dawg_args, const UNICHARSET& unicharset,
+ UNICHAR_ID unichar_id, bool word_end) const;
+
+ int (Dict::*letter_is_okay_)(void* void_dawg_args,
+ const UNICHARSET& unicharset,
+ UNICHAR_ID unichar_id, bool word_end) const;
+ /// Forwards all arguments to the member function letter_is_okay_ points at.
+ int LetterIsOkay(void* void_dawg_args, const UNICHARSET& unicharset,
+                  UNICHAR_ID unichar_id, bool word_end) const {
+   const auto checker = letter_is_okay_;
+   return (this->*checker)(void_dawg_args, unicharset, unichar_id, word_end);
+ }
+
+
+ /// Probability in context function used by the ngram permuter.
+ double (Dict::*probability_in_context_)(const char* lang,
+ const char* context,
+ int context_bytes,
+ const char* character,
+ int character_bytes);
+ /// Calls the member function probability_in_context_ points at. The
+ /// language argument is taken from getCCUtil()->lang.c_str(); the context
+ /// and character buffers and their byte counts are forwarded as given.
+ /// Returns whatever the pointed-to function returns.
+ double ProbabilityInContext(const char* context, int context_bytes,
+                             const char* character, int character_bytes) {
+   const char* lang = getCCUtil()->lang.c_str();
+   return (this->*probability_in_context_)(lang, context, context_bytes,
+                                           character, character_bytes);
+ }
+
+ /// Default (no-op) implementation of probability in context function.
+ /// Every argument is ignored (the parameters are left unnamed for that
+ /// reason) and the result is always 0.0.
+ double def_probability_in_context(
+     const char* /*lang*/,
+     const char* /*context*/,
+     int /*context_bytes*/,
+     const char* /*character*/,
+     int /*character_bytes*/) {
+   return 0.0;
+ }
+
+ inline void SetWildcardID(UNICHAR_ID id) { wildcard_unichar_id_ = id; }  // stores the wildcard id
+ inline UNICHAR_ID WildcardID() const { return wildcard_unichar_id_; }  // reads it back
+ /// Returns the number of dawgs in the dawgs_ vector.
+ inline int NumDawgs() const { return dawgs_.size(); }
+ /// Returns the dawg pointer stored at position index of the dawgs_ vector.
+ inline const Dawg *GetDawg(int index) const { return dawgs_[index]; }
+ /// Returns the pointer to the punctuation dawg (punc_dawg_).
+ inline const Dawg *GetPuncDawg() const { return punc_dawg_; }
+ /// Returns the pointer to the unambiguous words dawg (unambig_dawg_).
+ inline const Dawg *GetUnambigDawg() const { return unambig_dawg_; }
+ /// Node to continue from: 0 if edge_ref is NO_EDGE (start of the dawg), else
+ /// the edge's next node, with a next node of 0 mapped to NO_EDGE (word end).
+ static inline NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref) {
+   if (edge_ref == NO_EDGE) return 0;
+   const NODE_REF next = dawg->next_node(edge_ref);
+   return next == 0 ? NO_EDGE : next;
+ }
+
+ // Given a unichar from a string and a given dawg, returns the unichar to
+ // match with in that dawg: digits are mapped to Dawg::kPatternUnicharID for
+ // a DAWG_TYPE_NUMBER dawg; in every other case (including a null dawg) ch
+ // is returned unchanged.
+ UNICHAR_ID char_for_dawg(const UNICHARSET& unicharset, UNICHAR_ID ch,
+                          const Dawg *dawg) const {
+   const bool is_number_dawg =
+       dawg != nullptr && dawg->type() == DAWG_TYPE_NUMBER;
+   if (is_number_dawg && unicharset.get_isdigit(ch)) {
+     return Dawg::kPatternUnicharID;
+   }
+   return ch;
+ }
+
+ /// For each of the character classes of the given unichar_id (and the
+ /// unichar_id itself) finds the corresponding outgoing node or self-loop
+ /// in the given dawg and (after checking that it is valid) records it in
+ /// dawg_args->updated_dawgs. Updates current_permuter if any valid
+ /// edges were found.
+ void ProcessPatternEdges(const Dawg *dawg, const DawgPosition &info,
+ UNICHAR_ID unichar_id, bool word_end,
+ DawgArgs *dawg_args,
+ PermuterType *current_permuter) const;
+
+ /// Read/Write/Access special purpose dawgs which contain words
+ /// only of a certain length (used for phrase search for
+ /// non-space-delimited languages).
+
+ /// Is perm a dictionary permuter code (NUMBER_PERM counts only if numbers_ok)?
+ inline static bool valid_word_permuter(uint8_t perm, bool numbers_ok) {
+   const bool word_perm = perm == SYSTEM_DAWG_PERM || perm == FREQ_DAWG_PERM ||
+                          perm == DOC_DAWG_PERM || perm == USER_DAWG_PERM ||
+                          perm == USER_PATTERN_PERM || perm == COMPOUND_PERM;
+   return word_perm || (numbers_ok && perm == NUMBER_PERM);
+ }
+ int valid_word(const WERD_CHOICE &word, bool numbers_ok) const;
+ int valid_word(const WERD_CHOICE &word) const {
+ return valid_word(word, false); // return NO_PERM for words with digits
+ }
+ int valid_word_or_number(const WERD_CHOICE &word) const {
+ return valid_word(word, true); // return NUMBER_PERM for valid numbers
+ }
+ /// This function is used by api/tesseract_cube_combiner.cpp
+ int valid_word(const char *string) const {
+ WERD_CHOICE word(string, getUnicharset());
+ return valid_word(word);
+ }
+ // Do the two WERD_CHOICEs form a meaningful bigram?
+ bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const;
+ /// Returns true if the word contains a valid punctuation pattern.
+ /// Note: Since the domains of punctuation symbols and symbols
+ /// used in numbers are not disjoint, a valid number might contain
+ /// an invalid punctuation pattern (e.g. .99).
+ bool valid_punctuation(const WERD_CHOICE &word);
+ /// Returns true if a good answer is found for the unknown blob rating.
+ int good_choice(const WERD_CHOICE &choice);
+ /// Adds a word found on this document to the document specific dictionary.
+ void add_document_word(const WERD_CHOICE &best_choice);
+ /// Adjusts the rating of the given word.
+ void adjust_word(WERD_CHOICE *word,
+ bool nonword, XHeightConsistencyEnum xheight_consistency,
+ float additional_adjust,
+ bool modify_rating,
+ bool debug);
+ /// Set wordseg_rating_adjust_factor_ to the given value.
+ inline void SetWordsegRatingAdjustFactor(float f) {
+ wordseg_rating_adjust_factor_ = f;
+ }
+ /// Returns true if the language is space-delimited (not CJ, or T).
+ bool IsSpaceDelimitedLang() const;
+
+ private:
+ /** Private member variables. */
+ CCUtil* ccutil_;
+ /**
+ * Table that stores ambiguities computed during training
+ * (loaded when NoDangerousAmbig() is called for the first time).
+ * Each entry i in the table stores a set of ambiguities whose
+ * wrong ngram starts with unichar id i.
+ */
+#ifndef DISABLED_LEGACY_ENGINE
+ UnicharAmbigs* dang_ambigs_table_ = nullptr;
+ /** Same as above, but for ambiguities with replace flag set. */
+ UnicharAmbigs* replace_ambigs_table_ = nullptr;
+#endif
+ /** Additional certainty padding allowed before a word is rejected. */
+ float reject_offset_;
+ // Cached UNICHAR_IDs:
+ UNICHAR_ID wildcard_unichar_id_; // kDictWildcard.
+ UNICHAR_ID apostrophe_unichar_id_; // kApostropheSymbol.
+ UNICHAR_ID question_unichar_id_; // kQuestionSymbol.
+ UNICHAR_ID slash_unichar_id_; // kSlashSymbol.
+ UNICHAR_ID hyphen_unichar_id_; // kHyphenSymbol.
+ // Hyphen-related variables.
+ WERD_CHOICE *hyphen_word_;
+ DawgPositionVector hyphen_active_dawgs_;
+ bool last_word_on_line_;
+ // List of lists of "equivalent" UNICHAR_IDs for the purposes of dictionary
+ // matching. The first member of each list is taken as canonical. For
+ // example, the first list contains hyphens and dashes with the first symbol
+ // being the ASCII hyphen minus.
+ std::vector<GenericVector<UNICHAR_ID> > equivalent_symbols_;
+ // Dawg Cache reference - this is who we ask to allocate/deallocate dawgs.
+ DawgCache *dawg_cache_;
+ bool dawg_cache_is_ours_; // we should delete our own dawg_cache_
+ // Dawgs.
+ DawgVector dawgs_;
+ SuccessorListsVector successors_;
+ Trie *pending_words_;
+ /// The following pointers are only cached for convenience.
+ /// The dawgs will be deleted when dawgs_ vector is destroyed.
+ // bigram_dawg_ points to a dawg of two-word bigrams which always supersede if
+ // any of them are present on the best choices list for a word pair.
+ // the bigrams are stored as space-separated words where:
+ // (1) leading and trailing punctuation has been removed from each word and
+ // (2) any digits have been replaced with '?' marks.
+ Dawg *bigram_dawg_;
+ // TODO(daria): need to support multiple languages in the future,
+ // so maybe will need to maintain a list of dawgs of each kind.
+ Dawg *freq_dawg_;
+ Dawg *unambig_dawg_;
+ Dawg *punc_dawg_;
+ Trie *document_words_;
+ /// Current segmentation cost adjust factor for word rating.
+ /// See comments in incorporate_segcost.
+ float wordseg_rating_adjust_factor_;
+ // File for recording ambiguities discovered during dictionary search.
+ FILE *output_ambig_words_file_;
+
+ public:
+ /// Variable members.
+ /// These have to be declared and initialized after ccutil_, which contains
+ /// the pointer to the params vector - the member of its base CCUtil class.
+ STRING_VAR_H(user_words_file, "", "A filename of user-provided words.");
+ STRING_VAR_H(user_words_suffix, "",
+ "A suffix of user-provided words located in tessdata.");
+ STRING_VAR_H(user_patterns_file, "",
+ "A filename of user-provided patterns.");
+ STRING_VAR_H(user_patterns_suffix, "",
+ "A suffix of user-provided patterns located in tessdata.");
+ BOOL_VAR_H(load_system_dawg, true, "Load system word dawg.");
+ BOOL_VAR_H(load_freq_dawg, true, "Load frequent word dawg.");
+ BOOL_VAR_H(load_unambig_dawg, true, "Load unambiguous word dawg.");
+ BOOL_VAR_H(load_punc_dawg, true,
+ "Load dawg with punctuation patterns.");
+ BOOL_VAR_H(load_number_dawg, true, "Load dawg with number patterns.");
+ BOOL_VAR_H(load_bigram_dawg, true,
+ "Load dawg with special word bigrams.");
+ double_VAR_H(xheight_penalty_subscripts, 0.125,
+ "Score penalty (0.1 = 10%) added if there are subscripts "
+ "or superscripts in a word, but it is otherwise OK.");
+ double_VAR_H(xheight_penalty_inconsistent, 0.25,
+ "Score penalty (0.1 = 10%) added if an xheight is "
+ "inconsistent.");
+ double_VAR_H(segment_penalty_dict_frequent_word, 1.0,
+ "Score multiplier for word matches which have good case and"
+ "are frequent in the given language (lower is better).");
+
+ double_VAR_H(segment_penalty_dict_case_ok, 1.1,
+ "Score multiplier for word matches that have good case "
+ "(lower is better).");
+
+ double_VAR_H(segment_penalty_dict_case_bad, 1.3125,
+ "Default score multiplier for word matches, which may have "
+ "case issues (lower is better).");
+
+ double_VAR_H(segment_penalty_dict_nonword, 1.25,
+ "Score multiplier for glyph fragment segmentations which "
+ "do not match a dictionary word (lower is better).");
+
+ double_VAR_H(segment_penalty_garbage, 1.50,
+ "Score multiplier for poorly cased strings that are not in"
+ " the dictionary and generally look like garbage (lower is"
+ " better).");
+ STRING_VAR_H(output_ambig_words_file, "",
+ "Output file for ambiguities found in the dictionary");
+ INT_VAR_H(dawg_debug_level, 0, "Set to 1 for general debug info"
+ ", to 2 for more details, to 3 to see all the debug messages");
+ INT_VAR_H(hyphen_debug_level, 0, "Debug level for hyphenated words.");
+ BOOL_VAR_H(use_only_first_uft8_step, false,
+ "Use only the first UTF8 step of the given string"
+ " when computing log probabilities.");
+ double_VAR_H(certainty_scale, 20.0, "Certainty scaling factor");
+ double_VAR_H(stopper_nondict_certainty_base, -2.50,
+ "Certainty threshold for non-dict words");
+ double_VAR_H(stopper_phase2_certainty_rejection_offset, 1.0,
+ "Reject certainty offset");
+ INT_VAR_H(stopper_smallword_size, 2,
+ "Size of dict word to be treated as non-dict word");
+ double_VAR_H(stopper_certainty_per_char, -0.50,
+ "Certainty to add for each dict char above small word size.");
+ double_VAR_H(stopper_allowable_character_badness, 3.0,
+ "Max certaintly variation allowed in a word (in sigma)");
+ INT_VAR_H(stopper_debug_level, 0, "Stopper debug level");
+ BOOL_VAR_H(stopper_no_acceptable_choices, false,
+ "Make AcceptableChoice() always return false. Useful"
+ " when there is a need to explore all segmentations");
+ INT_VAR_H(tessedit_truncate_wordchoice_log, 10, "Max words to keep in list");
+ STRING_VAR_H(word_to_debug, "", "Word for which stopper debug information"
+ " should be printed to stdout");
+ BOOL_VAR_H(segment_nonalphabetic_script, false,
+ "Don't use any alphabetic-specific tricks."
+ "Set to true in the traineddata config file for"
+ " scripts that are cursive or inherently fixed-pitch");
+ BOOL_VAR_H(save_doc_words, 0, "Save Document Words");
+ double_VAR_H(doc_dict_pending_threshold, 0.0,
+ "Worst certainty for using pending dictionary");
+ double_VAR_H(doc_dict_certainty_threshold, -2.25, "Worst certainty"
+ " for words that can be inserted into the document dictionary");
+ INT_VAR_H(max_permuter_attempts, 10000, "Maximum number of different"
+ " character choices to consider during permutation."
+ " This limit is especially useful when user patterns"
+ " are specified, since overly generic patterns can result in"
+ " dawg search exploring an overly large number of options.");
+};
+
+} // namespace tesseract
+
+#endif // TESSERACT_DICT_DICT_H_