summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
Diffstat (limited to 'tesseract/src/ccmain/docqual.cpp')
-rw-r--r-- tesseract/src/ccmain/docqual.cpp 981
1 files changed, 981 insertions, 0 deletions
diff --git a/tesseract/src/ccmain/docqual.cpp b/tesseract/src/ccmain/docqual.cpp
new file mode 100644
index 00000000..f74f9ead
--- /dev/null
+++ b/tesseract/src/ccmain/docqual.cpp
@@ -0,0 +1,981 @@
+/******************************************************************
+ * File: docqual.cpp (Formerly docqual.c)
+ * Description: Document Quality Metrics
+ * Author: Phil Cheatle
+ *
+ * (C) Copyright 1994, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#include <cctype>
+#include "docqual.h"
+#include "reject.h"
+#include "tessvars.h"
+#include "tesseractclass.h"
+
+namespace tesseract{
+
+// Callback for ProcessMatchedBlobs(): records one more matched blob.
+// The blob index is part of the callback signature but is not needed here.
+static void countMatchingBlobs(int16_t& match_count, int /*index*/) {
+  match_count += 1;
+}
+
+// Callback for ProcessMatchedBlobs(): counts every matched blob in
+// match_count and, in accepted_match_count, those whose reject-map entry at
+// the same index is currently accepted.
+static void countAcceptedBlobs(WERD_RES* word, int16_t& match_count,
+                               int16_t& accepted_match_count, int index) {
+  ++match_count;
+  if (!word->reject_map[index].accepted()) {
+    return;
+  }
+  ++accepted_match_count;
+}
+
+// Callback for ProcessMatchedBlobs(): flags the reject-map entry at index as
+// "quality accept" when the entry itself reports accept_if_good_quality().
+static void acceptIfGoodQuality(WERD_RES* word, int index) {
+  if (!word->reject_map[index].accept_if_good_quality()) {
+    return;
+  }
+  word->reject_map[index].setrej_quality_accept();
+}
+
+/*************************************************************************
+ * word_blob_quality()
+ * How many blobs in the box_word are identical to those of the inword?
+ * ASSUME blobs in both initial word and box_word are in ascending order of
+ * left hand blob edge.
+ *
+ * Returns the number of matches reported by ProcessMatchedBlobs(), or 0 when
+ * the word has no bln_boxes, no rebuild_word, or no blobs.
+ *************************************************************************/
+int16_t Tesseract::word_blob_quality(WERD_RES* word) {
+  int16_t match_count = 0;
+  if (word->bln_boxes != nullptr && word->rebuild_word != nullptr &&
+      !word->rebuild_word->blobs.empty()) {
+    // The counter must be captured by reference. std::bind() stores a COPY of
+    // each bound argument, so binding match_count directly would increment
+    // the copy and this function would always return 0.
+    word->bln_boxes->ProcessMatchedBlobs(
+        *word->rebuild_word,
+        [&match_count](int index) { countMatchingBlobs(match_count, index); });
+  }
+  return match_count;
+}
+
+/*************************************************************************
+ * word_outline_errs()
+ * Sum, over all blobs of the rebuilt word, of count_outline_errs() for the
+ * character at the same position in the best choice string.
+ * Returns 0 when the word has no rebuild_word.
+ * NOTE(review): the best choice string is indexed by blob number, which
+ * presumably assumes one byte per blob - confirm for multi-byte unichars.
+ *************************************************************************/
+int16_t Tesseract::word_outline_errs(WERD_RES *word) {
+  if (word->rebuild_word == nullptr) {
+    return 0;
+  }
+
+  int16_t total_errs = 0;
+  int16_t ch_index = 0;
+  for (int b = 0; b < word->rebuild_word->NumBlobs(); ++b, ++ch_index) {
+    TBLOB* blob = word->rebuild_word->blobs[b];
+    total_errs += count_outline_errs(
+        word->best_choice->unichar_string()[ch_index], blob->NumOutlines());
+  }
+  return total_errs;
+}
+
+/*************************************************************************
+ * word_char_quality()
+ * Combination of blob quality and outline quality - how many good chars are
+ * there? - I.e chars which pass the blob AND outline tests.
+ *
+ * Outputs:
+ *   *match_count          - number of blobs matched by ProcessMatchedBlobs().
+ *   *accepted_match_count - number of those whose reject-map entry is
+ *                           accepted.
+ * Both are set to 0 when the word has no bln_boxes, no rebuild_word, or no
+ * blobs.
+ *************************************************************************/
+void Tesseract::word_char_quality(WERD_RES* word, int16_t* match_count,
+                                  int16_t* accepted_match_count) {
+  *match_count = 0;
+  *accepted_match_count = 0;
+  if (word->bln_boxes != nullptr && word->rebuild_word != nullptr &&
+      !word->rebuild_word->blobs.empty()) {
+    // Forward the caller's counters themselves. std::bind() would copy the
+    // dereferenced values into the binder, so the increments would be lost
+    // and both outputs would stay at 0.
+    word->bln_boxes->ProcessMatchedBlobs(
+        *word->rebuild_word,
+        [word, match_count, accepted_match_count](int index) {
+          countAcceptedBlobs(word, *match_count, *accepted_match_count, index);
+        });
+  }
+}
+
+/*************************************************************************
+ * unrej_good_chs()
+ * Unreject POTENTIAL rejects if the blob passes the blob and outline checks
+ *
+ * Does nothing when the word has no bln_boxes, no rebuild_word, or no blobs.
+ *************************************************************************/
+void Tesseract::unrej_good_chs(WERD_RES* word) {
+  // Only run when there ARE blobs to match, as word_blob_quality() and
+  // word_char_quality() do. The guard previously tested blobs.empty()
+  // without the negation, so matching only ran on words with no blobs.
+  if (word->bln_boxes != nullptr && word->rebuild_word != nullptr &&
+      !word->rebuild_word->blobs.empty()) {
+    word->bln_boxes->ProcessMatchedBlobs(
+        *word->rebuild_word,
+        [word](int index) { acceptIfGoodQuality(word, index); });
+  }
+}
+
+/*************************************************************************
+ * count_outline_errs()
+ * Returns how far outline_count is from the number of outlines expected for
+ * character c: 2 for characters listed in outlines_2, 1 otherwise.
+ * Characters listed in outlines_odd are never counted as errors.
+ *************************************************************************/
+int16_t Tesseract::count_outline_errs(char c, int16_t outline_count) {
+  if (STRING(outlines_odd).contains(c)) {
+    return 0;  // Don't use this char
+  }
+  const int expected_outline_count = STRING(outlines_2).contains(c) ? 2 : 1;
+  return abs(outline_count - expected_outline_count);
+}
+
+/*************************************************************************
+ * quality_based_rejection()
+ * Driver for the quality-based passes over the page:
+ *   - optionally unreject good quality words (only on good quality docs),
+ *   - apply document/block/row rejection,
+ *   - optionally run the tilde crunch and tilde delete passes.
+ *************************************************************************/
+void Tesseract::quality_based_rejection(PAGE_RES_IT &page_res_it,
+                                        bool good_quality_doc) {
+  const bool unreject_first = tessedit_good_quality_unrej && good_quality_doc;
+  if (unreject_first) {
+    unrej_good_quality_words(page_res_it);
+  }
+  doc_and_block_rejection(page_res_it, good_quality_doc);
+  if (!unlv_tilde_crunching) {
+    return;
+  }
+  tilde_crunch(page_res_it);
+  tilde_delete(page_res_it);
+}
+
+/*************************************************************************
+ * unrej_good_quality_words()
+ * Accept potential rejects in words which pass the following checks:
+ *    - Contains a potential reject
+ *    - Word looks like a sensible alpha word.
+ *    - Word segmentation is the same as the original image
+ *    - All characters have the expected number of outlines
+ * Rows with no characters, or whose reject ratio exceeds quality_rowrej_pc,
+ * are skipped as a whole. With bland_unrej set, every character that reports
+ * accept_if_good_quality() is accepted without any further checks.
+ * NOTE - the rejection counts are recalculated after unrejection
+ *      - CAN'T do it in a single pass without a bit of fiddling
+ *      - keep it simple but inefficient
+ *************************************************************************/
+void Tesseract::unrej_good_quality_words(  // unreject potential
+    PAGE_RES_IT &page_res_it) {
+  // Pass 1: visit every word and unreject whatever can be recovered.
+  page_res_it.restart_page();
+  while (page_res_it.word() != nullptr) {
+    check_debug_pt(page_res_it.word(), 100);
+    if (bland_unrej) {
+      WERD_RES *word = page_res_it.word();
+      for (int ch = 0; ch < word->reject_map.length(); ++ch) {
+        if (word->reject_map[ch].accept_if_good_quality()) {
+          word->reject_map[ch].setrej_quality_accept();
+        }
+      }
+      page_res_it.forward();
+    } else {
+      const bool row_is_good =
+          (page_res_it.row()->char_count > 0) &&
+          ((page_res_it.row()->rej_count /
+            static_cast<float>(page_res_it.row()->char_count)) <=
+           quality_rowrej_pc);
+      if (row_is_good) {
+        WERD_RES *word = page_res_it.word();
+        const bool recoverable =
+            word->reject_map.quality_recoverable_rejects() &&
+            (tessedit_unrej_any_wd ||
+             acceptable_word_string(
+                 *word->uch_set,
+                 word->best_choice->unichar_string().c_str(),
+                 word->best_choice->unichar_lengths().c_str()) !=
+                 AC_UNACCEPTABLE);
+        if (recoverable) {
+          unrej_good_chs(word);
+        }
+        page_res_it.forward();
+      } else {
+        // Skip to end of dodgy row.
+        ROW_RES *dodgy_row = page_res_it.row();
+        while ((page_res_it.word() != nullptr) &&
+               (page_res_it.row() == dodgy_row)) {
+          page_res_it.forward();
+        }
+      }
+    }
+    check_debug_pt(page_res_it.word(), 110);
+  }
+
+  // Pass 2: rebuild the page, block and row reject statistics from scratch.
+  page_res_it.restart_page();
+  page_res_it.page_res->char_count = 0;
+  page_res_it.page_res->rej_count = 0;
+  BLOCK_RES *seen_block = nullptr;
+  ROW_RES *seen_row = nullptr;
+  for (; page_res_it.word() != nullptr; page_res_it.forward()) {
+    if (seen_block != page_res_it.block()) {
+      // First word of a new block: clear the block totals.
+      seen_block = page_res_it.block();
+      seen_block->char_count = 0;
+      seen_block->rej_count = 0;
+    }
+    if (seen_row != page_res_it.row()) {
+      // First word of a new row: clear the row totals.
+      seen_row = page_res_it.row();
+      seen_row->char_count = 0;
+      seen_row->rej_count = 0;
+      seen_row->whole_word_rej_count = 0;
+    }
+    page_res_it.rej_stat_word();
+  }
+}
+
+
+/*************************************************************************
+ * doc_and_block_rejection()
+ *
+ * If the page has too many rejects - reject all of it.
+ * If any block has too many rejects - reject all words in the block
+ * Otherwise each row of the block is tested, and the words of a row are
+ * rejected when the row's reject percentage exceeds
+ * tessedit_reject_row_percent AND the share of those rejects that come from
+ * whole-word rejects is below tessedit_whole_wd_rej_row_percent.
+ *
+ * page_res_it      - iterator over the page; it is restarted by this function.
+ * good_quality_doc - when true and tessedit_row_rej_good_docs is off, a word
+ *                    on a rejected row is only rejected if its own reject
+ *                    fraction exceeds tessedit_good_doc_still_rowrej_wd.
+ *
+ * Within a rejected block or row, words may be preserved according to the
+ * tessedit_preserve_* and tessedit_dont_*rej_good_wds settings tested below.
+ *************************************************************************/
+
+void Tesseract::doc_and_block_rejection( // reject big chunks
+ PAGE_RES_IT &page_res_it,
+ bool good_quality_doc) {
+ int16_t block_no = 0; // only used in debug output
+ int16_t row_no = 0; // only used in debug output; restarts at 0 per block
+ BLOCK_RES *current_block;
+ ROW_RES *current_row;
+
+ bool rej_word;
+ bool prev_word_rejected;
+ int16_t char_quality = 0; // output of word_char_quality()
+ int16_t accepted_char_quality; // output of word_char_quality(); not read here
+
+ if (page_res_it.page_res->rej_count * 100.0 /
+ page_res_it.page_res->char_count > tessedit_reject_doc_percent) {
+ reject_whole_page(page_res_it);
+ if (tessedit_debug_doc_rejection) {
+ tprintf("REJECT ALL #chars: %d #Rejects: %d; \n",
+ page_res_it.page_res->char_count,
+ page_res_it.page_res->rej_count);
+ }
+ } else {
+ if (tessedit_debug_doc_rejection) {
+ tprintf("NO PAGE REJECTION #chars: %d # Rejects: %d; \n",
+ page_res_it.page_res->char_count,
+ page_res_it.page_res->rej_count);
+ }
+
+ /* Walk blocks testing for block rejection */
+
+ page_res_it.restart_page();
+ WERD_RES* word;
+ while ((word = page_res_it.word()) != nullptr) {
+ current_block = page_res_it.block();
+ block_no = current_block->block->pdblk.index();
+ if (current_block->char_count > 0 &&
+ (current_block->rej_count * 100.0 / current_block->char_count) >
+ tessedit_reject_block_percent) {
+ if (tessedit_debug_block_rejection) {
+ tprintf("REJECTING BLOCK %d #chars: %d; #Rejects: %d\n",
+ block_no, current_block->char_count,
+ current_block->rej_count);
+ }
+ prev_word_rejected = false;
+ while ((word = page_res_it.word()) != nullptr &&
+ (page_res_it.block() == current_block)) {
+ if (tessedit_preserve_blk_rej_perfect_wds) {
+ rej_word = word->reject_map.reject_count() > 0 ||
+ word->reject_map.length () < tessedit_preserve_min_wd_len;
+ if (rej_word && tessedit_dont_blkrej_good_wds &&
+ word->reject_map.length() >= tessedit_preserve_min_wd_len &&
+ acceptable_word_string(
+ *word->uch_set,
+ word->best_choice->unichar_string().c_str(),
+ word->best_choice->unichar_lengths().c_str()) !=
+ AC_UNACCEPTABLE) {
+ word_char_quality(word, &char_quality, &accepted_char_quality);
+ rej_word = char_quality != word->reject_map.length(); // keep word only if every char matched
+ }
+ } else {
+ rej_word = true;
+ }
+ if (rej_word) {
+ /*
+ Reject spacing if both current and prev words are rejected.
+ NOTE - this is NOT restricted to FUZZY spaces. - When tried this
+ generated more space errors.
+ */
+ if (tessedit_use_reject_spaces &&
+ prev_word_rejected &&
+ page_res_it.prev_row() == page_res_it.row() &&
+ word->word->space() == 1)
+ word->reject_spaces = true;
+ word->reject_map.rej_word_block_rej();
+ }
+ prev_word_rejected = rej_word;
+ page_res_it.forward();
+ }
+ } else {
+ if (tessedit_debug_block_rejection) {
+ tprintf("NOT REJECTING BLOCK %d #chars: %d # Rejects: %d; \n",
+ block_no, page_res_it.block()->char_count,
+ page_res_it.block()->rej_count);
+ }
+
+ /* Walk rows in block testing for row rejection */
+ row_no = 0;
+ while (page_res_it.word() != nullptr &&
+ page_res_it.block() == current_block) {
+ current_row = page_res_it.row();
+ row_no++;
+ /* Reject whole row if:
+ fraction of chars on row which are rejected exceed a limit AND
+ fraction rejects which occur in WHOLE WERD rejects is LESS THAN a
+ limit
+ */
+ if (current_row->char_count > 0 &&
+ (current_row->rej_count * 100.0 / current_row->char_count) >
+ tessedit_reject_row_percent &&
+ (current_row->whole_word_rej_count * 100.0 /
+ current_row->rej_count) <
+ tessedit_whole_wd_rej_row_percent) {
+ if (tessedit_debug_block_rejection) {
+ tprintf("REJECTING ROW %d #chars: %d; #Rejects: %d\n",
+ row_no, current_row->char_count,
+ current_row->rej_count);
+ }
+ prev_word_rejected = false;
+ while ((word = page_res_it.word()) != nullptr &&
+ page_res_it.row () == current_row) {
+ /* Preserve words on good docs unless they are mostly rejected */
+ if (!tessedit_row_rej_good_docs && good_quality_doc) {
+ rej_word = word->reject_map.reject_count() /
+ static_cast<float>(word->reject_map.length()) >
+ tessedit_good_doc_still_rowrej_wd;
+ } else if (tessedit_preserve_row_rej_perfect_wds) {
+ /* Preserve perfect words anyway */
+ rej_word = word->reject_map.reject_count() > 0 ||
+ word->reject_map.length () < tessedit_preserve_min_wd_len;
+ if (rej_word && tessedit_dont_rowrej_good_wds &&
+ word->reject_map.length() >= tessedit_preserve_min_wd_len &&
+ acceptable_word_string(*word->uch_set,
+ word->best_choice->unichar_string().c_str(),
+ word->best_choice->unichar_lengths().c_str()) !=
+ AC_UNACCEPTABLE) {
+ word_char_quality(word, &char_quality,
+ &accepted_char_quality);
+ rej_word = char_quality != word->reject_map.length(); // keep word only if every char matched
+ }
+ } else {
+ rej_word = true;
+ }
+ if (rej_word) {
+ /*
+ Reject spacing if both current and prev words are rejected.
+ NOTE - this is NOT restricted to FUZZY spaces. - When tried
+ this generated more space errors.
+ */
+ if (tessedit_use_reject_spaces &&
+ prev_word_rejected &&
+ page_res_it.prev_row() == page_res_it.row() &&
+ word->word->space () == 1)
+ word->reject_spaces = true;
+ word->reject_map.rej_word_row_rej();
+ }
+ prev_word_rejected = rej_word;
+ page_res_it.forward();
+ }
+ } else {
+ if (tessedit_debug_block_rejection) {
+ tprintf("NOT REJECTING ROW %d #chars: %d # Rejects: %d; \n",
+ row_no, current_row->char_count, current_row->rej_count);
+ }
+ while (page_res_it.word() != nullptr &&
+ page_res_it.row() == current_row) // skip the remaining words of the row
+ page_res_it.forward();
+ }
+ }
+ }
+ }
+ }
+}
+
+/*************************************************************************
+ * reject_whole_page()
+ * Don't believe any of it - mark every word on the page as a document
+ * reject and flag the page itself as rejected.
+ *************************************************************************/
+
+void reject_whole_page(PAGE_RES_IT &page_res_it) {
+  for (page_res_it.restart_page(); page_res_it.word() != nullptr;
+       page_res_it.forward()) {
+    page_res_it.word()->reject_map.rej_word_doc_rej();
+  }
+  // whole page is rejected
+  page_res_it.page_res->rejected = true;
+}
+
+/*************************************************************************
+ * tilde_crunch()
+ * Walk the page marking garbage words with CR_KEEP_SPACE.
+ *   - Words in non-text blocks are skipped.
+ *   - Words with at least one accepted character are never crunched and
+ *     reset the state below.
+ *   - A "terrible" word is crunched at once, together with any run of
+ *     "potential" crunch words immediately before it.
+ *   - A "potential" word is crunched if it follows a terrible word;
+ *     otherwise the start of the run is remembered in case a terrible word
+ *     turns up later.
+ *************************************************************************/
+void Tesseract::tilde_crunch(PAGE_RES_IT &page_res_it) {
+  PAGE_RES_IT copy_it;  // Start of the current run of potential crunches.
+  bool prev_potential_marked = false;
+  bool found_terrible_word = false;
+
+  page_res_it.restart_page();
+  while (page_res_it.word() != nullptr) {
+    POLY_BLOCK* pb = page_res_it.block()->block->pdblk.poly_block();
+    if (pb != nullptr && !pb->IsText()) {
+      page_res_it.forward();
+      continue;
+    }
+    WERD_RES* word = page_res_it.word();
+
+    if (crunch_early_convert_bad_unlv_chs) {
+      convert_bad_unlv_chs(word);
+    }
+    if (crunch_early_merge_tess_fails) {
+      word->merge_tess_fails();
+    }
+
+    if (word->reject_map.accept_count() != 0) {
+      // Forget earlier potential crunches
+      found_terrible_word = false;
+      prev_potential_marked = false;
+      page_res_it.forward();
+      continue;
+    }
+
+    const bool ok_dict_word = safe_dict_word(word);
+    const GARBAGE_LEVEL garbage_level = garbage_word(word, ok_dict_word);
+    const bool may_crunch = garbage_level != G_NEVER_CRUNCH;
+
+    if (may_crunch && terrible_word_crunch(word, garbage_level)) {
+      if (crunch_debug > 0) {
+        tprintf("T CRUNCHING: \"%s\"\n",
+                word->best_choice->unichar_string().c_str());
+      }
+      word->unlv_crunch_mode = CR_KEEP_SPACE;
+      if (prev_potential_marked) {
+        // Crunch the pending run of potential words leading up to this one.
+        for (; copy_it.word() != word; copy_it.forward()) {
+          if (crunch_debug > 0) {
+            tprintf("P1 CRUNCHING: \"%s\"\n",
+                    copy_it.word()->best_choice->unichar_string().c_str());
+          }
+          copy_it.word()->unlv_crunch_mode = CR_KEEP_SPACE;
+        }
+        prev_potential_marked = false;
+      }
+      found_terrible_word = true;
+    } else if (may_crunch &&
+               potential_word_crunch(word, garbage_level, ok_dict_word)) {
+      if (found_terrible_word) {
+        if (crunch_debug > 0) {
+          tprintf("P2 CRUNCHING: \"%s\"\n",
+                  word->best_choice->unichar_string().c_str());
+        }
+        word->unlv_crunch_mode = CR_KEEP_SPACE;
+      } else if (!prev_potential_marked) {
+        // Remember where this run of potential crunches starts.
+        copy_it = page_res_it;
+        prev_potential_marked = true;
+        if (crunch_debug > 1) {
+          tprintf("P3 CRUNCHING: \"%s\"\n",
+                  word->best_choice->unichar_string().c_str());
+        }
+      }
+    } else {
+      // Forget earlier potential crunches
+      found_terrible_word = false;
+      prev_potential_marked = false;
+      if (crunch_debug > 2) {
+        tprintf("NO CRUNCH: \"%s\"\n",
+                word->best_choice->unichar_string().c_str());
+      }
+    }
+    page_res_it.forward();
+  }
+}
+
+
+/*************************************************************************
+ * terrible_word_crunch()
+ * Returns true when the word should be crunched outright. The reason is
+ * recorded as a mode number that only appears in debug output:
+ *   1 - the best choice string is empty or consists only of spaces
+ *   2 - rating per character exceeds crunch_terrible_rating
+ *   3 - crunch_terrible_garbage is set and the garbage level is G_TERRIBLE
+ *   4 - certainty below crunch_poor_garbage_cert and garbage level not G_OK
+ *   5 - rating per character exceeds crunch_poor_garbage_rate and garbage
+ *       level not G_OK
+ * The length used for the per-character rating is capped at
+ * crunch_rating_max.
+ *************************************************************************/
+bool Tesseract::terrible_word_crunch(WERD_RES* word,
+                                     GARBAGE_LEVEL garbage_level) {
+  int crunch_mode = 0;
+
+  const bool blank_word =
+      (word->best_choice->unichar_string().length() == 0) ||
+      (strspn(word->best_choice->unichar_string().c_str(), " ") ==
+       word->best_choice->unichar_string().unsigned_size());
+  if (blank_word) {
+    crunch_mode = 1;
+  } else {
+    int adjusted_len = word->reject_map.length();
+    if (adjusted_len > crunch_rating_max) {
+      adjusted_len = crunch_rating_max;
+    }
+    const float rating_per_ch = word->best_choice->rating() / adjusted_len;
+
+    if (rating_per_ch > crunch_terrible_rating) {
+      crunch_mode = 2;
+    } else if (crunch_terrible_garbage && (garbage_level == G_TERRIBLE)) {
+      crunch_mode = 3;
+    } else if ((word->best_choice->certainty() < crunch_poor_garbage_cert) &&
+               (garbage_level != G_OK)) {
+      crunch_mode = 4;
+    } else if ((rating_per_ch > crunch_poor_garbage_rate) &&
+               (garbage_level != G_OK)) {
+      crunch_mode = 5;
+    }
+  }
+
+  if (crunch_mode <= 0) {
+    return false;
+  }
+  if (crunch_debug > 2) {
+    tprintf("Terrible_word_crunch (%d) on \"%s\"\n",
+            crunch_mode, word->best_choice->unichar_string().c_str());
+  }
+  return true;
+}
+
+/*************************************************************************
+ * potential_word_crunch()
+ * Counts up to three "poor" indicators for the word and returns true when
+ * at least crunch_pot_indicators of them are present:
+ *   - rating per character (length capped at 10) above crunch_pot_poor_rate
+ *   - the word is crunchable and its certainty is below crunch_pot_poor_cert
+ *   - the garbage level is anything other than G_OK
+ * A word is crunchable unless crunch_leave_accept_strings is set, it has at
+ * least 3 characters, and it is either an acceptable word string or an ok
+ * dictionary word.
+ *************************************************************************/
+bool Tesseract::potential_word_crunch(WERD_RES* word,
+                                      GARBAGE_LEVEL garbage_level,
+                                      bool ok_dict_word) {
+  const char *str = word->best_choice->unichar_string().c_str();
+  const char *lengths = word->best_choice->unichar_lengths().c_str();
+
+  const bool word_crunchable =
+      !crunch_leave_accept_strings ||
+      word->reject_map.length() < 3 ||
+      (acceptable_word_string(*word->uch_set, str, lengths) ==
+           AC_UNACCEPTABLE &&
+       !ok_dict_word);
+
+  int adjusted_len = word->reject_map.length();
+  if (adjusted_len > 10) {
+    adjusted_len = 10;
+  }
+  const float rating_per_ch = word->best_choice->rating() / adjusted_len;
+
+  int poor_indicator_count = 0;
+
+  if (rating_per_ch > crunch_pot_poor_rate) {
+    if (crunch_debug > 2) {
+      tprintf("Potential poor rating on \"%s\"\n",
+              word->best_choice->unichar_string().c_str());
+    }
+    poor_indicator_count += 1;
+  }
+
+  if (word_crunchable &&
+      word->best_choice->certainty() < crunch_pot_poor_cert) {
+    if (crunch_debug > 2) {
+      tprintf("Potential poor cert on \"%s\"\n",
+              word->best_choice->unichar_string().c_str());
+    }
+    poor_indicator_count += 1;
+  }
+
+  if (garbage_level != G_OK) {
+    if (crunch_debug > 2) {
+      tprintf("Potential garbage on \"%s\"\n",
+              word->best_choice->unichar_string().c_str());
+    }
+    poor_indicator_count += 1;
+  }
+
+  return poor_indicator_count >= crunch_pot_indicators;
+}
+
+/*************************************************************************
+ * tilde_delete()
+ * Walk the page and set unlv_crunch_mode on deletable words at the ends of
+ * lines, using the mode returned by word_deletable():
+ *   - a deletable word flagged W_BOL starts a run; every deletable word that
+ *     directly continues the run is marked as well,
+ *   - a deletable word flagged W_EOL is marked together with the run of
+ *     deletable words remembered immediately before it,
+ *   - any other deletable word only records where such a run starts,
+ *   - a non-deletable word cancels both kinds of run.
+ * When crunch_early_merge_tess_fails is off, merge_tess_fails() is applied
+ * to each word after it has been examined.
+ *************************************************************************/
+void Tesseract::tilde_delete(PAGE_RES_IT &page_res_it) {
+  PAGE_RES_IT copy_it;  // Start of a run of deletable mid-line words.
+  bool deleting_from_bol = false;
+  bool marked_delete_point = false;
+  int16_t debug_delete_mode;
+  int16_t x_debug_delete_mode;
+
+  page_res_it.restart_page();
+  while (page_res_it.word() != nullptr) {
+    WERD_RES *word = page_res_it.word();
+
+    const CRUNCH_MODE delete_mode = word_deletable(word, debug_delete_mode);
+    if (delete_mode == CR_NONE) {
+      deleting_from_bol = false;
+      // Forget earlier potential crunches
+      marked_delete_point = false;
+    } else if (word->word->flag(W_BOL) || deleting_from_bol) {
+      if (crunch_debug > 0) {
+        tprintf("BOL CRUNCH DELETING(%d): \"%s\"\n",
+                debug_delete_mode,
+                word->best_choice->unichar_string().c_str());
+      }
+      word->unlv_crunch_mode = delete_mode;
+      deleting_from_bol = true;
+    } else if (word->word->flag(W_EOL)) {
+      if (marked_delete_point) {
+        // Mark the remembered run of words leading up to this one.
+        for (; copy_it.word() != word; copy_it.forward()) {
+          const CRUNCH_MODE x_delete_mode =
+              word_deletable(copy_it.word(), x_debug_delete_mode);
+          if (crunch_debug > 0) {
+            tprintf("EOL CRUNCH DELETING(%d): \"%s\"\n",
+                    x_debug_delete_mode,
+                    copy_it.word()->best_choice->unichar_string().c_str());
+          }
+          copy_it.word()->unlv_crunch_mode = x_delete_mode;
+        }
+      }
+      if (crunch_debug > 0) {
+        tprintf("EOL CRUNCH DELETING(%d): \"%s\"\n",
+                debug_delete_mode,
+                word->best_choice->unichar_string().c_str());
+      }
+      word->unlv_crunch_mode = delete_mode;
+      deleting_from_bol = false;
+      marked_delete_point = false;
+    } else if (!marked_delete_point) {
+      copy_it = page_res_it;
+      marked_delete_point = true;
+    }
+    /*
+      The following step has been left till now as the tess fails are used to
+      determine if the word is deletable.
+    */
+    if (!crunch_early_merge_tess_fails) {
+      word->merge_tess_fails();
+    }
+    page_res_it.forward();
+  }
+}
+
+
+/*************************************************************************
+ * convert_bad_unlv_chs()
+ * Replace every "~" in the best choice with "-" and every "^" with " ".
+ * Each replaced character that was accepted is marked as an UNLV reject.
+ *************************************************************************/
+void Tesseract::convert_bad_unlv_chs(WERD_RES *word_res) {
+  const UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
+  const UNICHAR_ID unichar_space = word_res->uch_set->unichar_to_id(" ");
+  const UNICHAR_ID unichar_tilde = word_res->uch_set->unichar_to_id("~");
+  const UNICHAR_ID unichar_pow = word_res->uch_set->unichar_to_id("^");
+
+  // Swap in the replacement at pos and reject that position if it had been
+  // accepted.
+  const auto replace_and_reject = [word_res](UNICHAR_ID replacement, int pos) {
+    word_res->best_choice->set_unichar_id(replacement, pos);
+    if (word_res->reject_map[pos].accepted()) {
+      word_res->reject_map[pos].setrej_unlv_rej();
+    }
+  };
+
+  for (int pos = 0; pos < word_res->reject_map.length(); ++pos) {
+    if (word_res->best_choice->unichar_id(pos) == unichar_tilde) {
+      replace_and_reject(unichar_dash, pos);
+    }
+    if (word_res->best_choice->unichar_id(pos) == unichar_pow) {
+      replace_and_reject(unichar_space, pos);
+    }
+  }
+}
+
+/*************************************************************************
+ * garbage_word()
+ * Classify how garbage-like the best choice string of a word is.
+ *
+ * The string is scanned one unichar at a time with a small state machine
+ * that tracks runs of upper case, lower case and digits. The scan counts:
+ *   - isolated alphas and digits (runs of length one),
+ *   - tess rejects (single spaces) and other non-alphanumeric characters,
+ *   - the longest upper/lower case runs and the longest repetition of one
+ *     letter.
+ *
+ * Returns:
+ *   G_NEVER_CRUNCH - the string is long and alphabetic enough and is either
+ *                    an acceptable word string (if crunch_accept_ok) or has
+ *                    a long enough upper/lower case run.
+ *   G_OK           - dictionary/number permuter or acceptable word without
+ *                    rejects, or few enough dodgy characters.
+ *   G_TERRIBLE     - dominated by tess rejects.
+ *   G_DODGY        - too many dodgy characters for its length.
+ *************************************************************************/
+GARBAGE_LEVEL Tesseract::garbage_word(WERD_RES *word, bool ok_dict_word) {
+  enum STATES
+  {
+    JUNK,
+    FIRST_UPPER,
+    FIRST_LOWER,
+    FIRST_NUM,
+    SUBSEQUENT_UPPER,
+    SUBSEQUENT_LOWER,
+    SUBSEQUENT_NUM
+  };
+  // str and lengths must keep pointing at the START of the string: they are
+  // passed to acceptable_word_string() and strpbrk() after the scan. The scan
+  // therefore uses its own cursors (ch, ch_len). Advancing str/lengths in
+  // the loop left both at the terminator, so the later checks only ever saw
+  // an empty string.
+  const char *str = word->best_choice->unichar_string().c_str();
+  const char *lengths = word->best_choice->unichar_lengths().c_str();
+  STATES state = JUNK;
+  int len = 0;
+  int isolated_digits = 0;
+  int isolated_alphas = 0;
+  int bad_char_count = 0;
+  int tess_rejs = 0;
+  int dodgy_chars = 0;
+  UNICHAR_ID last_char = -1;
+  int alpha_repetition_count = 0;
+  int longest_alpha_repetition_count = 0;
+  int longest_lower_run_len = 0;
+  int lower_string_count = 0;
+  int longest_upper_run_len = 0;
+  int upper_string_count = 0;
+  int total_alpha_count = 0;
+  int total_digit_count = 0;
+
+  for (const char *ch = str, *ch_len = lengths; *ch != '\0';
+       ch += *(ch_len++)) {
+    len++;
+    if (word->uch_set->get_isupper(ch, *ch_len)) {
+      total_alpha_count++;
+      switch (state) {
+        case SUBSEQUENT_UPPER:
+        case FIRST_UPPER:
+          state = SUBSEQUENT_UPPER;
+          upper_string_count++;
+          if (longest_upper_run_len < upper_string_count)
+            longest_upper_run_len = upper_string_count;
+          if (last_char == word->uch_set->unichar_to_id(ch, *ch_len)) {
+            alpha_repetition_count++;
+            if (longest_alpha_repetition_count < alpha_repetition_count) {
+              longest_alpha_repetition_count = alpha_repetition_count;
+            }
+          }
+          else {
+            last_char = word->uch_set->unichar_to_id(ch, *ch_len);
+            alpha_repetition_count = 1;
+          }
+          break;
+        case FIRST_NUM:
+          isolated_digits++;
+          // Fall through.
+        default:
+          state = FIRST_UPPER;
+          last_char = word->uch_set->unichar_to_id(ch, *ch_len);
+          alpha_repetition_count = 1;
+          upper_string_count = 1;
+          break;
+      }
+    }
+    else if (word->uch_set->get_islower(ch, *ch_len)) {
+      total_alpha_count++;
+      switch (state) {
+        case SUBSEQUENT_LOWER:
+        case FIRST_LOWER:
+          state = SUBSEQUENT_LOWER;
+          lower_string_count++;
+          if (longest_lower_run_len < lower_string_count)
+            longest_lower_run_len = lower_string_count;
+          if (last_char == word->uch_set->unichar_to_id(ch, *ch_len)) {
+            alpha_repetition_count++;
+            if (longest_alpha_repetition_count < alpha_repetition_count) {
+              longest_alpha_repetition_count = alpha_repetition_count;
+            }
+          }
+          else {
+            last_char = word->uch_set->unichar_to_id(ch, *ch_len);
+            alpha_repetition_count = 1;
+          }
+          break;
+        case FIRST_NUM:
+          isolated_digits++;
+          // Fall through.
+        default:
+          state = FIRST_LOWER;
+          last_char = word->uch_set->unichar_to_id(ch, *ch_len);
+          alpha_repetition_count = 1;
+          lower_string_count = 1;
+          break;
+      }
+    }
+    else if (word->uch_set->get_isdigit(ch, *ch_len)) {
+      total_digit_count++;
+      switch (state) {
+        case FIRST_NUM:
+          state = SUBSEQUENT_NUM;
+          // Fall through.
+        case SUBSEQUENT_NUM:
+          break;
+        case FIRST_UPPER:
+        case FIRST_LOWER:
+          isolated_alphas++;
+          // Fall through.
+        default:
+          state = FIRST_NUM;
+          break;
+      }
+    }
+    else {
+      // Neither letter nor digit: a single space is a tess reject, anything
+      // else is a bad character.
+      if (*ch_len == 1 && *ch == ' ')
+        tess_rejs++;
+      else
+        bad_char_count++;
+      switch (state) {
+        case FIRST_NUM:
+          isolated_digits++;
+          break;
+        case FIRST_UPPER:
+        case FIRST_LOWER:
+          isolated_alphas++;
+          // Fall through.
+        default:
+          break;
+      }
+      state = JUNK;
+    }
+  }
+
+  // Close off a run of length one left open at the end of the string.
+  switch (state) {
+    case FIRST_NUM:
+      isolated_digits++;
+      break;
+    case FIRST_UPPER:
+    case FIRST_LOWER:
+      isolated_alphas++;
+      // Fall through.
+    default:
+      break;
+  }
+
+  if (crunch_include_numerals) {
+    total_alpha_count += total_digit_count - isolated_digits;
+  }
+
+  if (crunch_leave_ok_strings && len >= 4 &&
+      2 * (total_alpha_count - isolated_alphas) > len &&
+      longest_alpha_repetition_count < crunch_long_repetitions) {
+    if ((crunch_accept_ok &&
+         acceptable_word_string(*word->uch_set, str, lengths) !=
+             AC_UNACCEPTABLE) ||
+        longest_lower_run_len > crunch_leave_lc_strings ||
+        longest_upper_run_len > crunch_leave_uc_strings)
+      return G_NEVER_CRUNCH;
+  }
+  if (word->reject_map.length() > 1 &&
+      strpbrk(str, " ") == nullptr &&
+      (word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
+       word->best_choice->permuter() == FREQ_DAWG_PERM ||
+       word->best_choice->permuter() == USER_DAWG_PERM ||
+       word->best_choice->permuter() == NUMBER_PERM ||
+       acceptable_word_string(*word->uch_set, str, lengths) !=
+           AC_UNACCEPTABLE || ok_dict_word))
+    return G_OK;
+
+  const int ok_chars = len - bad_char_count - isolated_digits -
+                       isolated_alphas - tess_rejs;
+
+  if (crunch_debug > 3) {
+    tprintf("garbage_word: \"%s\"\n",
+            word->best_choice->unichar_string().c_str());
+    tprintf("LEN: %d bad: %d iso_N: %d iso_A: %d rej: %d\n",
+            len,
+            bad_char_count, isolated_digits, isolated_alphas, tess_rejs);
+  }
+  if (bad_char_count == 0 &&
+      tess_rejs == 0 &&
+      (len > isolated_digits + isolated_alphas || len <= 2))
+    return G_OK;
+
+  if (tess_rejs > ok_chars ||
+      (tess_rejs > 0 && (bad_char_count + tess_rejs) * 2 > len))
+    return G_TERRIBLE;
+
+  if (len > 4) {
+    dodgy_chars = 2 * tess_rejs + bad_char_count + isolated_digits +
+                  isolated_alphas;
+    if (dodgy_chars > 5 || (dodgy_chars / static_cast<float>(len)) > 0.5)
+      return G_DODGY;
+    else
+      return G_OK;
+  } else {
+    dodgy_chars = 2 * tess_rejs + bad_char_count;
+    if ((len == 4 && dodgy_chars > 2) ||
+        (len == 3 && dodgy_chars > 2) || dodgy_chars >= len)
+      return G_DODGY;
+    else
+      return G_OK;
+  }
+}
+
+
+/*************************************************************************
+ * word_deletable()
+ * Decide whether a crunched word may be deleted. Words that were not
+ * crunched (unlv_crunch_mode == CR_NONE) are never deletable.
+ *
+ * The checks run in the order below and the first hit wins. delete_mode
+ * receives the number of the check that fired (0 if none), for debugging.
+ *    1  CR_DELETE       reject map length is 0
+ *    4  CR_DELETE       word height < crunch_del_min_ht * x-height
+ *    5  CR_DELETE       noise_outlines() reports the word as noise
+ *    2  CR_LOOSE_SPACE  1.5 * number of tess failures > word length
+ *    7  CR_LOOSE_SPACE  certainty < crunch_del_cert
+ *    8  CR_LOOSE_SPACE  rating per char > crunch_del_rating
+ *    9  CR_LOOSE_SPACE  TOP of word too far BELOW the baseline
+ *   10  CR_LOOSE_SPACE  BOTTOM of word too far ABOVE the baseline
+ *   11  CR_LOOSE_SPACE  word height > crunch_del_max_ht * x-height
+ *    3  CR_LOOSE_SPACE  word width < crunch_del_min_width * x-height
+ * Checks 4 and 5 are skipped when the word has no rebuild_word.
+ *************************************************************************/
+
+CRUNCH_MODE Tesseract::word_deletable(WERD_RES *word, int16_t &delete_mode) {
+  // Record the reason code and hand back the matching crunch mode.
+  const auto verdict = [&delete_mode](int16_t mode, CRUNCH_MODE result) {
+    delete_mode = mode;
+    return result;
+  };
+
+  const int word_len = word->reject_map.length();
+  TBOX box;  // BB of word
+
+  if (word->unlv_crunch_mode == CR_NONE) {
+    return verdict(0, CR_NONE);
+  }
+
+  if (word_len == 0) {
+    return verdict(1, CR_DELETE);
+  }
+
+  if (word->rebuild_word != nullptr) {
+    // Cube leaves rebuild_word nullptr.
+    box = word->rebuild_word->bounding_box();
+    if (box.height() < crunch_del_min_ht * kBlnXHeight) {
+      return verdict(4, CR_DELETE);
+    }
+    if (noise_outlines(word->rebuild_word)) {
+      return verdict(5, CR_DELETE);
+    }
+  }
+
+  if ((failure_count(word) * 1.5) > word_len) {
+    return verdict(2, CR_LOOSE_SPACE);
+  }
+
+  if (word->best_choice->certainty() < crunch_del_cert) {
+    return verdict(7, CR_LOOSE_SPACE);
+  }
+
+  const float rating_per_ch = word->best_choice->rating() / word_len;
+  if (rating_per_ch > crunch_del_rating) {
+    return verdict(8, CR_LOOSE_SPACE);
+  }
+
+  if (box.top() < kBlnBaselineOffset - crunch_del_low_word * kBlnXHeight) {
+    return verdict(9, CR_LOOSE_SPACE);
+  }
+
+  if (box.bottom() >
+      kBlnBaselineOffset + crunch_del_high_word * kBlnXHeight) {
+    return verdict(10, CR_LOOSE_SPACE);
+  }
+
+  if (box.height() > crunch_del_max_ht * kBlnXHeight) {
+    return verdict(11, CR_LOOSE_SPACE);
+  }
+
+  if (box.width() < crunch_del_min_width * kBlnXHeight) {
+    return verdict(3, CR_LOOSE_SPACE);
+  }
+
+  return verdict(0, CR_NONE);
+}
+
+/*************************************************************************
+ * failure_count()
+ * Returns the number of space characters in the best choice string of the
+ * word. The local name (tess_rejs in the original) suggests each space
+ * stands for a tess reject.
+ *************************************************************************/
+int16_t Tesseract::failure_count(WERD_RES *word) {
+  int space_count = 0;
+  const char *cursor = word->best_choice->unichar_string().c_str();
+  while (*cursor != '\0') {
+    if (*cursor == ' ') {
+      ++space_count;
+    }
+    ++cursor;
+  }
+  return space_count;
+}
+
+
+/*************************************************************************
+ * noise_outlines()
+ * Returns true when every outline of the word is "small", i.e. the longer
+ * side of its bounding box is below kBlnXHeight * crunch_small_outlines_size.
+ * A word with no outlines at all also yields true.
+ *************************************************************************/
+bool Tesseract::noise_outlines(TWERD* word) {
+  const float small_limit = kBlnXHeight * crunch_small_outlines_size;
+  int16_t outline_count = 0;
+  int16_t small_outline_count = 0;
+
+  for (int b = 0; b < word->NumBlobs(); ++b) {
+    TBLOB* blob = word->blobs[b];
+    for (TESSLINE* ol = blob->outlines; ol != nullptr; ol = ol->next) {
+      ++outline_count;
+      const TBOX box = ol->bounding_box();  // BB of outline
+      const int16_t max_dimension =
+          box.height() > box.width() ? box.height() : box.width();
+      if (max_dimension < small_limit) {
+        ++small_outline_count;
+      }
+    }
+  }
+  return small_outline_count >= outline_count;
+}
+
+} // namespace tesseract