diff options
Diffstat (limited to 'tesseract/src/training/unicharset/validate_javanese.cpp')
-rw-r--r-- | tesseract/src/training/unicharset/validate_javanese.cpp | 275 |
1 files changed, 275 insertions, 0 deletions
diff --git a/tesseract/src/training/unicharset/validate_javanese.cpp b/tesseract/src/training/unicharset/validate_javanese.cpp new file mode 100644 index 00000000..410cf540 --- /dev/null +++ b/tesseract/src/training/unicharset/validate_javanese.cpp @@ -0,0 +1,275 @@ +/********************************************************************** + * File: validate_javanese.cpp + * Description: Text validator for Javanese Script - aksara jawa. + * Author: Shree Devi Kumar + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + **********************************************************************/ + +#include "validate_javanese.h" +#include "errcode.h" +#include "tprintf.h" + +namespace tesseract { + +// Returns whether codes matches the pattern for a Javanese Grapheme. +// Taken from unicode standard: +// http://www.unicode.org/charts/PDF/UA980.pdf +// http://www.unicode.org/versions/Unicode11.0.0/ch17.pdf +// The Consonant class here includes independent vowels. +// The order of components in an orthographic syllable as expressed in BNF is: +// {C F} C {{R}Y} {V{A}} {Z} +// Translated to the codes used by the CharClass enum: +// [(V|C[N])(H)] (V|C[N]) [[N]N] [M[D]] [v] +// Also see https://r12a.github.io/scripts/javanese/ for detailed notes. +// Validation rules copied from validate_indic.cpp and modified for Javanese. +// Indic - for reference +// + vowel Grapheme: V[D](v)* +// + consonant Grapheme: (C[N](H|HZ|Hz|ZH)?)*C[N](H|Hz)?[M[P]][D](v)* + +bool ValidateJavanese::ConsumeGraphemeIfValid() { + switch (codes_[codes_used_].first) { + case CharClass::kConsonant: + return ConsumeConsonantHeadIfValid() && ConsumeConsonantTailIfValid(); + case CharClass::kVowel: + case CharClass::kVedicMark: + return ConsumeVowelIfValid(); + case CharClass::kZeroWidthJoiner: + case CharClass::kZeroWidthNonJoiner: + // Apart from within an aksara, joiners are silently dropped. + if (report_errors_) + tprintf("Dropping isolated joiner: 0x%x\n", codes_[codes_used_].second); + ++codes_used_; + return true; + case CharClass::kOther: + UseMultiCode(1); + return true; + default: + if (report_errors_) { + tprintf("Invalid start of grapheme sequence:%c=0x%x\n", + codes_[codes_used_].first, codes_[codes_used_].second); + } + return false; + } +} + +// Helper consumes/copies a virama and any associated post-virama joiners. +// A linking virama (with either type of pre-virama joiner, post-virama ZWJ, or +// no joiner at all) must be followed by a consonant. +// A non-linking (explicit) virama is indicated by a ZWNJ after it, or a non +// consonant, space, or character from a different script. We clean up the +// representation to make it consistent by adding a ZWNJ if missing from a +// non-linking virama. Returns false with an invalid sequence. +bool ValidateJavanese::ConsumeViramaIfValid(IndicPair joiner, bool post_matra) { + const unsigned num_codes = codes_.size(); + if (joiner.first == CharClass::kOther) { + CodeOnlyToOutput(); + if (codes_used_ < num_codes && + codes_[codes_used_].second == kZeroWidthJoiner) { + // Post-matra viramas must be explicit, so no joiners allowed here. + if (post_matra) { + if (report_errors_) tprintf("ZWJ after a post-matra virama!!\n"); + return false; + } + if (codes_used_ + 1 < num_codes && + codes_[codes_used_ - 2].second != kCakra && + (codes_[codes_used_ + 1].second == kZeroWidthNonJoiner || + codes_[codes_used_ + 1].second == kPengkal || + codes_[codes_used_ + 1].second == kCakra)) { + // This combination will be picked up later. + ASSERT_HOST(!CodeOnlyToOutput()); + } else { + // Half-form with optional Nukta. + unsigned len = output_.size() + 1 - output_used_; + if (UseMultiCode(len)) return true; + } + if (codes_used_ < num_codes && + codes_[codes_used_].second == kZeroWidthNonJoiner) { + if (output_used_ == output_.size() || + output_[output_used_] != kCakra) { + if (report_errors_) { + tprintf("Virama ZWJ ZWNJ in non-Sinhala: base=0x%x!\n", + static_cast<int>(script_)); + } + return false; + } + // Special Sinhala case of Stand-alone Repaya. ['RA' H Z z] + if (UseMultiCode(4)) return true; + } + } else if (codes_used_ == num_codes || + codes_[codes_used_].first != CharClass::kConsonant || + post_matra) { + if (codes_used_ == num_codes || + codes_[codes_used_].second != kZeroWidthNonJoiner) { + // It is valid to have an unterminated virama at the end of a word, but + // for consistency, we will always add ZWNJ if not present. + CodeOnlyToOutput(); + } else { + CodeOnlyToOutput(); + } + // Explicit virama [H z] + MultiCodePart(2); + } + } else { + // Pre-virama joiner [{Z|z} H] requests specific conjunct. + if (UseMultiCode(2)) { + if (report_errors_) + tprintf("Invalid pre-virama joiner with no 2nd consonant!!\n"); + return false; + } + if (codes_[codes_used_].second == kZeroWidthJoiner || + codes_[codes_used_].second == kZeroWidthNonJoiner) { + if (report_errors_) { + tprintf("JHJ!!: 0x%x 0x%x 0x%x\n", joiner.second, output_.back(), + codes_[codes_used_].second); + } + return false; + } + } + // It is good so far as it goes. + return true; +} + +// Helper consumes/copies a series of consonants separated by viramas while +// valid, but not any vowel or other modifiers. +bool ValidateJavanese::ConsumeConsonantHeadIfValid() { + const unsigned num_codes = codes_.size(); + // Consonant aksara + do { + CodeOnlyToOutput(); + // Special Sinhala case of [H Z Yayana/Rayana]. + int index = output_.size() - 3; + if (output_used_ + 3 <= output_.size() && + (output_.back() == kPengkal || output_.back() == kCakra) && + IsVirama(output_[index]) && output_[index + 1] == kZeroWidthJoiner) { + MultiCodePart(3); + } + bool have_nukta = false; + if (codes_used_ < num_codes && + codes_[codes_used_].first == CharClass::kNukta) { + have_nukta = true; + CodeOnlyToOutput(); + } + // Test for subscript conjunct. + index = output_.size() - 2 - have_nukta; + if (output_used_ + 2 + have_nukta <= output_.size() && IsSubscriptScript() && + IsVirama(output_[index])) { + // Output previous virama, consonant + optional nukta. + MultiCodePart(2 + have_nukta); + } + IndicPair joiner(CharClass::kOther, 0); + if (codes_used_ < num_codes && + (codes_[codes_used_].second == kZeroWidthJoiner || + (codes_[codes_used_].second == kZeroWidthNonJoiner && + script_ == ViramaScript::kMalayalam))) { + joiner = codes_[codes_used_]; + if (++codes_used_ == num_codes) { + if (report_errors_) { + tprintf("Skipping ending joiner: 0x%x 0x%x\n", output_.back(), + joiner.second); + } + return true; + } + if (codes_[codes_used_].first == CharClass::kVirama) { + output_.push_back(joiner.second); + } else { + if (report_errors_) { + tprintf("Skipping unnecessary joiner: 0x%x 0x%x 0x%x\n", + output_.back(), joiner.second, codes_[codes_used_].second); + } + joiner = std::make_pair(CharClass::kOther, 0); + } + } + if (codes_used_ < num_codes && + codes_[codes_used_].first == CharClass::kVirama) { + if (!ConsumeViramaIfValid(joiner, false)) return false; + } else { + break; // No virama, so the run of consonants is over. + } + } while (codes_used_ < num_codes && + codes_[codes_used_].first == CharClass::kConsonant); + if (output_used_ < output_.size()) MultiCodePart(1); + return true; +} + +// Helper consumes/copies a tail part of a consonant, comprising optional +// matra/piece, vowel modifier, vedic mark, terminating virama. +bool ValidateJavanese::ConsumeConsonantTailIfValid() { + if (codes_used_ == codes_.size()) return true; + // No virama: Finish the grapheme. + // Are multiple matras allowed? + if (codes_[codes_used_].first == CharClass::kMatra) { + if (UseMultiCode(1)) return true; + if (codes_[codes_used_].first == CharClass::kMatraPiece) { + if (UseMultiCode(1)) return true; + } + } + // Tarung also used for long versions of u and o vowels and vocalic r + // Taling + Tarung is valid eg. ꦏ + ◌ꦺ + ◌ꦴ + while (codes_[codes_used_].first == CharClass::kMatraPiece) { + if (UseMultiCode(1)) return true; + } + while (codes_[codes_used_].first == CharClass::kVowelModifier) { + if (UseMultiCode(1)) return true; + // Only Malayalam allows only repeated 0xd02. + if (script_ != ViramaScript::kMalayalam || output_.back() != 0xd02) break; + } + while (codes_[codes_used_].first == CharClass::kVedicMark) { + if (UseMultiCode(1)) return true; + } + if (codes_[codes_used_].first == CharClass::kVirama) { + if (!ConsumeViramaIfValid(IndicPair(CharClass::kOther, 0), true)) { + return false; + } + } + // What we have consumed so far is a valid consonant cluster. + if (output_used_ < output_.size()) MultiCodePart(1); + + return true; +} + +// Helper consumes/copies a vowel and optional modifiers. +bool ValidateJavanese::ConsumeVowelIfValid() { + if (UseMultiCode(1)) return true; + while (codes_[codes_used_].first == CharClass::kVowelModifier) { + if (UseMultiCode(1)) return true; + // Only Malayalam allows repeated modifiers? + if (script_ != ViramaScript::kMalayalam) break; + } + while (codes_[codes_used_].first == CharClass::kVedicMark) { + if (UseMultiCode(1)) return true; + } + // What we have consumed so far is a valid vowel cluster. + return true; +} + + +Validator::CharClass ValidateJavanese::UnicodeToCharClass(char32 ch) const { + if (ch == kZeroWidthNonJoiner) return CharClass::kZeroWidthNonJoiner; + if (ch == kZeroWidthJoiner) return CharClass::kZeroWidthJoiner; + // Offset from the start of the relevant unicode code block aka code page. + int off = ch - static_cast<char32>(script_); + // Anything in another code block is other. + if (off < 0 || off >= kIndicCodePageSize) return CharClass::kOther; + if (off < 0x4) return CharClass::kVowelModifier; + if (off <= 0x32) return CharClass::kConsonant; // includes independent vowels + if (off == 0x33) return CharClass::kNukta; // A9B3 CECAK TELU + if (off == 0x34) return CharClass::kMatraPiece; // A9B4 TARUNG two part vowels + if (off <= 0x39) return CharClass::kMatra; + if (off <= 0x3a) return CharClass::kConsonant; // A9BA TALING - pre base vowel + if (off <= 0x3d) return CharClass::kMatra; + if (off <= 0x3f) return CharClass::kNukta; // A9BE-A9BF PENGKAL-CAKRA medial consonants + if (off == 0x40) return CharClass::kVirama; // A9C0 PANGKON + return CharClass::kOther; +} + +} // namespace tesseract |