123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102 |
- package chardet
- import (
- "bytes"
- )
// recognizer2022 scores input against one ISO-2022 style charset by looking
// for that charset's escape sequences.
type recognizer2022 struct {
	// charset is the name reported in the match result.
	charset string
	// escapes lists the byte sequences that count as a hit when they appear
	// directly after an ESC (0x1B) byte; the ESC byte itself is not stored.
	escapes [][]byte
}
- func (r *recognizer2022) Match(input *recognizerInput) (output recognizerOutput) {
- return recognizerOutput{
- Charset: r.charset,
- Confidence: r.matchConfidence(input.input),
- }
- }
- func (r *recognizer2022) matchConfidence(input []byte) int {
- var hits, misses, shifts int
- input:
- for i := 0; i < len(input); i++ {
- c := input[i]
- if c == 0x1B {
- for _, esc := range r.escapes {
- if bytes.HasPrefix(input[i+1:], esc) {
- hits++
- i += len(esc)
- continue input
- }
- }
- misses++
- } else if c == 0x0E || c == 0x0F {
- shifts++
- }
- }
- if hits == 0 {
- return 0
- }
- quality := (100*hits - 100*misses) / (hits + misses)
- if hits+shifts < 5 {
- quality -= (5 - (hits + shifts)) * 10
- }
- if quality < 0 {
- quality = 0
- }
- return quality
- }
// escapeSequences_2022JP lists the byte sequences that, when found directly
// after an ESC (0x1B) byte, count as evidence for ISO-2022-JP. The leading ESC
// is not part of the entries. Bytes are written as their ASCII characters.
var escapeSequences_2022JP = [][]byte{
	{'$', '(', 'C'}, // ESC $ ( C
	{'$', '(', 'D'}, // ESC $ ( D
	{'$', '@'},      // ESC $ @
	{'$', 'A'},      // ESC $ A
	{'$', 'B'},      // ESC $ B
	{'&', '@'},      // ESC & @
	{'(', 'B'},      // ESC ( B
	{'(', 'H'},      // ESC ( H
	{'(', 'I'},      // ESC ( I
	{'(', 'J'},      // ESC ( J
	{'.', 'A'},      // ESC . A
	{'.', 'F'},      // ESC . F
}
// escapeSequences_2022KR lists the byte sequences that, when found directly
// after an ESC (0x1B) byte, count as evidence for ISO-2022-KR. The leading ESC
// is not part of the entries.
var escapeSequences_2022KR = [][]byte{
	{'$', ')', 'C'}, // ESC $ ) C
}
// escapeSequences_2022CN lists the byte sequences that, when found directly
// after an ESC (0x1B) byte, count as evidence for ISO-2022-CN. The leading ESC
// is not part of the entries. Order is kept as in the original table.
var escapeSequences_2022CN = [][]byte{
	{'$', ')', 'A'}, // ESC $ ) A
	{'$', ')', 'G'}, // ESC $ ) G
	{'$', '*', 'H'}, // ESC $ * H
	{'$', ')', 'E'}, // ESC $ ) E
	{'$', '+', 'I'}, // ESC $ + I
	{'$', '+', 'J'}, // ESC $ + J
	{'$', '+', 'K'}, // ESC $ + K
	{'$', '+', 'L'}, // ESC $ + L
	{'$', '+', 'M'}, // ESC $ + M
	{'N'},           // ESC N
	{'O'},           // ESC O
}
- func newRecognizer_2022JP() *recognizer2022 {
- return &recognizer2022{
- "ISO-2022-JP",
- escapeSequences_2022JP,
- }
- }
- func newRecognizer_2022KR() *recognizer2022 {
- return &recognizer2022{
- "ISO-2022-KR",
- escapeSequences_2022KR,
- }
- }
- func newRecognizer_2022CN() *recognizer2022 {
- return &recognizer2022{
- "ISO-2022-CN",
- escapeSequences_2022CN,
- }
- }
|