mbcs.go 2.0 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192
  1. package mahonia
  2. // Generic converters for multibyte character sets.
  // An mbcsTrie is one node of the byte-indexed trie that holds the data to
  // convert from the character set to Unicode. Each byte of an encoded
  // character selects one child: if a character is encoded as "\x01\x02\x03",
  // its Unicode value is found at
  // t.children[1].children[2].children[3].char.
  type mbcsTrie struct {
  	// char is the Unicode character that ends at this node. It is zero when
  	// no character ends here; the decoder keeps descending while it is zero.
  	char rune

  	// children is either nil or has 256 elements, one for each possible value
  	// of the next byte. Each element decodes the remainder of the character.
  	children []mbcsTrie
  }
  12. // A MBCSTable holds the data to convert to and from Unicode.
  13. type MBCSTable struct {
  14. toUnicode mbcsTrie
  15. fromUnicode map[rune]string
  16. }
  17. // AddCharacter adds a character to the table. rune is its Unicode code point,
  18. // and bytes contains the bytes used to encode it in the character set.
  19. func (table *MBCSTable) AddCharacter(c rune, bytes string) {
  20. if table.fromUnicode == nil {
  21. table.fromUnicode = make(map[rune]string)
  22. }
  23. table.fromUnicode[c] = bytes
  24. trie := &table.toUnicode
  25. for i := 0; i < len(bytes); i++ {
  26. if trie.children == nil {
  27. trie.children = make([]mbcsTrie, 256)
  28. }
  29. b := bytes[i]
  30. trie = &trie.children[b]
  31. }
  32. trie.char = c
  33. }
  34. func (table *MBCSTable) Decoder() Decoder {
  35. return func(p []byte) (c rune, size int, status Status) {
  36. if len(p) == 0 {
  37. status = NO_ROOM
  38. return
  39. }
  40. if p[0] == 0 {
  41. return 0, 1, SUCCESS
  42. }
  43. trie := &table.toUnicode
  44. for trie.char == 0 {
  45. if trie.children == nil {
  46. return 0xfffd, 1, INVALID_CHAR
  47. }
  48. if len(p) < size+1 {
  49. return 0, 0, NO_ROOM
  50. }
  51. trie = &trie.children[p[size]]
  52. size++
  53. }
  54. c = trie.char
  55. status = SUCCESS
  56. return
  57. }
  58. }
  59. func (table *MBCSTable) Encoder() Encoder {
  60. return func(p []byte, c rune) (size int, status Status) {
  61. bytes := table.fromUnicode[c]
  62. if bytes == "" {
  63. if len(p) > 0 {
  64. p[0] = '?'
  65. return 1, INVALID_CHAR
  66. } else {
  67. return 0, NO_ROOM
  68. }
  69. }
  70. if len(p) < len(bytes) {
  71. return 0, NO_ROOM
  72. }
  73. return copy(p, bytes), SUCCESS
  74. }
  75. }