main.go 5.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250
  1. /*
  2. * This Source Code Form is subject to the terms of the Mozilla Public
  3. * License, v. 2.0. If a copy of the MPL was not distributed with this
  4. * file, You can obtain one at https://mozilla.org/MPL/2.0/.
  5. */
  6. package main
  7. import (
  8. "context"
  9. "database/sql"
  10. "encoding/csv"
  11. "fmt"
  12. "io"
  13. "log"
  14. "os"
  15. "os/signal"
  16. "path/filepath"
  17. "regexp"
  18. "strings"
  19. _ "github.com/mattn/go-sqlite3"
  20. "golang.org/x/term"
  21. )
  22. var (
  23. Version string
  24. Build string
  25. )
  26. const MaxRecords = 1_000_000_000
  27. func main() {
  28. if len(os.Args) != 2 {
  29. log.Printf("%s %s", Version, Build)
  30. log.Printf("usage: %s <csv_path>", os.Args[0])
  31. os.Exit(1)
  32. }
  33. csvPath, err := filepath.Abs(os.Args[1])
  34. if err != nil {
  35. log.Fatal(fmt.Sprintf("resolve absolute path for '%s': %v", os.Args[1], err))
  36. }
  37. ctx, cancel := context.WithCancel(context.Background())
  38. go func() {
  39. _, stop := signal.NotifyContext(
  40. ctx,
  41. os.Interrupt,
  42. os.Kill,
  43. )
  44. sig := make(chan os.Signal, 1)
  45. <-sig
  46. stop()
  47. log.Print("caught signal; shutting down")
  48. cancel()
  49. }()
  50. // open csv for read
  51. f, err := os.Open(os.Args[1])
  52. if err != nil {
  53. log.Fatal(fmt.Sprintf("open csv: %v", err))
  54. }
  55. defer func() {
  56. if err := f.Close(); err != nil {
  57. log.Printf("close csv '%s': %v", csvPath, err)
  58. }
  59. }()
  60. r := csv.NewReader(f)
  61. r.ReuseRecord = true
  62. // open db
  63. tmp := strings.TrimSuffix(csvPath, ".csv")
  64. dbPath := tmp + ".db"
  65. name := scrubName(filepath.Base(tmp))
  66. // no need to rollback or be crash resistant
  67. params := "?_synchronous=0&_journal_mode=OFF&_temp_store=2"
  68. db, err := sql.Open("sqlite3", dbPath+params)
  69. if err != nil {
  70. log.Fatal(fmt.Sprintf("open db '%s': %v", dbPath, err))
  71. }
  72. defer func() {
  73. if err := db.Close(); err != nil {
  74. log.Printf("close db '%s': %v", dbPath, err)
  75. }
  76. }()
  77. // create table, prepare insert
  78. var insert *sql.Stmt
  79. headers := make([]string, 0, 32)
  80. rec, err := r.Read()
  81. if err != nil && err != io.EOF {
  82. log.Fatal(fmt.Sprintf("read csv '%s': %v", csvPath, err))
  83. }
  84. for _, f := range rec {
  85. f = scrubName(f)
  86. headers = append(headers, f)
  87. }
  88. _, err = createTable(ctx, db, name, headers)
  89. if err != nil {
  90. log.Fatal(fmt.Sprintf("create table '%s': %v", name, err))
  91. }
  92. insert, err = genInsert(ctx, db, name, headers)
  93. if err != nil {
  94. log.Fatal(fmt.Sprintf("prepare insert: %v", err))
  95. }
  96. defer func() {
  97. if err := insert.Close(); err != nil {
  98. log.Printf("close prepared insert: %v", err)
  99. }
  100. }()
  101. // insert records
  102. tx, err := db.BeginTx(ctx, nil)
  103. if err != nil {
  104. log.Fatal(fmt.Sprintf("begin transaction: %v", err))
  105. }
  106. i := 0
  107. for {
  108. i++
  109. if i >= MaxRecords {
  110. panic(fmt.Sprintf("BUG: max record limit reached: %d", MaxRecords))
  111. }
  112. if ctx.Err() != nil {
  113. break
  114. }
  115. // Print count for every thousand-ish records
  116. if i&4095 == 0 {
  117. printStatus(i)
  118. }
  119. if i&65535 == 0 {
  120. if err := tx.Commit(); err != nil {
  121. log.Fatal(fmt.Printf("commit transaction: %v", err))
  122. }
  123. tx, err = db.BeginTx(ctx, nil)
  124. if err != nil {
  125. log.Fatal(fmt.Sprintf("begin transaction: %v", err))
  126. }
  127. }
  128. rec, err := r.Read()
  129. if err != nil {
  130. fmt.Println()
  131. if err == io.EOF {
  132. log.Printf("read %d records", i)
  133. } else {
  134. log.Printf("read csv '%s': %v", csvPath, err)
  135. }
  136. break
  137. }
  138. // Rather slow, doing this for every record, but since
  139. // we cannot know how many fields will exist ahead of
  140. // time (we compile *now*, not after we've read the
  141. // headers), we cannot simply enter, say, 27 arguments,
  142. // each converted to `any`, by hand. It may be faster to
  143. // simply print the SQL statements and pipe them to
  144. // sqlite. Should check this.
  145. args := make([]any, len(rec))
  146. for j := 0; j < len(rec); j++ {
  147. args[j] = any(rec[j])
  148. }
  149. _, err = insert.ExecContext(ctx, args...)
  150. if err != nil {
  151. log.Fatal(fmt.Sprintf("insert record '%#v': %v", rec, err))
  152. }
  153. }
  154. if err := tx.Commit(); err != nil {
  155. log.Fatal(fmt.Sprintf("commit transaction: %v", err))
  156. }
  157. }
  158. func printStatus(n int) {
  159. if term.IsTerminal(int(os.Stdout.Fd())) {
  160. fmt.Printf("\x1b[%dG", 1)
  161. fmt.Printf("%d", n)
  162. }
  163. }
  164. func genInsert(
  165. ctx context.Context,
  166. db *sql.DB,
  167. name string,
  168. headers []string,
  169. ) (*sql.Stmt, error) {
  170. var b strings.Builder
  171. b.WriteString(fmt.Sprintf("INSERT INTO %s (", name))
  172. for i, h := range headers {
  173. if i != 0 {
  174. b.WriteString(", ")
  175. }
  176. b.WriteString(h)
  177. }
  178. b.WriteString(") VALUES (")
  179. for i := 0; i < len(headers); i++ {
  180. if i != 0 {
  181. b.WriteString(", ")
  182. }
  183. b.WriteString(fmt.Sprintf("$%d", i+1))
  184. }
  185. b.WriteString(");")
  186. log.Printf("debug: prepare insert: %s\n", b.String())
  187. return db.PrepareContext(ctx, b.String())
  188. }
  189. func createTable(
  190. ctx context.Context,
  191. db *sql.DB,
  192. name string,
  193. headers []string,
  194. ) (sql.Result, error) {
  195. var b strings.Builder
  196. b.WriteString(
  197. fmt.Sprintf("CREATE TABLE IF NOT EXISTS %s (", name),
  198. )
  199. for i, h := range headers {
  200. if i != 0 {
  201. b.WriteString(", ")
  202. }
  203. b.WriteString(fmt.Sprintf("%s text", h))
  204. }
  205. b.WriteString(");")
  206. return db.ExecContext(ctx, b.String())
  207. }
  208. var badRunes = regexp.MustCompile(`[^a-zA-Z0-9_\-\.]+`)
  209. var badFirst = regexp.MustCompile(`^[^a-zA-Z]+`)
  210. var parens = regexp.MustCompile(`\([^\)]*\)`)
  211. var spaces = regexp.MustCompile(`\s+`)
  212. var hyphens = regexp.MustCompile(`\-+`)
  213. var unders = regexp.MustCompile(`_+`)
  214. func scrubName(s string) string {
  215. s = strings.ToLower(s)
  216. s = badFirst.ReplaceAllLiteralString(s, "")
  217. s = parens.ReplaceAllLiteralString(s, "")
  218. s = strings.TrimSpace(s)
  219. s = spaces.ReplaceAllLiteralString(s, "_")
  220. s = badRunes.ReplaceAllLiteralString(s, "_")
  221. s = hyphens.ReplaceAllLiteralString(s, "_")
  222. s = unders.ReplaceAllLiteralString(s, "_")
  223. s = strings.TrimSuffix(s, "_")
  224. return s
  225. }