main.go 5.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261
  1. /*
  2. * This Source Code Form is subject to the terms of the Mozilla Public
  3. * License, v. 2.0. If a copy of the MPL was not distributed with this
  4. * file, You can obtain one at https://mozilla.org/MPL/2.0/.
  5. */
  6. package main
  7. import (
  8. "context"
  9. "database/sql"
  10. "encoding/csv"
  11. "fmt"
  12. "io"
  13. "os"
  14. "os/signal"
  15. "path/filepath"
  16. "regexp"
  17. "strings"
  18. "time"
  19. _ "github.com/mattn/go-sqlite3"
  20. logger "idio.link/go/logger/v3"
  21. "golang.org/x/term"
  22. )
  23. var (
  24. Version, Build string
  25. )
  26. const MaxRecords = 1_000_000_000
  27. func main() {
  28. log := logger.NewLogger()
  29. if len(os.Args) != 2 {
  30. if Build == "" {
  31. Version = "dev"
  32. Build = time.Now().UTC().Format(time.RFC3339)
  33. }
  34. fmt.Printf("%s %s\n", Version, Build)
  35. fmt.Printf("usage: %s <csv_path>\n", os.Args[0])
  36. os.Exit(1)
  37. }
  38. csvPath, err := filepath.Abs(os.Args[1])
  39. if err != nil {
  40. log.Fatal("resolve absolute path for '%s': %v", os.Args[1], err)
  41. }
  42. ctx := context.WithValue(context.Background(), "log", log)
  43. ctx, cancel := context.WithCancel(ctx)
  44. go func() {
  45. _, stop := signal.NotifyContext(
  46. ctx,
  47. os.Interrupt,
  48. os.Kill,
  49. )
  50. sig := make(chan os.Signal, 1)
  51. <-sig
  52. stop()
  53. log.Info("caught signal; shutting down")
  54. cancel()
  55. }()
  56. // open csv for read
  57. f, err := os.Open(os.Args[1])
  58. if err != nil {
  59. log.Fatal("open csv: %v", err)
  60. }
  61. defer func() {
  62. if err := f.Close(); err != nil {
  63. log.Error("close csv '%s': %v", csvPath, err)
  64. }
  65. }()
  66. r := csv.NewReader(f)
  67. r.ReuseRecord = true
  68. // open db
  69. tmp := strings.TrimSuffix(csvPath, ".csv")
  70. dbPath := tmp + ".db"
  71. name := scrubName(filepath.Base(tmp))
  72. // no need to rollback or be crash resistant
  73. params := "?_synchronous=0&_journal_mode=OFF&_temp_store=2"
  74. db, err := sql.Open("sqlite3", dbPath+params)
  75. if err != nil {
  76. log.Fatal("open db '%s': %v", dbPath, err)
  77. }
  78. defer func() {
  79. if err := db.Close(); err != nil {
  80. log.Error("close db '%s': %v", dbPath, err)
  81. }
  82. }()
  83. // create table, prepare insert
  84. var insert *sql.Stmt
  85. headers := make([]string, 0, 32)
  86. rec, err := r.Read()
  87. if err != nil && err != io.EOF {
  88. log.Fatal("read csv '%s': %v", csvPath, err)
  89. }
  90. for _, f := range rec {
  91. f = scrubName(f)
  92. headers = append(headers, f)
  93. }
  94. _, err = createTable(ctx, db, name, headers)
  95. if err != nil {
  96. log.Fatal("create table '%s': %v", name, err)
  97. }
  98. insert, err = genInsert(ctx, db, name, headers)
  99. if err != nil {
  100. log.Fatal("prepare insert: %v", err)
  101. }
  102. defer func() {
  103. if err := insert.Close(); err != nil {
  104. log.Error("close prepared insert: %v", err)
  105. }
  106. }()
  107. // insert records
  108. tx, err := db.BeginTx(ctx, nil)
  109. if err != nil {
  110. log.Fatal("begin transaction: %v", err)
  111. }
  112. i := 0
  113. for {
  114. i++
  115. if i >= MaxRecords {
  116. panic(fmt.Sprintf("BUG: max record limit reached: %d", MaxRecords))
  117. }
  118. if ctx.Err() != nil {
  119. break
  120. }
  121. // Print count for every thousand-ish records
  122. if i&4095 == 0 {
  123. printStatus(i)
  124. }
  125. if i&65535 == 0 {
  126. if err := tx.Commit(); err != nil {
  127. log.Fatal("commit transaction: %v", err)
  128. }
  129. tx, err = db.BeginTx(ctx, nil)
  130. if err != nil {
  131. log.Fatal("begin transaction: %v", err)
  132. }
  133. }
  134. rec, err := r.Read()
  135. if err != nil {
  136. fmt.Println()
  137. if err == io.EOF {
  138. fmt.Fprintf(os.Stderr, "read %d records\n", i)
  139. } else {
  140. log.Error("read csv '%s': %v", csvPath, err)
  141. }
  142. break
  143. }
  144. // Rather slow, doing this for every record, but since
  145. // we cannot know how many fields will exist ahead of
  146. // time (we compile *now*, not after we've read the
  147. // headers), we cannot simply enter, say, 27 arguments,
  148. // each converted to `any`, by hand. It may be faster to
  149. // simply print the SQL statements and pipe them to
  150. // sqlite. Should check this.
  151. args := make([]any, len(rec))
  152. for j := 0; j < len(rec); j++ {
  153. args[j] = any(rec[j])
  154. }
  155. _, err = insert.ExecContext(ctx, args...)
  156. if err != nil {
  157. log.Fatal("insert record '%#v': %v", rec, err)
  158. }
  159. }
  160. if err := tx.Commit(); err != nil {
  161. log.Fatal("commit transaction: %v", err)
  162. }
  163. }
  164. func printStatus(n int) {
  165. if term.IsTerminal(int(os.Stdout.Fd())) {
  166. fmt.Printf("\x1b[%dG", 1)
  167. fmt.Printf("%d", n)
  168. }
  169. }
  170. func genInsert(
  171. ctx context.Context,
  172. db *sql.DB,
  173. name string,
  174. headers []string,
  175. ) (*sql.Stmt, error) {
  176. log := ctx.Value("log").(*logger.Logger)
  177. var b strings.Builder
  178. b.WriteString(fmt.Sprintf("INSERT INTO %s (", name))
  179. for i, h := range headers {
  180. if i != 0 {
  181. b.WriteString(", ")
  182. }
  183. b.WriteString(h)
  184. }
  185. b.WriteString(") VALUES (")
  186. for i := 0; i < len(headers); i++ {
  187. if i != 0 {
  188. b.WriteString(", ")
  189. }
  190. b.WriteString(fmt.Sprintf("$%d", i+1))
  191. }
  192. b.WriteString(");")
  193. log.Debug("debug: prepare insert: %s\n", b.String())
  194. return db.PrepareContext(ctx, b.String())
  195. }
  196. func createTable(
  197. ctx context.Context,
  198. db *sql.DB,
  199. name string,
  200. headers []string,
  201. ) (sql.Result, error) {
  202. var b strings.Builder
  203. b.WriteString(
  204. fmt.Sprintf("CREATE TABLE IF NOT EXISTS %s (", name),
  205. )
  206. for i, h := range headers {
  207. if i != 0 {
  208. b.WriteString(", ")
  209. }
  210. b.WriteString(fmt.Sprintf("%s text", h))
  211. }
  212. b.WriteString(");")
  213. fmt.Fprintf(os.Stderr, "%s\n", b.String())
  214. return db.ExecContext(ctx, b.String())
  215. }
  216. var badRunes = regexp.MustCompile(`[^a-zA-Z0-9_\-\.]+`)
  217. var badFirst = regexp.MustCompile(`^[^a-zA-Z]+`)
  218. var parens = regexp.MustCompile(`\([^\)]*\)`)
  219. var spaces = regexp.MustCompile(`\s+`)
  220. var hyphens = regexp.MustCompile(`\-+`)
  221. var unders = regexp.MustCompile(`_+`)
  222. func scrubName(s string) string {
  223. s = strings.ToLower(s)
  224. s = badFirst.ReplaceAllLiteralString(s, "")
  225. s = parens.ReplaceAllLiteralString(s, "")
  226. s = strings.TrimSpace(s)
  227. s = spaces.ReplaceAllLiteralString(s, "_")
  228. s = badRunes.ReplaceAllLiteralString(s, "_")
  229. s = hyphens.ReplaceAllLiteralString(s, "_")
  230. s = unders.ReplaceAllLiteralString(s, "_")
  231. s = strings.TrimSuffix(s, "_")
  232. return fmt.Sprintf("'%s'", s)
  233. }