main.go 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275
  1. /*
  2. * This Source Code Form is subject to the terms of the Mozilla Public
  3. * License, v. 2.0. If a copy of the MPL was not distributed with this
  4. * file, You can obtain one at https://mozilla.org/MPL/2.0/.
  5. */
  6. package main
  7. import (
  8. "context"
  9. "database/sql"
  10. "encoding/csv"
  11. "errors"
  12. "fmt"
  13. "io"
  14. "os"
  15. "os/signal"
  16. "path/filepath"
  17. "regexp"
  18. "strings"
  19. "time"
  20. "github.com/mattn/go-sqlite3"
  21. _ "github.com/mattn/go-sqlite3"
  22. logger "idio.link/go/logger/v3"
  23. "golang.org/x/term"
  24. )
  25. var (
  26. Version, Build string
  27. )
  28. const MaxRecords = 1_000_000_000
  29. const SQLITE_CONSTRAINT_PRIMARYKEY = 1555
  30. func main() {
  31. log := logger.NewLogger()
  32. if len(os.Args) != 2 {
  33. if Build == "" {
  34. Version = "dev"
  35. Build = time.Now().UTC().Format(time.RFC3339)
  36. }
  37. fmt.Printf("%s %s\n", Version, Build)
  38. fmt.Printf("usage: %s <csv_path>\n", os.Args[0])
  39. os.Exit(1)
  40. }
  41. csvPath, err := filepath.Abs(os.Args[1])
  42. if err != nil {
  43. log.Fatal("resolve absolute path for '%s': %v", os.Args[1], err)
  44. }
  45. ctx := context.WithValue(context.Background(), "log", log)
  46. ctx, cancel := context.WithCancel(ctx)
  47. go func() {
  48. _, stop := signal.NotifyContext(
  49. ctx,
  50. os.Interrupt,
  51. os.Kill,
  52. )
  53. sig := make(chan os.Signal, 1)
  54. <-sig
  55. stop()
  56. log.Info("caught signal; shutting down")
  57. cancel()
  58. }()
  59. // open csv for read
  60. f, err := os.Open(os.Args[1])
  61. if err != nil {
  62. log.Fatal("open csv: %v", err)
  63. }
  64. defer func() {
  65. if err := f.Close(); err != nil {
  66. log.Error("close csv '%s': %v", csvPath, err)
  67. }
  68. }()
  69. r := csv.NewReader(f)
  70. r.ReuseRecord = true
  71. // open db
  72. tmp := strings.TrimSuffix(csvPath, ".csv")
  73. dbPath := tmp + ".db"
  74. name := scrubName(filepath.Base(tmp))
  75. // no need to rollback or be crash resistant
  76. params := "?_synchronous=0&_journal_mode=OFF&_temp_store=2"
  77. db, err := sql.Open("sqlite3", dbPath+params)
  78. if err != nil {
  79. log.Fatal("open db '%s': %v", dbPath, err)
  80. }
  81. defer func() {
  82. if err := db.Close(); err != nil {
  83. log.Error("close db '%s': %v", dbPath, err)
  84. }
  85. }()
  86. // create table, prepare insert
  87. var insert *sql.Stmt
  88. headers := make([]string, 0, 32)
  89. rec, err := r.Read()
  90. if err != nil && err != io.EOF {
  91. log.Fatal("read csv '%s': %v", csvPath, err)
  92. }
  93. for _, f := range rec {
  94. f = scrubName(f)
  95. headers = append(headers, f)
  96. }
  97. _, err = createTable(ctx, db, name, headers)
  98. if err != nil {
  99. log.Fatal("create table '%s': %v", name, err)
  100. }
  101. insert, err = genInsert(ctx, db, name, headers)
  102. if err != nil {
  103. log.Fatal("prepare insert: %v", err)
  104. }
  105. defer func() {
  106. if err := insert.Close(); err != nil {
  107. log.Error("close prepared insert: %v", err)
  108. }
  109. }()
  110. // insert records
  111. tx, err := db.BeginTx(ctx, nil)
  112. if err != nil {
  113. log.Fatal("begin transaction: %v", err)
  114. }
  115. nonUnique := 0
  116. i := 0
  117. for {
  118. i++
  119. if i >= MaxRecords {
  120. panic(fmt.Sprintf("BUG: max record limit reached: %d", MaxRecords))
  121. }
  122. if ctx.Err() != nil {
  123. break
  124. }
  125. // Print count for every thousand-ish records
  126. if i&4095 == 0 {
  127. printStatus(i)
  128. }
  129. if i&65535 == 0 {
  130. if err := tx.Commit(); err != nil {
  131. log.Fatal("commit transaction: %v", err)
  132. }
  133. tx, err = db.BeginTx(ctx, nil)
  134. if err != nil {
  135. log.Fatal("begin transaction: %v", err)
  136. }
  137. }
  138. rec, err := r.Read()
  139. if err != nil {
  140. fmt.Println()
  141. if err == io.EOF {
  142. fmt.Fprintf(os.Stderr, "read %d records\n", i)
  143. fmt.Fprintf(os.Stderr, "skipped %d redundant records\n", nonUnique)
  144. } else {
  145. log.Error("read csv '%s': %v", csvPath, err)
  146. }
  147. break
  148. }
  149. // Rather slow, doing this for every record, but since
  150. // we cannot know how many fields will exist ahead of
  151. // time (we compile *now*, not after we've read the
  152. // headers), we cannot simply enter, say, 27 arguments,
  153. // each converted to `any`, by hand. It may be faster to
  154. // simply print the SQL statements and pipe them to
  155. // sqlite. Should check this.
  156. args := make([]any, len(rec))
  157. for j := 0; j < len(rec); j++ {
  158. args[j] = any(rec[j])
  159. }
  160. // For explanation of extended codes, see https://www.sqlite.org/rescode.html
  161. _, err = insert.ExecContext(ctx, args...)
  162. if err != nil {
  163. var sqliteErr sqlite3.Error
  164. if errors.As(err, &sqliteErr) &&
  165. sqliteErr.ExtendedCode == SQLITE_CONSTRAINT_PRIMARYKEY { // primary key
  166. nonUnique++
  167. log.Debug("insert record '%#v': %v", rec, err)
  168. } else {
  169. log.Fatal("insert record '%#v': %v", rec, err)
  170. }
  171. }
  172. }
  173. if err := tx.Commit(); err != nil {
  174. log.Fatal("commit transaction: %v", err)
  175. }
  176. }
  177. func printStatus(n int) {
  178. if term.IsTerminal(int(os.Stdout.Fd())) {
  179. fmt.Printf("\x1b[%dG", 1)
  180. fmt.Printf("%d", n)
  181. }
  182. }
  183. func genInsert(
  184. ctx context.Context,
  185. db *sql.DB,
  186. name string,
  187. headers []string,
  188. ) (*sql.Stmt, error) {
  189. log := ctx.Value("log").(*logger.Logger)
  190. var b strings.Builder
  191. b.WriteString(fmt.Sprintf("INSERT INTO %s (", name))
  192. for i, h := range headers {
  193. if i != 0 {
  194. b.WriteString(", ")
  195. }
  196. b.WriteString(h)
  197. }
  198. b.WriteString(") VALUES (")
  199. for i := 0; i < len(headers); i++ {
  200. if i != 0 {
  201. b.WriteString(", ")
  202. }
  203. b.WriteString(fmt.Sprintf("$%d", i+1))
  204. }
  205. b.WriteString(");")
  206. log.Debug("debug: prepare insert: %s\n", b.String())
  207. return db.PrepareContext(ctx, b.String())
  208. }
  209. func createTable(
  210. ctx context.Context,
  211. db *sql.DB,
  212. name string,
  213. headers []string,
  214. ) (sql.Result, error) {
  215. var b strings.Builder
  216. b.WriteString(
  217. fmt.Sprintf("CREATE TABLE IF NOT EXISTS %s (", name),
  218. )
  219. for i, h := range headers {
  220. if i != 0 {
  221. b.WriteString(", ")
  222. }
  223. b.WriteString(fmt.Sprintf("%s text", h))
  224. }
  225. b.WriteString(");")
  226. fmt.Fprintf(os.Stderr, "%s\n", b.String())
  227. return db.ExecContext(ctx, b.String())
  228. }
  229. var badRunes = regexp.MustCompile(`[^a-zA-Z0-9_\-\.]+`)
  230. var badFirst = regexp.MustCompile(`^[^a-zA-Z]+`)
  231. var parens = regexp.MustCompile(`\([^\)]*\)`)
  232. var spaces = regexp.MustCompile(`\s+`)
  233. var hyphens = regexp.MustCompile(`\-+`)
  234. var unders = regexp.MustCompile(`_+`)
  235. func scrubName(s string) string {
  236. s = strings.ToLower(s)
  237. s = badFirst.ReplaceAllLiteralString(s, "")
  238. s = parens.ReplaceAllLiteralString(s, "")
  239. s = strings.TrimSpace(s)
  240. s = spaces.ReplaceAllLiteralString(s, "_")
  241. s = badRunes.ReplaceAllLiteralString(s, "_")
  242. s = hyphens.ReplaceAllLiteralString(s, "_")
  243. s = unders.ReplaceAllLiteralString(s, "_")
  244. s = strings.TrimSuffix(s, "_")
  245. return fmt.Sprintf("'%s'", s)
  246. }