main.go 7.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323
  1. /*
  2. * This Source Code Form is subject to the terms of the Mozilla Public
  3. * License, v. 2.0. If a copy of the MPL was not distributed with this
  4. * file, You can obtain one at https://mozilla.org/MPL/2.0/.
  5. */
  6. package main
  7. import (
  8. "context"
  9. "database/sql"
  10. "encoding/csv"
  11. "errors"
  12. "flag"
  13. "fmt"
  14. "io"
  15. "os"
  16. "os/signal"
  17. "path/filepath"
  18. "regexp"
  19. "strings"
  20. "time"
  21. "github.com/mattn/go-sqlite3"
  22. _ "github.com/mattn/go-sqlite3"
  23. logger "idio.link/go/logger/v3"
  24. "golang.org/x/term"
  25. )
  26. var (
  27. Version, Build string
  28. WithTimestamp bool
  29. )
  30. const (
  31. MaxRecords = 1_000_000_000
  32. SQLITE_CONSTRAINT_PRIMARYKEY = 1555
  33. )
  34. func main() {
  35. log := logger.NewLogger()
  36. flag.BoolVar(
  37. &WithTimestamp,
  38. "timestamp",
  39. false,
  40. "Add a column 'timestamp' to imported rows. "+
  41. "All rows receive the same value for 'timestamp', "+
  42. "calculated at the start of the import. "+
  43. "",
  44. )
  45. flag.Parse()
  46. if len(flag.Args()) != 1 {
  47. if Build == "" {
  48. Version = "dev"
  49. Build = time.Now().UTC().Format(time.RFC3339)
  50. }
  51. fmt.Printf("%s %s\n", Version, Build)
  52. fmt.Printf("usage: %s [--timestamp] <csv_path>\n", os.Args[0])
  53. os.Exit(1)
  54. }
  55. csvPath, err := filepath.Abs(flag.Args()[0])
  56. if err != nil {
  57. log.Fatal("resolve absolute path for '%s': %v", flag.Args()[0], err)
  58. }
  59. ctx := context.WithValue(context.Background(), "log", log)
  60. ctx, cancel := context.WithCancel(ctx)
  61. go func() {
  62. _, stop := signal.NotifyContext(
  63. ctx,
  64. os.Interrupt,
  65. os.Kill,
  66. )
  67. sig := make(chan os.Signal, 1)
  68. <-sig
  69. stop()
  70. log.Info("caught signal; shutting down")
  71. cancel()
  72. }()
  73. // open csv for read
  74. f, err := os.Open(csvPath)
  75. if err != nil {
  76. log.Fatal("open csv: %v", err)
  77. }
  78. defer func() {
  79. if err := f.Close(); err != nil {
  80. log.Error("close csv '%s': %v", csvPath, err)
  81. }
  82. }()
  83. r := csv.NewReader(f)
  84. r.ReuseRecord = true
  85. // open db
  86. tmp := strings.TrimSuffix(csvPath, ".csv")
  87. dbPath := tmp + ".db"
  88. name := scrubName(filepath.Base(tmp))
  89. // no need to rollback or be crash resistant
  90. params := "?_synchronous=0&_journal_mode=OFF&_temp_store=2"
  91. db, err := sql.Open("sqlite3", dbPath+params)
  92. if err != nil {
  93. log.Fatal("open db '%s': %v", dbPath, err)
  94. }
  95. defer func() {
  96. if err := db.Close(); err != nil {
  97. log.Error("close db '%s': %v", dbPath, err)
  98. }
  99. }()
  100. // Read headers
  101. rec, err := r.Read()
  102. if err != nil && err != io.EOF {
  103. log.Fatal("read csv '%s': %v", csvPath, err)
  104. }
  105. // Prepare args with timestamp, as necessary.
  106. argsLen := len(rec)
  107. if WithTimestamp {
  108. argsLen++
  109. }
  110. args := make([]any, argsLen)
  111. if WithTimestamp {
  112. args[0] = any(FormatRFC3339Subsec(time.Now().UTC()))
  113. }
  114. // create table, prepare insert
  115. var insert *sql.Stmt
  116. headers := make([]string, 0, argsLen)
  117. if WithTimestamp {
  118. headers = append(headers, "timestamp")
  119. }
  120. for _, f := range rec {
  121. f = scrubName(f)
  122. headers = append(headers, f)
  123. }
  124. _, err = createTable(ctx, db, name, headers)
  125. if err != nil {
  126. log.Fatal("create table '%s': %v", name, err)
  127. }
  128. insert, err = genInsert(ctx, db, name, headers)
  129. if err != nil {
  130. log.Fatal("prepare insert: %v", err)
  131. }
  132. defer func() {
  133. if err := insert.Close(); err != nil {
  134. log.Error("close prepared insert: %v", err)
  135. }
  136. }()
  137. // insert records
  138. tx, err := db.BeginTx(ctx, nil)
  139. if err != nil {
  140. log.Fatal("begin transaction: %v", err)
  141. }
  142. nonUnique := 0
  143. i := 0
  144. for {
  145. i++
  146. if i >= MaxRecords {
  147. panic(fmt.Sprintf("BUG: max record limit reached: %d", MaxRecords))
  148. }
  149. if ctx.Err() != nil {
  150. break
  151. }
  152. // Print count for every thousand-ish records
  153. if i&4095 == 0 {
  154. printStatus(i)
  155. }
  156. if i&65535 == 0 {
  157. if err := tx.Commit(); err != nil {
  158. log.Fatal("commit transaction: %v", err)
  159. }
  160. tx, err = db.BeginTx(ctx, nil)
  161. if err != nil {
  162. log.Fatal("begin transaction: %v", err)
  163. }
  164. }
  165. rec, err := r.Read()
  166. if err != nil {
  167. fmt.Println()
  168. if err == io.EOF {
  169. fmt.Fprintf(os.Stderr, "read %d records\n", i)
  170. fmt.Fprintf(os.Stderr, "skipped %d redundant records\n", nonUnique)
  171. } else {
  172. log.Error("read csv '%s': %v", csvPath, err)
  173. }
  174. break
  175. }
  176. // Rather slow, doing this for every record, but since
  177. // we cannot know how many fields will exist ahead of
  178. // time (we compile *now*, not after we've read the
  179. // headers), we cannot simply enter, say, 27 arguments,
  180. // each converted to `any`, by hand. It may be faster to
  181. // simply print the SQL statements and pipe them to
  182. // sqlite. Should check this.
  183. recArgs := args[:]
  184. if WithTimestamp {
  185. // Skip over timestamp
  186. recArgs = args[1:]
  187. }
  188. for j := 0; j < len(recArgs); j++ {
  189. recArgs[j] = any(rec[j])
  190. }
  191. // For explanation of extended codes, see https://www.sqlite.org/rescode.html
  192. _, err = insert.ExecContext(ctx, args...)
  193. if err != nil {
  194. var sqliteErr sqlite3.Error
  195. if errors.As(err, &sqliteErr) &&
  196. sqliteErr.ExtendedCode == SQLITE_CONSTRAINT_PRIMARYKEY {
  197. nonUnique++
  198. log.Debug("insert record '%#v': %v", rec, err)
  199. } else {
  200. log.Fatal("insert record '%#v': %v", rec, err)
  201. }
  202. }
  203. }
  204. if err := tx.Commit(); err != nil {
  205. log.Fatal("commit transaction: %v", err)
  206. }
  207. }
  208. func printStatus(n int) {
  209. if term.IsTerminal(int(os.Stdout.Fd())) {
  210. fmt.Printf("\x1b[%dG", 1)
  211. fmt.Printf("%d", n)
  212. }
  213. }
  214. func genInsert(
  215. ctx context.Context,
  216. db *sql.DB,
  217. name string,
  218. headers []string,
  219. ) (*sql.Stmt, error) {
  220. log := ctx.Value("log").(*logger.Logger)
  221. var b strings.Builder
  222. b.WriteString(fmt.Sprintf("INSERT INTO %s (", name))
  223. for i, h := range headers {
  224. if i != 0 {
  225. b.WriteString(", ")
  226. }
  227. b.WriteString(h)
  228. }
  229. b.WriteString(") VALUES (")
  230. for i := 0; i < len(headers); i++ {
  231. if i != 0 {
  232. b.WriteString(", ")
  233. }
  234. b.WriteString(fmt.Sprintf("$%d", i+1))
  235. }
  236. b.WriteString(");")
  237. log.Debug("debug: prepare insert: %s\n", b.String())
  238. return db.PrepareContext(ctx, b.String())
  239. }
  240. func createTable(
  241. ctx context.Context,
  242. db *sql.DB,
  243. name string,
  244. headers []string,
  245. ) (sql.Result, error) {
  246. var b strings.Builder
  247. b.WriteString(
  248. fmt.Sprintf("CREATE TABLE IF NOT EXISTS %s (", name),
  249. )
  250. for i, h := range headers {
  251. if i != 0 {
  252. b.WriteString(", ")
  253. }
  254. b.WriteString(fmt.Sprintf("%s text", h))
  255. }
  256. b.WriteString(");")
  257. fmt.Fprintf(os.Stderr, "%s\n", b.String())
  258. return db.ExecContext(ctx, b.String())
  259. }
  260. var badRunes = regexp.MustCompile(`[^a-zA-Z0-9_\-\.]+`)
  261. var badFirst = regexp.MustCompile(`^[^a-zA-Z]+`)
  262. var parens = regexp.MustCompile(`[\(\)]`)
  263. var spaces = regexp.MustCompile(`\s+`)
  264. var hyphens = regexp.MustCompile(`\-+`)
  265. var unders = regexp.MustCompile(`_+`)
  266. func scrubName(s string) string {
  267. s = strings.ToLower(s)
  268. s = badFirst.ReplaceAllLiteralString(s, "")
  269. s = parens.ReplaceAllLiteralString(s, "")
  270. s = strings.TrimSpace(s)
  271. s = spaces.ReplaceAllLiteralString(s, "_")
  272. s = badRunes.ReplaceAllLiteralString(s, "_")
  273. s = hyphens.ReplaceAllLiteralString(s, "_")
  274. s = unders.ReplaceAllLiteralString(s, "_")
  275. s = strings.TrimSuffix(s, "_")
  276. return fmt.Sprintf("'%s'", s)
  277. }
  278. func FormatRFC3339Subsec(t time.Time) string {
  279. // RFC3339Nano = "2006-01-02T15:04:05.999999999Z07:00"
  280. rfc3339 := t.Format(time.RFC3339Nano)
  281. datetime := rfc3339[:19]
  282. nano := rfc3339[20:23] // Just ms
  283. return fmt.Sprintf(
  284. "%s.%sZ",
  285. datetime,
  286. nano,
  287. )
  288. }