123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239 |
- package main
- import (
- "context"
- "database/sql"
- "encoding/csv"
- "fmt"
- "io"
- "log"
- "os"
- "os/signal"
- "path/filepath"
- "regexp"
- "strings"
- _ "github.com/mattn/go-sqlite3"
- "golang.org/x/term"
- )
- var MaxRecords = 1_000_000_000
- func main() {
- if len(os.Args) != 2 {
- log.Printf("usage: %s <csv_path>", os.Args[0])
- os.Exit(1)
- }
- csvPath, err := filepath.Abs(os.Args[1])
- if err != nil {
- log.Fatal(fmt.Sprintf("resolve absolute path for '%s': %v", os.Args[1], err))
- }
- ctx, cancel := context.WithCancel(context.Background())
- go func() {
- _, stop := signal.NotifyContext(
- ctx,
- os.Interrupt,
- os.Kill,
- )
- sig := make(chan os.Signal, 1)
- <-sig
- stop()
- log.Print("caught signal; shutting down")
- cancel()
- }()
- // open csv for read
- f, err := os.Open(os.Args[1])
- if err != nil {
- log.Fatal(fmt.Sprintf("open csv: %v", err))
- }
- defer func() {
- if err := f.Close(); err != nil {
- log.Printf("close csv '%s': %v", csvPath, err)
- }
- }()
- r := csv.NewReader(f)
- r.ReuseRecord = true
- // open db
- tmp := strings.TrimSuffix(csvPath, ".csv")
- dbPath := tmp + ".db"
- name := scrubName(filepath.Base(tmp))
- // no need to rollback or be crash resistant
- params := "?_synchronous=0&_journal_mode=OFF&_temp_store=2"
- db, err := sql.Open("sqlite3", dbPath+params)
- if err != nil {
- log.Fatal(fmt.Sprintf("open db '%s': %v", dbPath, err))
- }
- defer func() {
- if err := db.Close(); err != nil {
- log.Printf("close db '%s': %v", dbPath, err)
- }
- }()
- // create table, prepare insert
- var insert *sql.Stmt
- headers := make([]string, 0, 32)
- rec, err := r.Read()
- if err != nil && err != io.EOF {
- log.Fatal(fmt.Sprintf("read csv '%s': %v", csvPath, err))
- }
- for _, f := range rec {
- f = scrubName(f)
- headers = append(headers, f)
- }
- _, err = createTable(ctx, db, name, headers)
- if err != nil {
- log.Fatal(fmt.Sprintf("create table '%s': %v", name, err))
- }
- insert, err = genInsert(ctx, db, name, headers)
- if err != nil {
- log.Fatal(fmt.Sprintf("prepare insert: %v", err))
- }
- defer func() {
- if err := insert.Close(); err != nil {
- log.Printf("close prepared insert: %v", err)
- }
- }()
- // insert records
- tx, err := db.BeginTx(ctx, nil)
- if err != nil {
- log.Fatal(fmt.Sprintf("begin transaction: %v", err))
- }
- i := 0
- for {
- i++
- if i >= MaxRecords {
- panic(fmt.Sprintf("BUG: max record limit reached: %d", MaxRecords))
- }
- if ctx.Err() != nil {
- break
- }
- // Print count for every thousand-ish records
- if i&4095 == 0 {
- printStatus(i)
- }
- if i&65535 == 0 {
- if err := tx.Commit(); err != nil {
- log.Fatal(fmt.Printf("commit transaction: %v", err))
- }
- tx, err = db.BeginTx(ctx, nil)
- if err != nil {
- log.Fatal(fmt.Sprintf("begin transaction: %v", err))
- }
- }
- rec, err := r.Read()
- if err != nil {
- fmt.Println()
- if err == io.EOF {
- log.Printf("read %d records", i)
- } else {
- log.Printf("read csv '%s': %v", csvPath, err)
- }
- break
- }
- // Rather slow, doing this for every record, but since
- // we cannot know how many fields will exist ahead of
- // time (we compile *now*, not after we've read the
- // headers), we cannot simply enter, say, 27 arguments,
- // each converted to `any`, by hand. It may be faster to
- // simply print the SQL statements and pipe them to
- // sqlite. Should check this.
- args := make([]any, len(rec))
- for j := 0; j < len(rec); j++ {
- args[j] = any(rec[j])
- }
- _, err = insert.ExecContext(ctx, args...)
- if err != nil {
- log.Fatal(fmt.Sprintf("insert record '%#v': %v", rec, err))
- }
- }
- if err := tx.Commit(); err != nil {
- log.Fatal(fmt.Sprintf("commit transaction: %v", err))
- }
- }
- func printStatus(n int) {
- if term.IsTerminal(int(os.Stdout.Fd())) {
- fmt.Printf("\x1b[%dG", 1)
- fmt.Printf("%d", n)
- }
- }
- func genInsert(
- ctx context.Context,
- db *sql.DB,
- name string,
- headers []string,
- ) (*sql.Stmt, error) {
- var b strings.Builder
- b.WriteString(fmt.Sprintf("INSERT INTO %s (", name))
- for i, h := range headers {
- if i != 0 {
- b.WriteString(", ")
- }
- b.WriteString(h)
- }
- b.WriteString(") VALUES (")
- for i := 0; i < len(headers); i++ {
- if i != 0 {
- b.WriteString(", ")
- }
- b.WriteString(fmt.Sprintf("$%d", i+1))
- }
- b.WriteString(");")
- log.Printf("debug: prepare insert: %s\n", b.String())
- return db.PrepareContext(ctx, b.String())
- }
- func createTable(
- ctx context.Context,
- db *sql.DB,
- name string,
- headers []string,
- ) (sql.Result, error) {
- var b strings.Builder
- b.WriteString(
- fmt.Sprintf("CREATE TABLE IF NOT EXISTS %s (", name),
- )
- for i, h := range headers {
- if i != 0 {
- b.WriteString(", ")
- }
- b.WriteString(fmt.Sprintf("%s text", h))
- }
- b.WriteString(");")
- return db.ExecContext(ctx, b.String())
- }
- var badRunes = regexp.MustCompile(`[^a-zA-Z0-9_\-\.]+`)
- var badFirst = regexp.MustCompile(`^[^a-zA-Z]+`)
- var parens = regexp.MustCompile(`\([^\)]*\)`)
- var spaces = regexp.MustCompile(`\s+`)
- var hyphens = regexp.MustCompile(`\-+`)
- var unders = regexp.MustCompile(`_+`)
- func scrubName(s string) string {
- s = strings.ToLower(s)
- s = badFirst.ReplaceAllLiteralString(s, "")
- s = parens.ReplaceAllLiteralString(s, "")
- s = strings.TrimSpace(s)
- s = spaces.ReplaceAllLiteralString(s, "_")
- s = badRunes.ReplaceAllLiteralString(s, "_")
- s = hyphens.ReplaceAllLiteralString(s, "_")
- s = unders.ReplaceAllLiteralString(s, "_")
- s = strings.TrimSuffix(s, "_")
- return s
- }
|