package main import ( "context" "database/sql" "encoding/csv" "fmt" "io" "log" "os" "os/signal" "path/filepath" "regexp" "strings" _ "github.com/mattn/go-sqlite3" "golang.org/x/term" ) var MaxRecords = 1_000_000_000 func main() { if len(os.Args) != 2 { log.Printf("usage: %s ", os.Args[0]) os.Exit(1) } csvPath, err := filepath.Abs(os.Args[1]) if err != nil { log.Fatal(fmt.Sprintf("resolve absolute path for '%s': %v", os.Args[1], err)) } ctx, cancel := context.WithCancel(context.Background()) go func() { _, stop := signal.NotifyContext( ctx, os.Interrupt, os.Kill, ) sig := make(chan os.Signal, 1) <-sig stop() log.Print("caught signal; shutting down") cancel() }() // open csv for read f, err := os.Open(os.Args[1]) if err != nil { log.Fatal(fmt.Sprintf("open csv: %v", err)) } defer func() { if err := f.Close(); err != nil { log.Printf("close csv '%s': %v", csvPath, err) } }() r := csv.NewReader(f) r.ReuseRecord = true // open db tmp := strings.TrimSuffix(csvPath, ".csv") dbPath := tmp + ".db" name := scrubName(filepath.Base(tmp)) // no need to rollback or be crash resistant params := "?_synchronous=0&_journal_mode=OFF&_temp_store=2" db, err := sql.Open("sqlite3", dbPath+params) if err != nil { log.Fatal(fmt.Sprintf("open db '%s': %v", dbPath, err)) } defer func() { if err := db.Close(); err != nil { log.Printf("close db '%s': %v", dbPath, err) } }() // create table, prepare insert var insert *sql.Stmt headers := make([]string, 0, 32) rec, err := r.Read() if err != nil && err != io.EOF { log.Fatal(fmt.Sprintf("read csv '%s': %v", csvPath, err)) } for _, f := range rec { f = scrubName(f) headers = append(headers, f) } _, err = createTable(ctx, db, name, headers) if err != nil { log.Fatal(fmt.Sprintf("create table '%s': %v", name, err)) } insert, err = genInsert(ctx, db, name, headers) if err != nil { log.Fatal(fmt.Sprintf("prepare insert: %v", err)) } defer func() { if err := insert.Close(); err != nil { log.Printf("close prepared insert: %v", err) } }() // insert records tx, err := db.BeginTx(ctx, nil) if err != nil { log.Fatal(fmt.Sprintf("begin transaction: %v", err)) } i := 0 for { i++ if i >= MaxRecords { panic(fmt.Sprintf("BUG: max record limit reached: %d", MaxRecords)) } if ctx.Err() != nil { break } // Print count for every thousand-ish records if i&4095 == 0 { printStatus(i) } if i&65535 == 0 { if err := tx.Commit(); err != nil { log.Fatal(fmt.Printf("commit transaction: %v", err)) } tx, err = db.BeginTx(ctx, nil) if err != nil { log.Fatal(fmt.Sprintf("begin transaction: %v", err)) } } rec, err := r.Read() if err != nil { fmt.Println() if err == io.EOF { log.Printf("read %d records", i) } else { log.Printf("read csv '%s': %v", csvPath, err) } break } // Rather slow, doing this for every record, but since // we cannot know how many fields will exist ahead of // time (we compile *now*, not after we've read the // headers), we cannot simply enter, say, 27 arguments, // each converted to `any`, by hand. It may be faster to // simply print the SQL statements and pipe them to // sqlite. Should check this. args := make([]any, len(rec)) for j := 0; j < len(rec); j++ { args[j] = any(rec[j]) } _, err = insert.ExecContext(ctx, args...) if err != nil { log.Fatal(fmt.Sprintf("insert record '%#v': %v", rec, err)) } } if err := tx.Commit(); err != nil { log.Fatal(fmt.Sprintf("commit transaction: %v", err)) } } func printStatus(n int) { if term.IsTerminal(int(os.Stdout.Fd())) { fmt.Printf("\x1b[%dG", 1) fmt.Printf("%d", n) } } func genInsert( ctx context.Context, db *sql.DB, name string, headers []string, ) (*sql.Stmt, error) { var b strings.Builder b.WriteString(fmt.Sprintf("INSERT INTO %s (", name)) for i, h := range headers { if i != 0 { b.WriteString(", ") } b.WriteString(h) } b.WriteString(") VALUES (") for i := 0; i < len(headers); i++ { if i != 0 { b.WriteString(", ") } b.WriteString(fmt.Sprintf("$%d", i+1)) } b.WriteString(");") log.Printf("debug: prepare insert: %s\n", b.String()) return db.PrepareContext(ctx, b.String()) } func createTable( ctx context.Context, db *sql.DB, name string, headers []string, ) (sql.Result, error) { var b strings.Builder b.WriteString( fmt.Sprintf("CREATE TABLE IF NOT EXISTS %s (", name), ) for i, h := range headers { if i != 0 { b.WriteString(", ") } b.WriteString(fmt.Sprintf("%s text", h)) } b.WriteString(");") return db.ExecContext(ctx, b.String()) } var badRunes = regexp.MustCompile(`[^a-zA-Z0-9_\-\.]+`) var badFirst = regexp.MustCompile(`^[^a-zA-Z]+`) var parens = regexp.MustCompile(`\([^\)]*\)`) var spaces = regexp.MustCompile(`\s+`) var hyphens = regexp.MustCompile(`\-+`) var unders = regexp.MustCompile(`_+`) func scrubName(s string) string { s = strings.ToLower(s) s = badFirst.ReplaceAllLiteralString(s, "") s = parens.ReplaceAllLiteralString(s, "") s = strings.TrimSpace(s) s = spaces.ReplaceAllLiteralString(s, "_") s = badRunes.ReplaceAllLiteralString(s, "_") s = hyphens.ReplaceAllLiteralString(s, "_") s = unders.ReplaceAllLiteralString(s, "_") s = strings.TrimSuffix(s, "_") return s }