Back (Current repo: scraps)

random scraps and notes that are useful to me
To clone this repository:
git clone https://git.viktor1993.net/scraps.git
Log | Download | Files | Refs

csv_to_sql_table.awk (7067B)


#!/bin/awk

function push(arr, value) {
    arr[length(arr)+1] = value;
}

function strip(s) {
    sub(/^ */, "", s);
    sub(/ *$/, "", s);
    return s;
}

function is_integer(s) {
    return s ~ /^[-+]?[0-9]+$/;
}

function is_date(s) {
    return s ~ /^((([0-9]{4})(-)(0[13578]|10|12)(-)(0[1-9]|[12][0-9]|3[01]))|(([0-9]{4})(-)(0[469]|11)(-)(0[1-9]|[12][0-9]|30))|(([0-9]{4})(-)(02)(-)(0[1-9]|[12][0-9]|2[0-8]))|(([02468][048]00)(-)(02)(-)(29))|(([13579][26]00)(-)(02)(-)(29))|(([0-9][0-9][0][48])(-)(02)(-)(29))|(([0-9][0-9][2468][048])(-)(02)(-)(29))|(([0-9][0-9][13579][26])(-)(02)(-)(29)))$/;
}

function is_datetime(s) {
    return s ~ /^((([0-9]{4})(-)(0[13578]|10|12)(-)(0[1-9]|[12][0-9]|3[01]))|(([0-9]{4})(-)(0[469]|11)(-)(0[1-9]|[12][0-9]|30))|(([0-9]{4})(-)(02)(-)(0[1-9]|[12][0-9]|2[0-8]))|(([02468][048]00)(-)(02)(-)(29))|(([13579][26]00)(-)(02)(-)(29))|(([0-9][0-9][0][48])(-)(02)(-)(29))|(([0-9][0-9][2468][048])(-)(02)(-)(29))|(([0-9][0-9][13579][26])(-)(02)(-)(29)))(\s)(([0-1][0-9]|2[0-4]):([0-5][0-9]):([0-5][0-9]))$/;
}

function is_boolean(s) {
    return s ~ /^(true|false|TRUE|FALSE|True|False|0|1)$/;
}

function start_insert(tbl_name, h_line) {
    if (csv_headers == 1) {
        print("INSERT INTO " tbl_name " (" h_line ") VALUES");
    } else {
        print("INSERT INTO " tbl_name " VALUES");
    }
}

function infer_type(col, type, val) {
    # Type hierarchy: INTEGER -> DATE -> DATETIME -> BOOLEAN -> TEXT
    # Once we see TEXT, it stays TEXT
    if (type == "TEXT") {
        return "TEXT";
    }

    val = strip(val);
    if (val == "") {
        return type;
    }

    if (type == "") {
        # First non-empty value
        if (is_integer(val)) {
            return "INTEGER";
        }
        if (is_boolean(val)) {
            return "BOOLEAN";
        }
        if (is_date(val)) {
            return "DATE";
        }
        if (is_datetime(val)) {
            return "DATETIME";
        }
        return "TEXT";
    }

    if (type == "INTEGER") {
        if (is_integer(val)) {
            return "INTEGER";
        }
        return "TEXT";
    }

    if (type == "DATE") {
        if(is_date(val)) {
            return "DATE";
        }
        if (is_datetime(val)) {
            return "DATETIME";
        }
    }

    if (type == "DATETIME") {
        if (is_datetime(val)) {
            return "DATETIME";
        }
    }

    if (type == "BOOLEAN") {
        if (is_boolean(val)) {
            return "BOOLEAN";
        }
        if (is_integer(val)) {
            return "INTEGER";
        }
        return "TEXT";
    }
    return "TEXT";
}

function emit_table(tbl_idx) {
    if (named_table == 1) {
        tbl_name = table;
    } else {
        tbl_name = "tbl"tbl_idx;
    }
    print("CREATE TABLE IF NOT EXISTS "tbl_name" (");
    for (i = 1; i <= max_cols; i++) {
        col_name = headers[i];
        col_type = col_types[i];

        # If we don't know what it is, assume it's some kind of text
        if (col_type == "") {
            col_type = "TEXT";
        }

        # MariaDB calls booleans "TINYINT(1)"
        if (col_type == "BOOLEAN") {
            col_type = "TINYINT(1)";
        }
        # INTEGER -> INT (but decided to rename it only here for code clarity)
        if (col_type == "INTEGER") {
            col_type = "INT";
        }

        printf("  %s %s%s%s\n", col_name, col_type, i == 1 ? " PRIMARY KEY" : "", i == max_cols ? "" : ",");
    }
    print ");\n";

    if (csv_headers == 1) {
        csv_row = 2;
        h_line = "";
        for (x=1; x <= max_cols; x++) {
            if(!(h_line ~ /^$/)) {
                h_line = h_line","data[1,x];
            } else {
                h_line = data[1,x];
            }
        }
    } else {
        csv_row = 1;
    }

    start_insert(tbl_name,h_line);

    for (row = csv_row; row <= data_rows; row++) {
        if (batch_row > 0) {
            printf(",\n");
        }

        printf("(");
        for (col = 1; col <= max_cols; col++) {
            val = data[row, col];
            val = strip(val);

            if (val == "") {
                printf("NULL");
            } else {
                escaped = val;
                gsub(/'/, "''", escaped);
                if (col_types[col] == "INTEGER" || col_types[col] == "BOOLEAN") {
                    printf("%s", val);
                } else {
                    printf("'%s'", escaped);
                }
            }
            printf("%s", col == max_cols ? "" : ", ");
        }
        printf(")");

        batch_row++;

        # When batch is full, or last row reached, print a ;
        if (batch_row == batch_size || row == data_rows) {
            printf(";\n\n");
            batch_row = 0;

            # Start a new INSERT if not done yet
            if (row < data_rows) {
                print "SELECT SLEEP(1);"
                start_insert(tbl_name,h_line);
            }
        }
    }

    print "";


}

function reset_state() {
    delete headers;
    delete col_types;
    delete data;
    data_rows = 0;
    max_cols = 0;
    has_header = 0;
}

BEGIN {
    FS = ",";
    table_index = 0;
    batch_size = 1000;
    batch_row = 0;
    reset_state();
    if(csv_headers == "") {csv_headers = 1;}
    if(table == "") {named_table = 0;} else {named_table = 1;}
}
{
    if (!(/^$/)) {
        # Store the row
        data_rows++;
        for (i = 1; i <= NF; i++) {
            data[data_rows, i] = $i;
            if (i > max_cols) {
                max_cols = i;
            }
        }

        # If this is the first row, check if it looks like a header
        if (data_rows == 1) {
            has_header = 1;
            for (i = 1; i <= NF; i++) {
                val = strip($i);
                # If any field looks numeric, probably not a header
                if (is_integer(val)) {
                    has_header = 0;
                    break;
                }
            }
        }

        # Set up headers and infer types
        if (has_header && data_rows == 1) {
            # Use first row as headers
            for (i = 1; i <= NF+1; i++) {
                h = strip($i);
                gsub(/[^a-zA-Z0-9_]/, "_", h);
                # In case last header is missing
                if ((h ~ /^$/) || (h ~ /^[0-9]/) || (length(h) == 0)) {
                    h = "synth_col";
                }
                push(headers, h);
            }
        } else {
            # Generate generic headers if needed
            if (data_rows == 1 && !has_header) {
                for (i = 1; i <= max_cols; i++) {
                    headers[i] = "col" i;
                }
            }

            # Infer types from this data row
            start_row = has_header ? 2 : 1;
            if (data_rows >= start_row) {
                for (i = 1; i <= NF; i++) {
                    col_types[i] = infer_type(i, col_types[i], $i);
                }
            }
        }
    }
}
END {
    # Output the final table if we have anything
    if (data_rows > 0) {
        emit_table(++table_index);
    }
}