csv_to_sql_table.awk (7067B)
#!/bin/awk
function push(arr, value) {
arr[length(arr)+1] = value;
}
function strip(s) {
sub(/^ */, "", s);
sub(/ *$/, "", s);
return s;
}
function is_integer(s) {
return s ~ /^[-+]?[0-9]+$/;
}
function is_date(s) {
return s ~ /^((([0-9]{4})(-)(0[13578]|10|12)(-)(0[1-9]|[12][0-9]|3[01]))|(([0-9]{4})(-)(0[469]|11)(-)(0[1-9]|[12][0-9]|30))|(([0-9]{4})(-)(02)(-)(0[1-9]|[12][0-9]|2[0-8]))|(([02468][048]00)(-)(02)(-)(29))|(([13579][26]00)(-)(02)(-)(29))|(([0-9][0-9][0][48])(-)(02)(-)(29))|(([0-9][0-9][2468][048])(-)(02)(-)(29))|(([0-9][0-9][13579][26])(-)(02)(-)(29)))$/;
}
function is_datetime(s) {
return s ~ /^((([0-9]{4})(-)(0[13578]|10|12)(-)(0[1-9]|[12][0-9]|3[01]))|(([0-9]{4})(-)(0[469]|11)(-)(0[1-9]|[12][0-9]|30))|(([0-9]{4})(-)(02)(-)(0[1-9]|[12][0-9]|2[0-8]))|(([02468][048]00)(-)(02)(-)(29))|(([13579][26]00)(-)(02)(-)(29))|(([0-9][0-9][0][48])(-)(02)(-)(29))|(([0-9][0-9][2468][048])(-)(02)(-)(29))|(([0-9][0-9][13579][26])(-)(02)(-)(29)))(\s)(([0-1][0-9]|2[0-4]):([0-5][0-9]):([0-5][0-9]))$/;
}
function is_boolean(s) {
return s ~ /^(true|false|TRUE|FALSE|True|False|0|1)$/;
}
function start_insert(tbl_name, h_line) {
if (csv_headers == 1) {
print("INSERT INTO " tbl_name " (" h_line ") VALUES");
} else {
print("INSERT INTO " tbl_name " VALUES");
}
}
function infer_type(col, type, val) {
# Type hierarchy: INTEGER -> DATE -> DATETIME -> BOOLEAN -> TEXT
# Once we see TEXT, it stays TEXT
if (type == "TEXT") {
return "TEXT";
}
val = strip(val);
if (val == "") {
return type;
}
if (type == "") {
# First non-empty value
if (is_integer(val)) {
return "INTEGER";
}
if (is_boolean(val)) {
return "BOOLEAN";
}
if (is_date(val)) {
return "DATE";
}
if (is_datetime(val)) {
return "DATETIME";
}
return "TEXT";
}
if (type == "INTEGER") {
if (is_integer(val)) {
return "INTEGER";
}
return "TEXT";
}
if (type == "DATE") {
if(is_date(val)) {
return "DATE";
}
if (is_datetime(val)) {
return "DATETIME";
}
}
if (type == "DATETIME") {
if (is_datetime(val)) {
return "DATETIME";
}
}
if (type == "BOOLEAN") {
if (is_boolean(val)) {
return "BOOLEAN";
}
if (is_integer(val)) {
return "INTEGER";
}
return "TEXT";
}
return "TEXT";
}
function emit_table(tbl_idx) {
if (named_table == 1) {
tbl_name = table;
} else {
tbl_name = "tbl"tbl_idx;
}
print("CREATE TABLE IF NOT EXISTS "tbl_name" (");
for (i = 1; i <= max_cols; i++) {
col_name = headers[i];
col_type = col_types[i];
# If we don't know what it is, assume it's some kind of text
if (col_type == "") {
col_type = "TEXT";
}
# MariaDB calls booleans "TINYINT(1)"
if (col_type == "BOOLEAN") {
col_type = "TINYINT(1)";
}
# INTEGER -> INT (but decided to rename it only here for code clarity)
if (col_type == "INTEGER") {
col_type = "INT";
}
printf(" %s %s%s%s\n", col_name, col_type, i == 1 ? " PRIMARY KEY" : "", i == max_cols ? "" : ",");
}
print ");\n";
if (csv_headers == 1) {
csv_row = 2;
h_line = "";
for (x=1; x <= max_cols; x++) {
if(!(h_line ~ /^$/)) {
h_line = h_line","data[1,x];
} else {
h_line = data[1,x];
}
}
} else {
csv_row = 1;
}
start_insert(tbl_name,h_line);
for (row = csv_row; row <= data_rows; row++) {
if (batch_row > 0) {
printf(",\n");
}
printf("(");
for (col = 1; col <= max_cols; col++) {
val = data[row, col];
val = strip(val);
if (val == "") {
printf("NULL");
} else {
escaped = val;
gsub(/'/, "''", escaped);
if (col_types[col] == "INTEGER" || col_types[col] == "BOOLEAN") {
printf("%s", val);
} else {
printf("'%s'", escaped);
}
}
printf("%s", col == max_cols ? "" : ", ");
}
printf(")");
batch_row++;
# When batch is full, or last row reached, print a ;
if (batch_row == batch_size || row == data_rows) {
printf(";\n\n");
batch_row = 0;
# Start a new INSERT if not done yet
if (row < data_rows) {
print "SELECT SLEEP(1);"
start_insert(tbl_name,h_line);
}
}
}
print "";
}
function reset_state() {
delete headers;
delete col_types;
delete data;
data_rows = 0;
max_cols = 0;
has_header = 0;
}
BEGIN {
FS = ",";
table_index = 0;
batch_size = 1000;
batch_row = 0;
reset_state();
if(csv_headers == "") {csv_headers = 1;}
if(table == "") {named_table = 0;} else {named_table = 1;}
}
{
if (!(/^$/)) {
# Store the row
data_rows++;
for (i = 1; i <= NF; i++) {
data[data_rows, i] = $i;
if (i > max_cols) {
max_cols = i;
}
}
# If this is the first row, check if it looks like a header
if (data_rows == 1) {
has_header = 1;
for (i = 1; i <= NF; i++) {
val = strip($i);
# If any field looks numeric, probably not a header
if (is_integer(val)) {
has_header = 0;
break;
}
}
}
# Set up headers and infer types
if (has_header && data_rows == 1) {
# Use first row as headers
for (i = 1; i <= NF+1; i++) {
h = strip($i);
gsub(/[^a-zA-Z0-9_]/, "_", h);
# In case last header is missing
if ((h ~ /^$/) || (h ~ /^[0-9]/) || (length(h) == 0)) {
h = "synth_col";
}
push(headers, h);
}
} else {
# Generate generic headers if needed
if (data_rows == 1 && !has_header) {
for (i = 1; i <= max_cols; i++) {
headers[i] = "col" i;
}
}
# Infer types from this data row
start_row = has_header ? 2 : 1;
if (data_rows >= start_row) {
for (i = 1; i <= NF; i++) {
col_types[i] = infer_type(i, col_types[i], $i);
}
}
}
}
}
END {
# Output the final table if we have anything
if (data_rows > 0) {
emit_table(++table_index);
}
}