Current repo: scraps

random scraps and notes that are useful to me
To clone this repository:
git clone https://git.viktor1993.net/scraps.git

commit c9dc60e53463fa292c47b54f79b18993a5e53ecd
Author: root <root>
Date:   Mon,  3 Nov 2025 19:12:37 +0100

reset git history to keep stagit sane

Diffstat:
Aawk/compare_single_columns.awk | 24++++++++++++++++++++++++
Aawk/csv_to_sql_table.awk | 267+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aawk/fan_out_chunks.awk | 44++++++++++++++++++++++++++++++++++++++++++++
Aawk/gen_csv_data.awk | 48++++++++++++++++++++++++++++++++++++++++++++++++
Aawk/gen_multi_table.awk | 101+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aawk/gen_multi_table_with_dates.awk | 118+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aawk/group_by_column.awk | 36++++++++++++++++++++++++++++++++++++
Aawk/group_by_column_regex.awk | 39+++++++++++++++++++++++++++++++++++++++
Aawk/reorganize_groups.awk | 82+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aawk/sum_numeric_cols.awk | 34++++++++++++++++++++++++++++++++++
Abash/funcs/daily_range_partition.sh | 56++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Abash/funcs/get_memory_usage.sh | 13+++++++++++++
Abash/funcs/get_subprocesses.sh | 29+++++++++++++++++++++++++++++
Abash/funcs/git_list_unmerged_branches.sh | 55+++++++++++++++++++++++++++++++++++++++++++++++++++++++
Anotes/sql/TODO | 55+++++++++++++++++++++++++++++++++++++++++++++++++++++++
Anotes/sql/binary_and_relay_logs.txt | 19+++++++++++++++++++
Anotes/sql/clustered_vs_nonclustered_index.txt | 35+++++++++++++++++++++++++++++++++++
Anotes/sql/cte_vs_subquery.txt | 5+++++
Anotes/sql/index_not_being_used_reasons_checklist.txt | 7+++++++
Anotes/sql/inner_vs_left_vs_outer_join.txt | 9+++++++++
Anotes/sql/innodb_vs_myisam.txt | 7+++++++
Anotes/sql/isolation_levels.txt | 44++++++++++++++++++++++++++++++++++++++++++++
Anotes/sql/replica_lag_troubleshoot_checklist.txt | 28++++++++++++++++++++++++++++
Anotes/sql/seconds_behind_master.txt | 11+++++++++++
Anotes/sql/sequences_in_mariadb.txt | 12++++++++++++
Anotes/sql/var_lib_mysql_disk_space_filling_up.txt | 13+++++++++++++
Anotes/sql/view_vs_materialized_view.txt | 16++++++++++++++++
Anotes/sql/virtual_columns.txt | 5+++++
Anotes/sql/what_is_foreign_key.txt | 3+++
Anotes/sql/where_vs_having.txt | 3+++
Aperl/git_consistency_check.pl | 231+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aphp/snippets/call_private_function_outside_class.php | 43+++++++++++++++++++++++++++++++++++++++++++
Aphp/snippets/remove_emoji.php | 5+++++
Apractice/hackerrank/sql/print_prime_numbers.md | 41+++++++++++++++++++++++++++++++++++++++++
Apractice/hackerrank/sql/the_report.md | 106+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Ased/find_in_paragraph.sh | 26++++++++++++++++++++++++++
Ased/flatten_newlines.sh | 18++++++++++++++++++
Ased/replace_match_and_next2.sh | 33+++++++++++++++++++++++++++++++++
Asql/snippets/change_collation_all_tables_and_columns.sql | 1+
Asql/snippets/check_sp_privs.sql | 1+
Asql/snippets/compare_schema_struct.sql | 65+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asql/snippets/copy_schema_struct.sql | 73+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asql/snippets/db_logging.sql | 29+++++++++++++++++++++++++++++
Asql/snippets/find_max_date_per_group.sh | 17+++++++++++++++++
Asql/snippets/insert_range_of_integers.sql | 8++++++++
Asql/snippets/locking_status_monitor.sql | 14++++++++++++++
Asql/snippets/read_binlog.sh | 1+
Asql/snippets/table_sizes.sql | 4++++
Asql/snippets/trim_table_from_old_data.sql | 22++++++++++++++++++++++
Asql/snippets/update_ranks.sql | 11+++++++++++
50 files changed, 1967 insertions(+), 0 deletions(-)

diff --git a/awk/compare_single_columns.awk b/awk/compare_single_columns.awk @@ -0,0 +1,24 @@ +#!/usr/bin/awk +#USAGE: awk -f compare_single_columns.awk <file1> <file2> +BEGIN{ + print "Only in "ARGV[2]":"; +} +(NR==FNR) { + a[$0]++;next +} +{ + if($0 in a) { + a[$0]="-1" + } else { + print $0 + } +} + +END{ + print "Only in "ARGV[1]":"; + for(x in a) { + if(a[x]!="-1") { + print x; + } + } +} diff --git a/awk/csv_to_sql_table.awk b/awk/csv_to_sql_table.awk @@ -0,0 +1,267 @@ +#!/bin/awk + +function push(arr, value) { + arr[length(arr)+1] = value; +} + +function strip(s) { + sub(/^ */, "", s); + sub(/ *$/, "", s); + return s; +} + +function is_integer(s) { + return s ~ /^[-+]?[0-9]+$/; +} + +function is_date(s) { + return s ~ /^((([0-9]{4})(-)(0[13578]|10|12)(-)(0[1-9]|[12][0-9]|3[01]))|(([0-9]{4})(-)(0[469]|11)(-)(0[1-9]|[12][0-9]|30))|(([0-9]{4})(-)(02)(-)(0[1-9]|[12][0-9]|2[0-8]))|(([02468][048]00)(-)(02)(-)(29))|(([13579][26]00)(-)(02)(-)(29))|(([0-9][0-9][0][48])(-)(02)(-)(29))|(([0-9][0-9][2468][048])(-)(02)(-)(29))|(([0-9][0-9][13579][26])(-)(02)(-)(29)))$/; +} + +function is_datetime(s) { + return s ~ /^((([0-9]{4})(-)(0[13578]|10|12)(-)(0[1-9]|[12][0-9]|3[01]))|(([0-9]{4})(-)(0[469]|11)(-)(0[1-9]|[12][0-9]|30))|(([0-9]{4})(-)(02)(-)(0[1-9]|[12][0-9]|2[0-8]))|(([02468][048]00)(-)(02)(-)(29))|(([13579][26]00)(-)(02)(-)(29))|(([0-9][0-9][0][48])(-)(02)(-)(29))|(([0-9][0-9][2468][048])(-)(02)(-)(29))|(([0-9][0-9][13579][26])(-)(02)(-)(29)))(\s)(([0-1][0-9]|2[0-4]):([0-5][0-9]):([0-5][0-9]))$/; +} + +function is_boolean(s) { + return s ~ /^(true|false|TRUE|FALSE|True|False|0|1)$/; +} + +function start_insert(tbl_name, h_line) { + if (csv_headers == 1) { + print("INSERT INTO " tbl_name " (" h_line ") VALUES"); + } else { + print("INSERT INTO " tbl_name " VALUES"); + } +} + +function infer_type(col, type, val) { + # Type hierarchy: INTEGER -> DATE -> DATETIME -> BOOLEAN -> TEXT + # Once we see TEXT, it stays TEXT + if (type == "TEXT") { + return "TEXT"; + } + + val = strip(val); + if (val == "") { + return type; + } + + if (type == "") { + # First non-empty value + if (is_integer(val)) { + return "INTEGER"; + } + if (is_boolean(val)) { + return "BOOLEAN"; + } + if (is_date(val)) { + return "DATE"; + } + if (is_datetime(val)) { + return "DATETIME"; + } + return "TEXT"; + } + + if (type == "INTEGER") { + if (is_integer(val)) { + return "INTEGER"; + } + return "TEXT"; + } + + if (type == "DATE") { + if(is_date(val)) { + return "DATE"; + } + if (is_datetime(val)) { + return "DATETIME"; + } + } + + if (type == "DATETIME") { + if (is_datetime(val)) { + return "DATETIME"; + } + } + + if (type == "BOOLEAN") { + if (is_boolean(val)) { + return "BOOLEAN"; + } + if (is_integer(val)) { + return "INTEGER"; + } + return "TEXT"; + } + return "TEXT"; +} + +function emit_table(tbl_idx) { + if (named_table == 1) { + tbl_name = table; + } else { + tbl_name = "tbl"tbl_idx; + } + print("CREATE TABLE IF NOT EXISTS "tbl_name" ("); + for (i = 1; i <= max_cols; i++) { + col_name = headers[i]; + col_type = col_types[i]; + + # If we don't know what it is, assume it's some kind of text + if (col_type == "") { + col_type = "TEXT"; + } + + # MariaDB calls booleans "TINYINT(1)" + if (col_type == "BOOLEAN") { + col_type = "TINYINT(1)"; + } + # INTEGER -> INT (but decided to rename it only here for code clarity) + if (col_type == "INTEGER") { + col_type = "INT"; + } + + printf(" %s %s%s%s\n", col_name, col_type, i == 1 ? " PRIMARY KEY" : "", i == max_cols ? 
"" : ","); + } + print ");\n"; + + if (csv_headers == 1) { + csv_row = 2; + h_line = ""; + for (x=1; x <= max_cols; x++) { + if(!(h_line ~ /^$/)) { + h_line = h_line","data[1,x]; + } else { + h_line = data[1,x]; + } + } + } else { + csv_row = 1; + } + + start_insert(tbl_name,h_line); + + for (row = csv_row; row <= data_rows; row++) { + if (batch_row > 0) { + printf(",\n"); + } + + printf("("); + for (col = 1; col <= max_cols; col++) { + val = data[row, col]; + val = strip(val); + + if (val == "") { + printf("NULL"); + } else { + escaped = val; + gsub(/'/, "''", escaped); + if (col_types[col] == "INTEGER" || col_types[col] == "BOOLEAN") { + printf("%s", val); + } else { + printf("'%s'", escaped); + } + } + printf("%s", col == max_cols ? "" : ", "); + } + printf(")"); + + batch_row++; + + # When batch is full, or last row reached, print a ; + if (batch_row == batch_size || row == data_rows) { + printf(";\n\n"); + batch_row = 0; + + # Start a new INSERT if not done yet + if (row < data_rows) { + print "SELECT SLEEP(1);" + start_insert(tbl_name,h_line); + } + } + } + + print ""; + + +} + +function reset_state() { + delete headers; + delete col_types; + delete data; + data_rows = 0; + max_cols = 0; + has_header = 0; +} + +BEGIN { + FS = ","; + table_index = 0; + batch_size = 1000; + batch_row = 0; + reset_state(); + if(csv_headers == "") {csv_headers = 1;} + if(table == "") {named_table = 0;} else {named_table = 1;} +} +{ + if (!(/^$/)) { + # Store the row + data_rows++; + for (i = 1; i <= NF; i++) { + data[data_rows, i] = $i; + if (i > max_cols) { + max_cols = i; + } + } + + # If this is the first row, check if it looks like a header + if (data_rows == 1) { + has_header = 1; + for (i = 1; i <= NF; i++) { + val = strip($i); + # If any field looks numeric, probably not a header + if (is_integer(val)) { + has_header = 0; + break; + } + } + } + + # Set up headers and infer types + if (has_header && data_rows == 1) { + # Use first row as headers + for (i = 1; i <= NF+1; i++) { + h = strip($i); + gsub(/[^a-zA-Z0-9_]/, "_", h); + # In case last header is missing + if ((h ~ /^$/) || (h ~ /^[0-9]/) || (length(h) == 0)) { + h = "synth_col"; + } + push(headers, h); + } + } else { + # Generate generic headers if needed + if (data_rows == 1 && !has_header) { + for (i = 1; i <= max_cols; i++) { + headers[i] = "col" i; + } + } + + # Infer types from this data row + start_row = has_header ? 2 : 1; + if (data_rows >= start_row) { + for (i = 1; i <= NF; i++) { + col_types[i] = infer_type(i, col_types[i], $i); + } + } + } + } +} +END { + # Output the final table if we have anything + if (data_rows > 0) { + emit_table(++table_index); + } +} diff --git a/awk/fan_out_chunks.awk b/awk/fan_out_chunks.awk @@ -0,0 +1,44 @@ +#!/bin/awk + +#Another awk script for reorganizing data in CSV files. +#The necessity here was similar to some of the other files in here, +#the data format was too awkward for some process to work with, and this was the quickest way to fix it. 
+ +BEGIN{ + FS="@|;"; +} +{ + swp = $1";"$2";"$3";"$4";"$5";"$6; + tmp = swp; + c = 0; + for (i = 7; i <= NF; i++) { + c++; + tmp = tmp";"$i; + if (c % 5 == 0) { + print tmp; + tmp = swp; + } + } +} + +#HOW IT WORKS + +#2927;1;CH0006539198;EUR;200;48543;968@0@91@76@3000;969@0@91@77@1000;970@0@91@78@100 +#3824;1;DE000A1EWWW0;EUR;200;48543;968@0@91@76@3000;969@0@91@77@1000;970@0@91@78@100 + +#2927;1;CH0006539198;EUR;200;48543;968;0;91;76;3000 +#2927;1;CH0006539198;EUR;200;48543;969;0;91;77;1000 +#2927;1;CH0006539198;EUR;200;48543;970;0;91;78;100 +#3824;1;DE000A1EWWW0;EUR;200;48543;968;0;91;76;3000 +#3824;1;DE000A1EWWW0;EUR;200;48543;969;0;91;77;1000 +#3824;1;DE000A1EWWW0;EUR;200;48543;970;0;91;78;100 + +# FS="@|;" -> Delimiter is either @ or ; +# take first 6 fields, save them into a variable called swp to turn them into a constant +# fields 7 to end (NF) fanned out into groups of 5, prefixed by constant swp +# on every 5th appended field, print line and reset buffer +# it's possible to change the magic number 5 by adding a variable from bash +# into BEGIN block -> if (chunk=="") {chunk=5} +# if (c % 5) -> if (c % chunk) +# call the awk script with -v chunk=<NUMBER> + diff --git a/awk/gen_csv_data.awk b/awk/gen_csv_data.awk @@ -0,0 +1,48 @@ +#!/bin/awk + +#A small utility I created when I need some quick and dirty SQL data to test out some queries and ideas + +function isNegative() { + return rand(); +} + +function random(digits, pos_only) { + negative = isNegative(); + if ((negative <= 0.5) && (pos_only == 0)) { + return (rand() * (10 ** digits)) * -1; + } else { + return (rand() * (10 ** digits)); + } +} + +function generateNumber(digits, pos_only, decimal, delimiter){ + result = random(digits, pos_only); + if (decimal == 1) { + return int(result)""delimiter; + } else { + return result""delimiter; + } +} + +BEGIN { + if (digits == ""){digits = 3;} + if (rows == ""){rows = 10;} + if (columns == ""){columns = 4;} + if (pos_only == ""){pos_only = 1;} + if (decimal == ""){decimal = 1;} + if (delimiter == ""){delimiter = ","} + DELIMITER_LEN=length(delimiter)+1; + + for(i = 0; i < rows; i++) { + x=""; + for(j = 0; j < columns; j++) { + if (j + 1 >= columns){ + x = x""generateNumber(digits, pos_only, decimal, delimiter)"\n"; + } else { + x = x""generateNumber(digits, pos_only, decimal, delimiter); + } + } + print substr(x, 1, length(x)-DELIMITER_LEN); + + } +} diff --git a/awk/gen_multi_table.awk b/awk/gen_multi_table.awk @@ -0,0 +1,101 @@ +#!/usr/bin/awk -f +# gen_multi_table.awk +# A small utility I created when I need to some quick and dirty SQL data for multiple tables with joinable IDs. 
+ +#USAGE: +# The utility generates all tables into the same file (to keep the code here simple) +# awk '/^----- TABLE/{n++}{print > "table" n ".csv"}' <output of this script> +# to split them into distinct files + +#PARAMS: +# tables = number of tables (default 2) +# rows = rows per table (default 10) +# columns = non-ID columns per table (default 4) +# digits = number of digits for random numbers (default 3) +# pos_only = 1 to disallow negatives (default 1) +# decimal = 1 to truncate to int, 0 to allow floats (default 1) +# delimiter= column separator (default ,) +# join_mode= 1 for joining everything to table 1, 0 for a "chain" of joins (default 0) + +function isNegative() { + return rand(); +} + +function random(digits, pos_only) { + negative = isNegative(); + if ((negative <= 0.5) && (pos_only == 0)) { + return (rand() * (10 ** digits)) * -1; + } else { + return (rand() * (10 ** digits)); + } +} + +function generateNumber(digits, pos_only, decimal, delimiter){ + result = random(digits, pos_only); + if (decimal == 1) { + return int(result)""delimiter; + } else { + return result""delimiter; + } +} + +function makeHeader(table_index, columns, delimiter) { + header = "id" delimiter; + if (table_index > 1) { + header = header "parent_id" delimiter; + } + for (k = 1; k <= columns; k++) { + if (k == columns) { + header = header "col" k; + } else { + header = header "col" k delimiter; + } + } + return header; +} + +BEGIN { + if (tables == ""){tables = 2;} + if (rows == ""){rows = 5;} + if (columns == ""){columns = 3;} + if (digits == ""){digits = 3;} + if (pos_only == ""){pos_only = 1;} + if (decimal == ""){decimal = 1;} + if (delimiter == ""){delimiter = ",";} + if (join_mode == ""){join_mode = 0;} + DELIMITER_LEN = length(delimiter); + + srand(); + + for (t = 1; t <= tables; t++) { + print "----- TABLE "t" -----"; + print makeHeader(t, columns, delimiter); + + for (i = 1; i <= rows; i++) { + id_val = int(rand() * (10 ** digits)); + ids[t, i] = id_val; + + line = id_val""delimiter; + + if (t > 1) { + parent_index = int(rand() * rows) + 1; + if (join_mode == 0) { + parent_id = ids[t-1, parent_index]; + } else { + parent_id = ids[1, parent_index]; + } + line = line parent_id delimiter; + } + + for (j = 0; j <= columns-1; j++) { + if (j + 1 >= columns-1) + line = line""generateNumber(digits, pos_only, decimal, delimiter); + else + line = line""generateNumber(digits, pos_only, decimal, delimiter); + } + + print substr(line, 1, length(line)-DELIMITER_LEN); + } + print ""; + } +} diff --git a/awk/gen_multi_table_with_dates.awk b/awk/gen_multi_table_with_dates.awk @@ -0,0 +1,118 @@ +#!/usr/bin/awk -f +# gen_multi_table.awk +# A small utility I created when I need to some quick and dirty SQL data for multiple tables with joinable IDs. 
+ +#USAGE: +# The utility generates all tables into the same file (to keep the code here simple) +# awk '/^----- TABLE/{n++}{print > "table" n ".csv"}' <output of this script> +# to split them into distinct files + +#PARAMS: +# tables = number of tables (default 2) +# rows = rows per table (default 10) +# columns = non-ID columns per table (default 4) +# digits = number of digits for random numbers (default 3) +# pos_only = 1 to disallow negatives (default 1) +# decimal = 1 to truncate to int, 0 to allow floats (default 1) +# delimiter= column separator (default ,) +# join_mode= 1 for joining everything to table 1, 0 for a "chain" of joins (default 0) +# gen_date = 1 if want table 1 to generate a column with dates (default 0) + +function isNegative() { + return rand(); +} + +function random(digits, pos_only) { + negative = isNegative(); + if ((negative <= 0.5) && (pos_only == 0)) { + return (rand() * (10 ** digits)) * -1; + } else { + return (rand() * (10 ** digits)); + } +} + +function generateNumber(digits, pos_only, decimal, delimiter){ + result = random(digits, pos_only); + if (decimal == 1) { + return int(result)""delimiter; + } else { + return result""delimiter; + } +} + +function makeHeader(table_index, columns, delimiter, gen_date) { + header = "id" delimiter; + if ((table_index == 1) && (gen_date == 1)) { + header = "id,stamp_created" delimiter; + } + if (table_index > 1) { + header = header "parent_id" delimiter; + } + for (k = 1; k <= columns; k++) { + if (k == columns) { + header = header "col" k; + } else { + header = header "col" k delimiter; + } + } + return header; +} + +function generateDate(years_back, now, past, span, ts) { + now = systime() + past = now - (years_back * 365 * 24 * 60 * 60) + span = now - past + ts = past + int(rand() * span) + return strftime("%Y-%m-%d", ts) +} + +BEGIN { + if (tables == ""){tables = 2;} + if (rows == ""){rows = 5;} + if (columns == ""){columns = 3;} + if (digits == ""){digits = 3;} + if (pos_only == ""){pos_only = 1;} + if (decimal == ""){decimal = 1;} + if (delimiter == ""){delimiter = ",";} + if (join_mode == ""){join_mode = 0;} + if (gen_date == ""){gen_date = 0;} + DELIMITER_LEN = length(delimiter); + + srand(); + + for (t = 1; t <= tables; t++) { + print "----- TABLE "t" -----"; + print makeHeader(t, columns, delimiter, gen_date); + + for (i = 1; i <= rows; i++) { + id_val = int(rand() * (10 ** digits)); + ids[t, i] = id_val; + + line = id_val""delimiter; + + if ((t == 1) && (gen_date == 1)) { + line = line generateDate(10) delimiter; + } + + if (t > 1) { + parent_index = int(rand() * rows) + 1; + if (join_mode == 0) { + parent_id = ids[t-1, parent_index]; + } else { + parent_id = ids[1, parent_index]; + } + line = line parent_id delimiter; + } + + for (j = 0; j <= columns-1; j++) { + if (j + 1 >= columns-1) + line = line""generateNumber(digits, pos_only, decimal, delimiter); + else + line = line""generateNumber(digits, pos_only, decimal, delimiter); + } + + print substr(line, 1, length(line)-DELIMITER_LEN); + } + print ""; + } +} diff --git a/awk/group_by_column.awk b/awk/group_by_column.awk @@ -0,0 +1,36 @@ +#!/bin/awk + +#backstory: data format from dataprovider came in a weird way, made it necessary to compress +#some rows before adding them to the database, to reduce table sizes and boost JOIN speeds +#this part compressed rows to a fewer number which cut a table by ~60% which resulted in a decent boost on the DB side +#also see: reorganize_groups.awk + +BEGIN { + FS = ","; + OFS = ","; +} +{ + # If we've already seen this 
group ($3), append to existing value + if (a[$3]) { + a[$3] = a[$3] "," $2 "@" $1; + } else { + # First entry for this group + a[$3] = $2 "@" $1; + } +} +END { + # Output all grouped values + for (x in a) { + print x, a[x]; + } +} + +#IN: +#foo,123,groupA +#bar,456,groupA +#baz,789,groupB + +#OUT: +#groupA,123@foo,456@bar +#groupB,789@baz + diff --git a/awk/group_by_column_regex.awk b/awk/group_by_column_regex.awk @@ -0,0 +1,39 @@ +#!/bin/awk + +function push(arr, value) { + arr[length(arr)+1] = value; +} + +# Helper to print the array in semicolon-delimited format +function parray(arr) { + for (i = 0; i < length(arr); i++) { + printf arr[i] ";"; + } + printf "\n"; +} +BEGIN{ + FS=","; + OFS=","; + regex=""; + if (prefix == ""){prefix = 2} + split("",a); +} +{ + #e.g. we want to group together rows that begin with the same two numbers, like 50,501,5010, etc + #because the file contains rows as a hierarchy + if(length($1)==prefix) { + if(length(a) > 0) { + parray(a); + } + split("",a); + regex="^"$1 + push(a,$1"@"$2); + } else { + if($1 ~ regex) { + push(a,$1"@"$2); + } + } +} +END{ + parray(a); +} diff --git a/awk/reorganize_groups.awk b/awk/reorganize_groups.awk @@ -0,0 +1,82 @@ +#!/bin/awk + +#backstory: data format from dataprovider came in a weird way, made it necessary to compress +#some rows before adding them to the database, to reduce table sizes and boost JOIN speeds +#this part expands the compressed rows back to their original number of rows +#not the nicest looking code, but it did cut a table by ~60% which resulted in a decent boost on the DB side +#also see group_by_column.awk + +# Define helper function to push a value to an array +function push(arr, value) { + arr[length(arr)+1] = value; +} + +# Helper to print the array in semicolon-delimited format +function parray(arr) { + for (i = 0; i < length(arr); i++) { + printf arr[i] ";"; + } + printf "\n"; +} + +# Field delimiter = comma, Record delimiter = newline +BEGIN { + FS = ","; + RS = "\n"; +} +{ + t = $1; # First column is the key + + for (x = 2; x <= NF; x++) { # Loop over remaining columns + z = y[0]; # Store previous group prefix for comparison + match($x, /[^@]+/, y); # Extract prefix before @ (the group) + match($x, /@[0-9]+/, w); # Extract value after @ + w_ = substr(w[0], 2); # Remove '@' prefix + + if (x == 2) { + z = y[0]; # On first loop, set z to current group + } + + # If same group, keep appending + if (z == y[0]) { + arr[0] = t; # First element: top-level identifier + arr[1] = z; # Second: group + push(arr, w_); # Then all group values + } + + # Group changes: flush and reset + if (z != y[0]) { + parray(arr); + split("", arr); # Clear array + arr[0] = t; arr[1] = y[0]; + push(arr, w_); + } + + # Final column — always flush and reset + if (x == NF) { + parray(arr); + split("", arr); + next; + } + } +} +END { + printf "\n"; # Clean newline at end +} + + +#HOW IT WORKS +#cat file1 +#dsfs,2@44,2@55,3@66,3@88,3@89,3@90,3@91,4@56,4@777,4@65,5@776,5@545 +#afsf,1@00,1@99 +#bdsa,2@323,2@212,6@432,6@2188 + +#awk -f reorganize_groups.awk <FILE> +#dsfs;2;;44;55; +#dsfs;3;;66;88;89;90;91; +#dsfs;4;;56;777;65; +#dsfs;5;;776;545; +#afsf;1;;00;99; +#bdsa;2;;323;212; +#bdsa;6;;432;2188; + diff --git a/awk/sum_numeric_cols.awk b/awk/sum_numeric_cols.awk @@ -0,0 +1,34 @@ +#!/bin/awk + +BEGIN{ + FS = "," +} + +(NR == 1) { + for (i = 1; i <= NF; i++) { + if ($i ~ /^[0-9]+$/) { + numeric[i] = 1; + header[i] = $i; + cols++; + } + } + if (cols == 0) { + exit 1; + } + next; +} +{ + for (i = 1; i <= NF; i++) { + if (i in numeric && 
$i ~ /^[0-9]+$/) { + sum[i] += $i; + } + } +} +END{ + printf "Column totals:\n" + for (i = 1; i <= NF; i++) { + if (i in numeric) { + printf " %-12s : %d\n", header[i] ":", sum[i]; + } + } +} diff --git a/bash/funcs/daily_range_partition.sh b/bash/funcs/daily_range_partition.sh @@ -0,0 +1,56 @@ +#!/bin/bash + +DD=$(date +%Y-%m-%d) +TD=$(date -d "$DD + 1 day" +%Y-%m-%d) +MARIADB="mysql -u root -A " + +#create new partition for today +echo "ALTER TABLE some_table DROP PARTITION pFuture; ALTER TABLE some_table ADD PARTITION (PARTITION p$(date -d "$DD" +%Y%m%d) VALUES LESS THAN (TO_DAYS('$TD')), PARTITION pFuture VALUES LESS THAN MAXVALUE);" | $MARIADB + +#drop old partitions +OLD_PART=$(date -d "$DD - 6 months" +%s) + +for PART in $(echo "SELECT partition_name FROM information_schema.partitions WHERE table_name = 'some_table' AND partition_method = 'RANGE';" | $MARIADB --skip-column-names | grep -v Future) +do + PART_DATE="${PART:1}" #drop prefix p to extract date portion + PART_DATE_TEMP=$(date -d "${PART_DATE:0:4}-${PART_DATE:4:2}-${PART_DATE:6:2}" +%s) + + if [[ "$OLD_PART" > "$PART_DATE_TEMP" ]]; then + echo "ALTER TABLE some_table DROP PARTITION ${PART};" | $MARIADB + fi +done + +#EXPLAINER + +#The benefit of partitions in the case of large tables is that the +#queries hitting the table can be told to use only specific partitions +#which means that the query optimizer will not even need to look at +#the rest of the table at all. + +#Effectively, partitioning splits the data in the big table down into separate tables, +#while still getting treated as a single table. When creating a partition, the partition key +#must be added into the PRIMARY KEY (e.g. id,parition_key). Queries with a WHERE clause should +#specify the partition(s) to be used e.g. +#SELECT <stuff> FROM some_table PARTITION(pYYYYMMDD) +#or if using a JOIN, the join should ideally happen on the partition_key + +# Example of a RANGE partitionable table, e.g. 
some stuff gets written here daily +# and we need to keep this data around for a long time, maybe transaction logs, or so + +#CREATE TABLE `some_table` ( +# `id` varchar(36) NOT NULL, +# `user_id` varchar(36) DEFAULT NULL, +# `date` varchar(36) DEFAULT NULL, +# `interesting_value1` int(11) DEFAULT NULL, +# `interesting_value2` int(11) DEFAULT NULL, +# `partition_key` datetime NOT NULL DEFAULT current_timestamp(), +# PRIMARY KEY (`id`,`partition_key`) +#) ENGINE=InnoDB; + +#ALTER TABLE some_table +#PARTITION BY RANGE (TO_DAYS(partition_key)) ( +# PARTITION pYYYYMMDD VALUES LESS THAN (TO_DAYS('YYYY-MM-DD+1')), +# PARTITION pFuture VALUES LESS THAN MAXVALUE +#); + + diff --git a/bash/funcs/get_memory_usage.sh b/bash/funcs/get_memory_usage.sh @@ -0,0 +1,13 @@ +#!/bin/bash +function get_memory_usage() { + ps -eo size,pid,user,command --sort -size | awk 'BEGIN{print "Memory (Mb) What"} + { + hr=$1/1024; + printf("%13.2f Mb ",hr); + } + { + for ( x=4 ; x<=NF ; x++ ) { + printf("%s \n",$x); + } + }' | awk '(NF==3){gsub(/^ +/, "", $0); print $0}' +} diff --git a/bash/funcs/get_subprocesses.sh b/bash/funcs/get_subprocesses.sh @@ -0,0 +1,29 @@ +#!/bin/bash +function get_subprocesses() { + + if [[ -z "$1" ]]; then + echo "Usage: $0 <PID>" + return 1 + fi + + function getpids() { + echo -n $1 " " + for sub in $(ps -o pid --no-headers --ppid "$1"); do + echo -n $sub $(getpids $sub) " " + done; + } + ps f $(getpids "$1") +} + + + # Explanation for those who wandered in here + # $(ps -o pid --no-headers --ppid $1) -> get all child process id's of the process id given by $1 + # recursively call getpids() on each child until we run out of descendant process id's + # finally, call ps f on the whole thing, to print a process family + + # This can be useful when a script might spawn many subprocesses, such as other scripts in parallel, + # sql connections, etc. it can be a way to check the status of a script i.e. how far down towards the + # script execution has reached + # It can also be useful to kill processes without leaving orphaned sub-processes, e.g. killing a script + # as well as any mysql connections it spawned -- in general, killing a bash script will not kill any + # spawned mysql processes diff --git a/bash/funcs/git_list_unmerged_branches.sh b/bash/funcs/git_list_unmerged_branches.sh @@ -0,0 +1,55 @@ +#!/bin/bash + +function check_unmerged_git_branches() { + + GIT_DIR=$1 + EMAIL=$2 + + if [ -z $GIT_DIR ]; then + echo "Must specify the path of a git repo" + return 1 + fi + + if [ -z $EMAIL ]; then + echo "Must specify recipient(s)" + return 1 + fi + + if [ ! 
-d "$GIT_DIR/.git" ]; then + echo "Directory is not a git repo: $GIT_DIR" + return 1 + fi + + THIS_DIR=$(echo $PWD); + THIS_SCRIPT=$(echo $PWD | sed -E 's/(.*[^a-z\-])?([a-z\-]{1,}).*/\2/'); #name of the directory this script is in - this makes sense if the directory name is more relevant and the script itself is just named something like "run" + NOW=$(date +%Y%m%d) + + if [ -f ./branches.txt ]; then + rm ./branches.txt; + fi + + if [ -f ./branches_results.txt ]; then + rm ./branches_results.txt; + fi + + cd $GIT_DIR; + + for branch in $(git branch -r --no-merged | grep -v "HEAD"); do + echo -e $branch"," $(git log --no-merges -n1 --format="%ct, %ci, %cr, %an, %ae" $branch | head -n1) | xargs echo >> "$THIS_DIR/branches.txt"; + done + + cd $THIS_DIR; + + echo -e "Unmerged old branches check\nThis script checks for any un-merged branches on the remote that are older than 30 days.\n\n" >> ./branches_results.txt + + cat ./branches.txt | awk 'BEGIN{FS=","; prev_month_stamp=(systime() - 30 * 24 * 60 * 60);}{if($2 > prev_month_stamp){next;} else {print $0;}}' >> ./branches_results.txt; + + cat << EOF >> branches_results.txt + + Please check if you have any unmerged branches in the remote repository, and either merge them or delete them. + ------------------------------------------------------------------------------------------- + Generated by ${THIS_SCRIPT} +EOF + + cat ./branches_results.txt | mail -s "[INFO]: Unmerged old branches check @ ${NOW}" "$EMAIL" +} diff --git a/notes/sql/TODO b/notes/sql/TODO @@ -0,0 +1,55 @@ +Ration about how NULLs affect queries + +BUILD QUERY FRAGMENTS FOR FAST RECALL: + + + +top-N per group (ROW_NUMBER() + PARTITION BY) + +SELECT * +FROM ( + SELECT + dept, + employee, + salary, + ROW_NUMBER() OVER (PARTITION BY dept ORDER BY salary DESC) AS rn + FROM employees +) ranked +WHERE rn <= 3; + +"Give me the top three salaries per department." + +running totals (SUM() OVER) + +SELECT + order_date, + SUM(amount) OVER (ORDER BY order_date) AS running_total +FROM orders; + +"Give me an order-flow report" + +conditional aggregation (SUM(CASE WHEN ...)) + +SELECT + dept, + SUM(CASE WHEN symbol= 'W' THEN 1 ELSE 0 END) AS windows, + SUM(CASE WHEN symbol = 'L' THEN 1 ELSE 0 END) AS linux +FROM machines +GROUP BY dept; + +"Give me a count of employee windows and linux machines" + +top-N per group alternative + +SELECT e.* +FROM employees e +JOIN ( + SELECT dept, MAX(salary) AS max_salary + FROM employees + GROUP BY dept +) m ON e.dept = m.dept AND e.salary = m.max_salary; + +"Give me highest salary per employee department" + + +pivot/unpivot tricks - covered by own blog post, I think? diff --git a/notes/sql/binary_and_relay_logs.txt b/notes/sql/binary_and_relay_logs.txt @@ -0,0 +1,19 @@ +What are the key differences between MariaDB's binlog and relay log? + +The master writes all transactions into its binary log. This is usually a sequential and very lightweight activity. + +The replica server reads the transactions from masters binary log and writes them to its relay log. +Only after that the replica executes the statements from its relay log. + +From the POV of Master: + +Whenever a replica server connects to a master, the master creates a new thread for the connection (similar to any other server client) and then it does whatever the replica user asks. Most of that is going to be a request a feed of events from the master's binary log and to get notified about newly written events to the master's binary log. 
+
+Replica servers that are up to date will mostly be reading events that are still cached in the OS cache on the master server, so usually there shouldn't be too many physical disk reads on the master server in order to feed binary log events to the replica. The most common exception is when reconnecting a replica server that may have been down for a couple of hours for one reason or another, as it will start with reading binary logs that were written hours ago, which the master may no longer have cached.
+
+From the POV of the Replica:
+
+The I/O thread connects to the designated master, reads its binary log events as they come in and just copies them over to a local log file called the relay log. This leads us to the second process, the SQL thread, which reads events from the relay log stored locally on the replication slave (the file that was written by the I/O thread) and then applies them as fast as possible.
+
+Basically, the relay log is just a local file buffering the master's binlog contents. Separately, the SQL thread reads the relay log and executes the events to replay the master's changes. This two-step model (I/O thread + SQL thread) decouples network transfer from SQL execution, helping replicas survive short disconnects from the master server.
+
diff --git a/notes/sql/clustered_vs_nonclustered_index.txt b/notes/sql/clustered_vs_nonclustered_index.txt
@@ -0,0 +1,35 @@
+What is the difference between clustered and non-clustered index?
+
+
+# Clustered Index
+
+Definition: A clustered index determines the physical order of data in a table. The rows of the table are stored in the same order as the clustered index.
+
+Properties:
+
+-> Only one per table. Each table can only support one clustered index, since the rows of the table can only be stored in one order.
+
+-> Often it is the PK. When a Primary Key is defined, it's created as a clustered index.
+
+-> The data and the index are stored together.
+
+-> Helps speed up queries that inspect a range of values, e.g. BETWEEN, ORDER BY.
+
+-> Slows down INSERT/DELETE/UPDATE queries, since these operations can cause a re-ordering of rows.
+
+
+# Non-Clustered Index
+
+Definition: A non-clustered index creates a structure that is separate from the table data. This structure contains pointers (row locators) to the data's physical location.
+
+Properties:
+
+-> You can have as many as you want. Since this is a separate structure, you can attach as many as you want.
+
+-> Does not affect the physical order of rows in the table.
+
+-> Less helpful for range queries or sorting than a clustered index. Still very helpful for speeding up WHERE/JOINs in queries.
+
+-> Takes up additional space to store the structure of the index. More non-clustered indexes = more space used.
+
+-> Lower impact on INSERT/DELETE/UPDATE queries.
diff --git a/notes/sql/cte_vs_subquery.txt b/notes/sql/cte_vs_subquery.txt
@@ -0,0 +1,5 @@
+# Difference between CTE and sub-query
+
+-> A CTE can be named, a sub-query can only be aliased. This means that a CTE can be referenced many times within a complex SQL query, while a sub-query would have to be re-executed each time. So a CTE can allow for better code reuse.
+
+-> Code readability. As CTEs go to the front of the query, they can be a lot less mentally taxing to understand compared to trying to read sub-queries that have 4-5 layers of nesting.
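
A quick sketch of the reuse point above (the orders/dept_totals names are made up for illustration, they are not part of the note): the aggregate is defined once as a CTE and then referenced by name twice, whereas with plain sub-queries the GROUP BY block would have to be repeated in both places.

WITH dept_totals AS (
    SELECT dept_id, SUM(amount) AS total
    FROM orders
    GROUP BY dept_id
),
overall AS (
    -- a later CTE can reference an earlier one by name
    SELECT AVG(total) AS avg_total FROM dept_totals
)
SELECT d.dept_id, d.total
FROM dept_totals d
JOIN overall o ON d.total > o.avg_total;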
diff --git a/notes/sql/index_not_being_used_reasons_checklist.txt b/notes/sql/index_not_being_used_reasons_checklist.txt @@ -0,0 +1,7 @@ +Q: You notice a query that’s performing slowly. The table involved has several million rows, and the execution plan shows it isn’t using an index even though the column in the WHERE clause has an index. + +What are some possible reasons the optimizer might not be using the index, and how would you go about investigating or fixing it? + +A: There can be a lot of reasons, it can be low cardinality, mismatched data types on a JOIN (e.g. VARCHAR in t1, INT in t2), a query contains an OR (that often kills the optimizer's ability to work with indexes), some function applied to a where clause, like WHERE length(isin) <- unindexable, sometimes the optimizer just does weird stuff, I've seen buggy behaviour with IN ()'s performing better when rewritten to multiple selects using WHERE ='s. + +I'd check with SHOW CREATE TABLE on affected tables if there's a JOIN involved, and check the query with EXPLAIN in any case. In some situations I might also check into htop to see if I see a CPU being maxed out on the long term, that usually indicates a big optimization misfire. diff --git a/notes/sql/inner_vs_left_vs_outer_join.txt b/notes/sql/inner_vs_left_vs_outer_join.txt @@ -0,0 +1,9 @@ +Q: Explain the difference between INNER JOIN, LEFT JOIN, RIGHT JOIN, and FULL OUTER JOIN. + +A: INNER JOIN -> The default option when the keyword JOIN is plainly used. It means that when the two tables are merged, discard anything that's not found in both tables. + +LEFT JOIN -> like INNER JOIN + ALL records from the LEFT table (joined value will appear as NULL for these records) + +RIGHT JOIN -> like LEFT JOIN, but flip which table gets ALL records joined + +OUTER JOIN -> LEFT JOIN + RIGHT JOIN diff --git a/notes/sql/innodb_vs_myisam.txt b/notes/sql/innodb_vs_myisam.txt @@ -0,0 +1,7 @@ +Q: What’s the difference between the InnoDB and MyISAM storage engines in MariaDB, and why might you choose one over the other? + +A: One of the most noticeable difference is that InnoDB uses row locking, whereas MyISAM uses table locking. Under InnoDB, it is possible to modify different rows of a table at the same time, making it much more suitable for high frequency writes/deletes. Under MyISAM, any DML statement will obtain an exclusive lock on the table, and while that lock is held, no other session can perform a SELECT or a DML operation on it. + +MyISAM does not support transactions, so unlike InnoDB, there's no COMMIT and ROLLBACK mechanism. Once a statement is issued, it's done. + +Because of these differences, InnoDB is now the default engine in MariaDB/MySQL. MyISAM's main use case is only for read-heavy, low-concurrency workloads. diff --git a/notes/sql/isolation_levels.txt b/notes/sql/isolation_levels.txt @@ -0,0 +1,44 @@ +Q: In MariaDB, explain what transaction isolation levels are and how they affect concurrency. +Can you name the four standard isolation levels and briefly describe the differences between them? + +A: + +In InnoDB, each transaction gets a snapshot of the data at the moment it starts (technically, when it runs its first consistent read). +After that, the transaction is reading historical versions of rows from that snapshot, not the latest committed data in real time. + +MariaDB supports the 4 different transaction isolation levels. + +* -> READ UNCOMMITTED + +A transaction may see not-yet-committed changes made by other transactions. 
+ +This isolation level is good when making rough estimates(like COUNT(*) or SUM(*)) etc. + +* -> READ COMMITTED + +Under this isolation level, each statement gets a fresh snapshot at the time it begins. So if you issue two SELECTs one after another, the second sees the latest committed data right away, even if it’s only microseconds newer. + +* -> REPEATABLE READ + +Your first read defines the snapshot for the entire transaction. You’ll keep seeing that version until you COMMIT or ROLLBACK, regardless of how much real-world time passes. + +REPEATABLE READ is the default isolation level for MySQL and MariaDB. + +* -> SERIALIZABLE + +Same snapshot logic as REPEATABLE READ, but with extra locking so other transactions can’t interfere logically. + +Useful observations: + +When autocommit=1, every statement implicitly starts and ends its own transaction, both REPEATABLE READ and READ COMMITTED behave identically for single statements, and they both see the latest committed data before execution. + +So with autocommit enabled, you’re not running multi-statement transactions at all, and each SELECT, UPDATE, or INSERT is its own short-lived, fully committed transaction. + +Stored procedures are one of the few everyday cases where autocommit=1 doesn’t shield you from transactional differences, because the procedure itself can span multiple statements inside a single transaction. + +Here’s what actually happens: + +Even with autocommit=1 globally, when you execute a stored procedure, it runs as one logical transaction unless you explicitly COMMIT or ROLLBACK inside it. + +So if that procedure does a few INSERTs, UPDATEs, and SELECTs, those statements all share the same transactional context meaning isolation level difference between READ COMMITTED and REPEATABLE READ matters again. + diff --git a/notes/sql/replica_lag_troubleshoot_checklist.txt b/notes/sql/replica_lag_troubleshoot_checklist.txt @@ -0,0 +1,28 @@ +How would you approach troubleshooting replication lag in a MariaDB setup? + +Detection: + +-> use pt-heartbeat to be able to measure replica lag, and set up automated alerts (e.g. email) when lag exceeds thresholds, so I can c +orrelate lag with application (or user) behaviors in real time + +-> set up periodic checks and e-mail alerts for any locking queries, locking issues on master often cascade down to replicas + +Diagnosis: + +-> make sure different applications/scripts have dedicated users on the DB, reduce the number of "power users" to help more easily iden +tify where replica issues originate from + +-> analyze audit logs around time of peak replica spikes, look for big transactions that could be chunked, look for queries holding loc +ks on tables, check that no large DDL happens during peak times + +Mitigation: + +-> Help developers by writing Stored procs that chunk queries (e.g. deletes), or offer them help via scripts if the inputs are data fil +es (chunk into LOAD DATA statements) + +-> Check with developers/business whether certain tables can be ignored from replication, e.g. 
some tables might be used to help aggreg +ate data, but their contents are effectively useless + +-> Check if certain processes could be moved to different times, to avoid death by a 1000 cuts scenarios + +-> If resources allow, move dwh, analytical and ad-hoc user read queries to dedicated replica server diff --git a/notes/sql/seconds_behind_master.txt b/notes/sql/seconds_behind_master.txt @@ -0,0 +1,11 @@ +In a heavily-loaded MariaDB replication setup, you notice that Seconds_Behind_Master is fluctuating wildly even though your pt-heartbeat shows consistent lag. What could cause this discrepancy, and which tool is more trustworthy? + +Seconds_Behind_Master -> shows how "old" the event currently being executed by the SQL thread is, compared to the master's event timestamp. + + long time_diff= ((long)(time(0) - mi->rli->last_master_timestamp) - mi->clock_diff_with_master); + + +If the SQL thread stalls or catches up in bursts (due to I/O lag, locking, or massive transactions), Seconds_Behind_Master can jump around dramatically. + +pt-heartbeat can give a more accurate measurement, because it inserts current timestamps into a small table, therefore it is measuring "how far back in real time the replica is", and not "how old the binlog event is." + diff --git a/notes/sql/sequences_in_mariadb.txt b/notes/sql/sequences_in_mariadb.txt @@ -0,0 +1,12 @@ +Q: What is the purpose of a sequence in MariaDB? + +A: A sequence is a database object that generates a series of unique, ordered numeric values kind of like a counter managed by the database, separate from any table. + +It is independent of tables, the same sequence can be shared between multiple tables if needed. Can customize start value, increment, min/max and whether it cycles. + +CREATE SEQUENCE invoice_seq + START WITH 1000 + INCREMENT BY 1; + +INSERT INTO invoices (id, customer, amount) +VALUES (NEXT VALUE FOR invoice_seq, 'Alice', 500); diff --git a/notes/sql/var_lib_mysql_disk_space_filling_up.txt b/notes/sql/var_lib_mysql_disk_space_filling_up.txt @@ -0,0 +1,13 @@ +Q:Your MariaDB instance suddenly starts filling /var/lib/mysql faster than usual. +Disk usage keeps growing even though you aren’t importing new data or adding tables. + +What are the most common causes for unexpected data directory growth, and how would you investigate and reclaim space safely? + + +A: +-> Log rotation : easy to miss, but can bite into space on busy systems +-> Slow query log : dumps a ridiculous amount of data on big/busy systems, so it's often kept off for that reason. +-> Snapshots (ZFS) : stale snapshots are invisible to the DB but eat space on disk +-> Script-based observation : cook up some script that does a xargs du -sh on files and look for any anomaly, take things from there depending on the results +-> Binary logs : check `expire_log_days`, whether it's not set excessively high +-> Tablespaces : .ibd files can get inflated, though not an easy fix diff --git a/notes/sql/view_vs_materialized_view.txt b/notes/sql/view_vs_materialized_view.txt @@ -0,0 +1,16 @@ +# Difference between view and materialized view? + +-> View: + +It's just a handy way to save/store a query, but not its results. The results are fetched fresh from the underlying table(s). It can be a nice way for simplifying queries down to something manageable for less technical end users like data analysts, since a view can easily be reused and shared. It's also a neat way to give access only to a specific sub-set of data to some users or roles. 
+
+-> Materialized view
+
+This is also a way of saving a query, but it materializes the result set, meaning it saves a copy of the data onto the disk. The data does not get automatically refreshed, and becomes stale over time. Accessing data from a materialized view means that instead of performing an expensive query over and over, the data is simply fetched from the saved result set. So despite the data being stale, it can still be useful when the volume of the base data is so big that real-time querying of the underlying table would be impractically slow.
+
+These objects can be treated somewhat similarly to tables: they can have indexes, or be referenced in a foreign key, which is not possible with views.
+
+The data needs to be explicitly refreshed in a materialized view as a whole. (TRUNCATE the table, INSERT INTO mv SELECT ...)
+This would generally be best done in a timeframe that is well outside of peak business hours.
+
+
diff --git a/notes/sql/virtual_columns.txt b/notes/sql/virtual_columns.txt
@@ -0,0 +1,5 @@
+Can you explain the concept of virtual columns in MariaDB and provide a use case?
+
+Virtual columns in MariaDB are columns that are not physically stored in the table (unless explicitly set to STORED), but are instead computed on the fly based on an expression.
+
+A use case of a virtual column could be to allow indexing to happen in situations where the WHERE clause needs to do typechecking. For example, a query's WHERE clause might filter on LENGTH(column1), in which case an index on column1 would not help. However, if we generate a column with "GENERATED ALWAYS AS (LENGTH(column1))", the virtual column can be indexed to boost the speed of slow selects.
diff --git a/notes/sql/what_is_foreign_key.txt b/notes/sql/what_is_foreign_key.txt
@@ -0,0 +1,3 @@
+Q: What is a Foreign Key in MariaDB?
+
+A: A foreign key is a column in one table that links to the primary key in another table, establishing a relationship between the two and enforcing data integrity. The main purpose of this is to prevent the appearance of orphaned records. If one wishes to delete a record from the main table, then the corresponding data must first be purged from the satellite tables.
diff --git a/notes/sql/where_vs_having.txt b/notes/sql/where_vs_having.txt
@@ -0,0 +1,3 @@
+Q: What is the difference between WHERE and HAVING?
+
+A: The main difference is that WHERE filters individual rows before any grouping or aggregation occurs, while HAVING filters groups after the GROUP BY clause has been applied and aggregated. An important practicality of this difference is that HAVING only makes sense with SELECT queries, whereas WHERE can be used for UPDATE and DELETE queries as well.
diff --git a/perl/git_consistency_check.pl b/perl/git_consistency_check.pl
@@ -0,0 +1,231 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+use File::Temp qw/ tempfile /;
+use Fcntl ':flock';
+
+open my $self, '<', $0 or die "Couldn't open self: $!";
+flock $self, LOCK_EX | LOCK_NB or die "This script is already running! Aborting!";
+
+#EXPLAINER:
+# Compare a Git repository (source of truth) to a deployed directory.
+#
+# Backstory:
+# This was created because it was common for folk to add hotfixes in production directly,
+# which then got overwritten by git deployments, causing production incidents.
+# This script was a first-stage necessity in order to clean up, before being able to enforce
+# a strict deployment-via-git-only rule.
+# +# Recursively checks for: +# - Files that exist in one location but not the other +# - Files with size differences +# +# Useful for: +# - Verifying that a deployment matches the committed source +# - Catching out-of-band edits on production servers +# - Validating integrity of scripts, configs, and assets +# +# Originally written in bash, but it got too unreadable for the job. +# Rewritten in Perl for better sanity and maintainability. +# +# !! Not intended for projects with compiled binaries (hash mismatches are expected) +#USAGE perl <THIS SCRIPT> <GIT REPO PATH> <DEPLOYMENT PATH> + +die "Usage: $0 DIRECTORY1 DIRECTORY2\n" if scalar(@ARGV) < 2; + +my $git_locations = $ARGV[0]; +my $script_locations = $ARGV[1]; + +my @git_found_files; +my @scripts_found_files; +my $file_diffs="./file_diffs.txt"; +my $file_list="./file_list.txt"; +my $changed_files="./changed_files.txt"; +my %processed; + +my $stem1 = $git_locations; +my $stem2 = $script_locations; + +my ($git, $git_files) = tempfile('/tmp/gitrepo-git-consistency-check-XXXXXXXXXXXX', SUFFIX => '.tmp', UNLINK => 0); +my ($scripts, $scripts_files) = tempfile('/tmp/scripts-git-consistency-check-XXXXXXXXXXXX', SUFFIX => '.tmp', UNLINK => 0); + +#recursively search through target directory for all files +sub find_files { + my $path = $_[0]; + my $output = $_[1]; + my @found_files = $_[2]; + + $path .= '/' if($path !~ /\/$/); + + for my $file (glob($path . '*')) { + if(-d $file) { + find_files($file, $output, @found_files); + } else { + print $output $file."\n"; + } + } +} + +#take two lists of files, remove the stems of the path, and compare differences between the lists +sub list_non_present_files { + my $file_diffs = $_[2]; + my $stem1 = $_[3]; + my $stem2 = $_[4]; + my %fl; + my %df; + + open (my $file2,"<",$_[1]) or die "Cannot open file ".$_[1]." for reading: $!"; + my %seen; + while (my $line1 = <$file2>) { + chomp ($line1); + $line1 =~ s/\Q$stem1\E//; + $line1 =~ s/\Q$stem2\E//; + $seen{$line1}++; + } + + close ($file2) or die "Could not finish reading from ".$_[1].": $!"; + + my $match_name = $_[0] =~ /scripts|gitrepo/p; + my $source_name = ${^MATCH}; + + open (my $file1,"<",$_[0]) or die $!; + while (my $line2 = <$file1>) { + chomp $line2; + $line2 =~ s/\Q$stem1\E//; + $line2 =~ s/\Q$stem2\E//; + if($seen{$line2}) { + $fl{$line2}++; + } else { + $df{$line2}++ unless $line2 eq ""; + } + } + + close ($file1) or die "Could not finish reading from ".$_[0].": $!"; + + open(my ($diffs), '>>', $file_diffs) or die "Cannot open file ".$file_diffs." for writing: $!"; + print $diffs "Only in ".$source_name." folder: \n"; + print $diffs "$_\n" for keys %df; + print $diffs "\n"; + close ($diffs) or die "Could not finish writing to ".$file_diffs.": $!"; + + open(my ($flist), '>>', $file_list) or die "Cannot open file ".$file_list." 
for writing: $!"; + print $flist "$_\n" for keys %fl; + close ($flist) or die "Could not finish writing to ".$file_list.": $!"; + +} + +sub compare_files { + my $file1 = $_[0]; + my $file2 = $_[1]; + + my $filesize1 = -s $file1; + my $filesize2 = -s $file2; + + #skip anything that isn't a normal file + return "" unless -f $file1; + return "" unless -f $file2; + + #TODO: this might not be foolproof in some cases where one file is bigger, but still has less lines + if($filesize2 > $filesize1) { + my $filetemp = $file2; + $file2 = $file1; + $file1 = $filetemp; + } + + my $fname = $file1; + $fname =~ s/^\///; + $fname =~ s/\//-/g; + + my ($co, $current_output) = tempfile('./'.$fname.'XXXXXXXX', SUFFIX => '.tmp', UNLINK => 1); + + open(my $in1,"<",$file1) or die "Cannot open file ".$file1." for reading: $!"; + open(my $in2,"<",$file2) or die "Cannot open file ".$file2." for reading: $!"; + open($co,">>",$current_output) or die "Cannot open file ".$current_output." for writing: $!"; + + my $lineno = 1; + my $is_mismatch = 0; + + #TODO: if a line exists in file1, but is blank line, and line doesn't exist in file2 because it's shorter, the diff is not picked up + while (my $line1 = <$in1>) { + my $line2 = <$in2>; + $line2 = "\n" unless defined $line2; + if ($line1 eq $line2) { + ++$lineno; + next; + } + if($is_mismatch<1) { + print $co "Mismatch between files: \n".$file1."\n".$file2."\nPlease check:\n"; + } + print $co "line :".$lineno."\n"; + print $co "$line1"; + print $co "$line2"; + $is_mismatch=1; + ++$lineno; + } + + if ($is_mismatch == 1 && !exists $processed{$fname}) { + system "gzip $current_output"; + $processed{$fname}++; + } + + close $co or die "Cannot close file: ".$current_output.": $!"; + close $in1 or die "Cannot close file: ".$file1.": $!"; + close $in2 or die "Cannot close file: ".$file2.": $!"; + + return $file1 if $is_mismatch == 1; + return ""; +} + +sub read_list { + my $list = $_[0]; + my $stem1 = $_[1]; + my $stem2 = $_[2]; + my $changed_files = $_[3]; + my %changed_file_list; + my $current_file; + + open(my $ll,'<',$list) or die "Cannot open file ".$list." for reading: $!"; + while (my $line = <$ll>) { + chomp $line; + if ($line =~ /\.jar$|\.gz$/) { + next; + } + $current_file = compare_files($stem1.$line, $stem2.$line); + if ($current_file ne "") { + $current_file =~ s/\Q$stem1\E//; + $current_file =~ s/\Q$stem2\E//; + $changed_file_list{$current_file}++; + } + } + + close $ll or die "Cannot close file: ".$list.": $!"; + + open(my ($chflist), '>>', $changed_files) or die "Cannot open file ".$changed_files." 
for writing: $!"; + print $chflist "The following files have differences between git repo and deployment: \n"; + print $chflist "$_\n" for keys %changed_file_list; + print $chflist "\n"; + close ($chflist) or die "Cannot close file: ".$changed_files.": $!"; +} + +#locate all the files in the git repo and in the deployment directory +find_files($git_locations, $git, @git_found_files); +find_files($script_locations, $scripts, @scripts_found_files); + +#force close these files if they're not closed, otherwise you hit some weird buffering problem +if($git->opened() == 1) { + close $git or die "Cannot close file: $!"; +} + +if($scripts->opened() == 1) { + close $scripts or die "Cannot close file: $!"; +} + +#Figure out which files exist only in one or the other dir structure, and print the list of differences +list_non_present_files($scripts_files, $git_files, $file_diffs, $stem1, $stem2); +list_non_present_files($git_files, $scripts_files, $file_diffs, $stem1, $stem2); + +#then for the files that exist in BOTH directory structures alike, analyze the differences between all the files +read_list($file_list, $stem1, $stem2, $changed_files); + + diff --git a/php/snippets/call_private_function_outside_class.php b/php/snippets/call_private_function_outside_class.php @@ -0,0 +1,43 @@ +<?php + +/* + * How to access private properties and/or methods from outside the class <- without changing the class itself! + * Why? Sometimes you might be given a framework where changing the class isn't ideal, because a version update + * will overwrite your customizations of said class. + * + * How? Using Closure::call() temporarily binds $this to an instance of an object, giving the closure a + * privileged access to the private members. + * + */ + +class Foo { + private $bar = "Foo and Bar"; + private function add($a, $b) { + $this->c = $a + $b; + return $this->c; + //return $a + $b; + } +} + +$foo = new Foo; + +// Single variable example +// This should not work! +$getFooBar = function() { + return $this->bar; +}; + +// now $this is bound to $foo and PHP lets the closure access the private stuff +// because now it treats it as if it were inside the class at that moment, but it isn't +echo $getFooBar->call($foo); // Prints Foo and Bar +echo PHP_EOL; + +// Function call with parameters example +$doAdd = function() { + return $this->add(...func_get_args()); +}; + +echo $doAdd->call($foo, 10, 5); +echo PHP_EOL; + +?> diff --git a/php/snippets/remove_emoji.php b/php/snippets/remove_emoji.php @@ -0,0 +1,5 @@ +<?php + +function removeEmoji($string, $replacement = '') { + return preg_replace('/(?:\xF0[\x90-\xBF][\x80-\xBF]{2}|[\xF1-\xF3][\x80-\xBF]{3}|\xF4[\x80-\x8F][\x80-\xBF]{2})/s', $replacement, $string); +} diff --git a/practice/hackerrank/sql/print_prime_numbers.md b/practice/hackerrank/sql/print_prime_numbers.md @@ -0,0 +1,41 @@ +# Print Prime Numbers + +Problem Statement: + +Write a query to print all prime numbers less than or equal to 1000. Print your result on a single line, and use the ampersand character as your separator (instead of a space). 
+ +For example, the output for all prime numbers less than 10 would be: + +``` +2&3&5&7 +``` + +Solution: + +```sql +CREATE TEMPORARY TABLE Numbers ( + Number INT PRIMARY KEY +); + +INSERT INTO Numbers +WITH RECURSIVE OddNumbers AS ( + SELECT 3 AS Number + UNION ALL + SELECT Number + 2 FROM OddNumbers WHERE Number + 2 <= 1000 +) +SELECT Number FROM OddNumbers; + +INSERT INTO Numbers VALUES(2); + +CREATE TEMPORARY TABLE Divisors AS +SELECT * FROM Numbers WHERE Number > 1; + +SELECT GROUP_CONCAT(N.Number SEPARATOR '&') AS PrimeNumbers +FROM Numbers N +WHERE NOT EXISTS ( + SELECT 1 + FROM Divisors D + WHERE D.Number * D.Number <= N.Number + AND N.Number % D.Number = 0 +); +``` diff --git a/practice/hackerrank/sql/the_report.md b/practice/hackerrank/sql/the_report.md @@ -0,0 +1,106 @@ +# The Report + +[Link](https://www.hackerrank.com/challenges/the-report/problem) + +Tables: + +```sql +CREATE TABLE Students (ID INT, Name VARCHAR(255), Marks INT, PRIMARY KEY(ID)) ENGINE=InnoDB; +CREATE TABLE Grades (Grade INT, Min_Mark INT, Max_Mark INT, PRIMARY KEY(Grade)) ENGINE=InnoDB; +``` + +Data: + +```sql +INSERT INTO test.Students (ID,Name,Marks) VALUES +(1,'Britney',95), +(2,'Heraldo',94), +(3,'Julia',96), +(4,'Kristeen',100), +(5,'Stuart',99), +(6,'Amina',89), +(7,'Christene',88), +(8,'Salma',81), +(9,'Samantha',87), +(10,'Scarlet',80), +(11,'Vivek',84), +(12,'Aamina',77), +(13,'Belvet',78), +(14,'Paige',74), +(15,'Priya',76), +(16,'Priyanka',77), +(17,'Anna',64), +(18,'Azamat',66), +(19,'Tracey',55), +(20,'Kim',34), +(21,'George',24); +``` + +```sql +INSERT INTO test.Grades (Grade,Min_Mark,Max_Mark) VALUES +(1,0,9), +(2,10,19), +(3,20,29), +(4,30,39), +(5,40,49), +(6,50,59), +(7,60,69), +(8,70,79), +(9,80,89), +(10,90,100); +``` + +The problem statement: +Ketty gives Eve a task to generate a report containing three columns: Name, Grade and Mark. Ketty doesn't want the NAMES of those students who received a grade lower than 8. The report must be in descending order by grade -- i.e. higher grades are entered first. If there is more than one student with the same grade (8-10) assigned to them, order those particular students by their name alphabetically. Finally, if the grade is lower than 8, use "NULL" as their name and list them by their grades in descending order. If there is more than one student with the same grade (1-7) assigned to them, order those particular students by their marks in ascending order. + +Write a query to help Eve. 
+
+
+Solution
+
+```sql
+MariaDB [test]> SELECT
+    -> CASE
+    -> WHEN s.Marks REGEXP '(7[0-9])|(8[0-9])|(9[0-9])|(100)' THEN s.Name
+    -> ELSE NULL
+    -> END AS Name,
+    -> CASE
+    -> WHEN s.Marks REGEXP '^([0-9])$' THEN 1
+    -> WHEN s.Marks REGEXP '^(1[0-9])$' THEN 2
+    -> WHEN s.Marks REGEXP '^(2[0-9])$' THEN 3
+    -> WHEN s.Marks REGEXP '^(3[0-9])$' THEN 4
+    -> WHEN s.Marks REGEXP '^(4[0-9])$' THEN 5
+    -> WHEN s.Marks REGEXP '^(5[0-9])$' THEN 6
+    -> WHEN s.Marks REGEXP '^(6[0-9])$' THEN 7
+    -> WHEN s.Marks REGEXP '^(7[0-9])$' THEN 8
+    -> WHEN s.Marks REGEXP '^(8[0-9])$' THEN 9
+    -> WHEN s.Marks REGEXP '^(9[0-9])|(100)$' THEN 10
+    -> ELSE 0
+    -> END AS Grade, s.Marks FROM Students s ORDER BY Grade DESC, Name ASC, s.Marks ASC;
++-----------+-------+-------+
+| Name      | Grade | Marks |
++-----------+-------+-------+
+| Britney   |    10 |    95 |
+| Heraldo   |    10 |    94 |
+| Julia     |    10 |    96 |
+| Kristeen  |    10 |   100 |
+| Stuart    |    10 |    99 |
+| Amina     |     9 |    89 |
+| Christene |     9 |    88 |
+| Salma     |     9 |    81 |
+| Samantha  |     9 |    87 |
+| Scarlet   |     9 |    80 |
+| Vivek     |     9 |    84 |
+| Aamina    |     8 |    77 |
+| Belvet    |     8 |    78 |
+| Paige     |     8 |    74 |
+| Priya     |     8 |    76 |
+| Priyanka  |     8 |    77 |
+| NULL      |     7 |    64 |
+| NULL      |     7 |    66 |
+| NULL      |     6 |    55 |
+| NULL      |     4 |    34 |
+| NULL      |     3 |    24 |
++-----------+-------+-------+
+21 rows in set (0.000 sec)
+```
diff --git a/sed/find_in_paragraph.sh b/sed/find_in_paragraph.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+
+function find_in_paragraph() {
+
+    PAT="$1"
+    FILE="$2"
+
+    if [ -z "$PAT" ]; then
+        echo "missing pattern"
+        return 1;
+    fi
+
+    if [ -z "$FILE" ]; then
+        echo "missing file"
+        return 1;
+    fi
+
+    sed -e '/./{H;$!d;}' -e 'x;/'$PAT'/!d;' "$FILE"
+
+    #Explanation for the random wanderer looking in here:
+    #'/./{H;$!d;}' -> append each line to the hold space, and keep buffering, but only if the line is NOT blank
+    #x -> at a blank line (or end of file) swap the buffered paragraph (hold space) into the pattern space
+    #/'$PAT'/!d -> if $PAT is NOT in the paragraph, then delete the paragraph
+
+    #if you want to match more stuff, just keep slapping more patterns into it -> 'x;/'$PAT'/!d;/'$PAT2'/!d;' etc.
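+
+    #Hypothetical usage (file name made up): print only the paragraphs of notes.txt that mention "replica"
+    #  find_in_paragraph "replica" notes.txt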
+}
diff --git a/sed/flatten_newlines.sh b/sed/flatten_newlines.sh
@@ -0,0 +1,18 @@
+function flatten_newlines() {
+
+    FILE="$1"
+
+    if [ -z "$FILE" ]; then
+        echo "A file is needed"
+        return 1
+    fi
+
+    sed ':m;$!{N;s/\n/ /;bm;}' "$FILE"
+
+    #Explanation for the random wanderer looking in here:
+    #:m -> define label m
+    #$!{} -> If current line is not the last line
+    #N -> append next line to pattern space, aka join the lines
+    #s/\n/ / -> replace the newline between them with a space
+    #bm -> loop back to label m
+}
diff --git a/sed/replace_match_and_next2.sh b/sed/replace_match_and_next2.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+function replace_match_and_next2() {
+
+    PAT="$1"
+    FILE="$2"
+
+    if [ -z "$PAT" ]; then
+        echo "A pattern is needed";
+        return 1;
+    fi
+
+    if [ -z "$FILE" ]; then
+        echo "A file is needed";
+        return 1;
+    fi
+
+    sed -n -e '/'$PAT'/!p;: m' -e '//{' -e '$!{' -e 'n;n;b m' -e '}' -e'}' "$FILE"
+
+    #Explanation for the random wanderer looking in here:
+    #sed -n \
+    #    -e '/'$PAT'/!p'    print lines that do NOT match $PAT
+    #    -e ':m'            define label "m"
+    #    -e '//{'           if line matches $PAT again
+    #    -e '  $!{'         and it's NOT the last line
+    #    -e '  n; n; b m'   skip next two lines and go back to label "m"
+    #    -e '  }'           end inner conditional
+    #    -e '}'             end outer conditional
+
+    #sometimes this can be useful for cutting parts from responses that are not needed
+    #but have no control over
+
+}
diff --git a/sql/snippets/change_collation_all_tables_and_columns.sql b/sql/snippets/change_collation_all_tables_and_columns.sql
@@ -0,0 +1 @@
+SELECT CONCAT("ALTER TABLE ", TABLE_SCHEMA, '.', TABLE_NAME," CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;") AS executethis FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_SCHEMA="some_schema" AND TABLE_TYPE="BASE TABLE";
diff --git a/sql/snippets/check_sp_privs.sql b/sql/snippets/check_sp_privs.sql
@@ -0,0 +1 @@
+SELECT CONCAT('GRANT EXECUTE ON PROCEDURE ', db, '.', routine_name, ' TO ', '\'', user, '\'', '@', '\'', host, '\'', ';') FROM mysql.procs_priv WHERE routine_name = 'some_stored_proc';
diff --git a/sql/snippets/compare_schema_struct.sql b/sql/snippets/compare_schema_struct.sql
@@ -0,0 +1,65 @@
+USE some_schema;
+
+DROP PROCEDURE IF EXISTS `compare_schemas`;
+
+DELIMITER $$
+
+CREATE PROCEDURE compare_schemas(IN source_db VARCHAR(64), IN target_db VARCHAR(64))
+BEGIN
+    SELECT
+        'Only in source' AS exist_type,
+        c1.table_schema, c1.table_name, c1.column_name, c1.ordinal_position, c1.column_default, c1.is_nullable, c1.numeric_precision,
+        c1.numeric_scale, c1.character_set_name, c1.collation_name, c1.column_type, c1.column_key, c1.extra, c1.column_comment
+    FROM
+        (SELECT * FROM information_schema.columns WHERE TABLE_SCHEMA = source_db) c1
+        LEFT JOIN (SELECT * FROM information_schema.columns WHERE TABLE_SCHEMA = target_db) c2
+            ON c1.TABLE_name = c2.TABLE_name AND c1.column_name = c2.column_name
+    WHERE c2.column_name IS NULL
+
+    UNION ALL
+
+    SELECT
+        'Only in target' AS exist_type,
+        c2.table_schema, c2.table_name, c2.column_name, c2.ordinal_position, c2.column_default, c2.is_nullable, c2.numeric_precision,
+        c2.numeric_scale, c2.character_set_name, c2.collation_name, c2.column_type, c2.column_key, c2.extra, c2.column_comment
+    FROM
+        (SELECT * FROM information_schema.columns WHERE TABLE_SCHEMA = source_db) c1
+        RIGHT JOIN (SELECT * FROM information_schema.columns WHERE TABLE_SCHEMA = target_db) c2
+            ON c1.TABLE_name = c2.TABLE_name AND c1.column_name = c2.column_name
+    WHERE c1.column_name IS NULL
+
+    UNION ALL
+
+    SELECT
+        'In both schemas' AS
exist_type, + CONCAT(c1.table_schema, '/', c2.table_schema), + c1.table_name, c1.column_name, + IF(c1.ordinal_position = c2.ordinal_position OR c1.ordinal_position IS NULL AND c2.ordinal_position IS NULL, NULL, + CONCAT_WS('/', IFNULL(c1.ordinal_position, ''), IFNULL(c2.ordinal_position, ''))), + IF(c1.column_default = c2.column_default OR c1.column_default IS NULL AND c2.column_default IS NULL, NULL, + CONCAT_WS('/', IFNULL(c1.column_default, ''), IFNULL(c2.column_default, ''))), + IF(c1.is_nullable = c2.is_nullable OR c1.is_nullable IS NULL AND c2.is_nullable IS NULL, NULL, + CONCAT_WS('/', IFNULL(c1.is_nullable, ''), IFNULL(c2.is_nullable, ''))), + IF(c1.numeric_precision = c2.numeric_precision OR c1.numeric_precision IS NULL AND c2.numeric_precision IS NULL, NULL, + CONCAT_WS('/', IFNULL(c1.numeric_precision, ''), IFNULL(c2.numeric_precision, ''))), + IF(c1.numeric_scale = c2.numeric_scale OR c1.numeric_scale IS NULL AND c2.numeric_scale IS NULL, NULL, + CONCAT_WS('/', IFNULL(c1.numeric_scale, ''), IFNULL(c2.numeric_scale, ''))), + IF(c1.character_set_name = c2.character_set_name OR c1.character_set_name IS NULL AND c2.character_set_name IS NULL, NULL, + CONCAT_WS('/', IFNULL(c1.character_set_name, ''), IFNULL(c2.character_set_name, ''))), + IF(c1.collation_name = c2.collation_name OR c1.collation_name IS NULL AND c2.collation_name IS NULL, NULL, + CONCAT_WS('/', IFNULL(c1.collation_name, ''), IFNULL(c2.collation_name, ''))), + IF(c1.column_type = c2.column_type OR c1.column_type IS NULL AND c2.column_type IS NULL, NULL, + CONCAT_WS('/', IFNULL(c1.column_type, ''), IFNULL(c2.column_type, ''))), + IF(c1.column_key = c2.column_key OR c1.column_key IS NULL AND c2.column_key IS NULL, NULL, + CONCAT_WS('/', IFNULL(c1.column_key, ''), IFNULL(c2.column_key, ''))), + IF(c1.extra = c2.extra OR c1.extra IS NULL AND c2.extra IS NULL, NULL, + CONCAT_WS('/', IFNULL(c1.extra, ''), IFNULL(c2.extra, ''))), + IF(c1.column_comment = c2.column_comment OR c1.column_comment IS NULL AND c2.column_comment IS NULL, NULL, + CONCAT_WS('/', IFNULL(c1.column_comment, ''), IFNULL(c2.column_comment, ''))) + FROM + (SELECT * FROM information_schema.columns WHERE TABLE_SCHEMA = source_db) c1 + JOIN (SELECT * FROM information_schema.columns WHERE TABLE_SCHEMA = target_db) c2 + ON c1.TABLE_name = c2.TABLE_name AND c1.column_name = c2.column_name; +END $$ + +DELIMITER ; diff --git a/sql/snippets/copy_schema_struct.sql b/sql/snippets/copy_schema_struct.sql @@ -0,0 +1,73 @@ +USE some_schema; + +-- May need to add DEFINER +DROP PROCEDURE IF EXISTS `copy_schema_with_config`; + +CREATE PROCEDURE copy_schema_with_config(IN inOldDB VARCHAR(256), IN inNewDB VARCHAR(256), IN copyTables INT(1)) + BEGIN + + DECLARE v_finished INTEGER DEFAULT 0; + DECLARE v_tname varchar(100) DEFAULT ""; + + DECLARE currentTable CURSOR FOR + SELECT `table_name` + FROM INFORMATION_SCHEMA.TABLES + WHERE TABLE_SCHEMA = inOldDB + and TABLE_TYPE='BASE TABLE'; + + DECLARE CONTINUE HANDLER FOR NOT FOUND SET v_finished = 1; + -- Should be able to delete tables even if they have foreign keys + SET FOREIGN_KEY_CHECKS = 0; + set @result=NULL; + + -- Create new schema if it doesn't yet exist + SET @create = CONCAT('CREATE SCHEMA IF NOT EXISTS ',inNewDB,';'); + PREPARE create_schema FROM @create; + EXECUTE create_schema; + DEALLOCATE PREPARE create_schema; + + -- Loop over tables in the old schema + OPEN currentTable; + clone_tables: LOOP + + -- Get the next table name + FETCH currentTable INTO v_tname; + + -- Leave the Loop if we're done + IF v_finished = 1 
THEN LEAVE clone_tables; + END IF; + + -- Drop tables in target DB schema if they already exist + SET @drop = CONCAT("DROP TABLE IF EXISTS `" , inNewDB, "`.`", v_tname,"`;"); + PREPARE stmt FROM @drop; + EXECUTE stmt; + DEALLOCATE PREPARE stmt; + + -- Clone the table from the origin + SET @sql = CONCAT("CREATE TABLE `", inNewDB, "`.`", v_tname, "` LIKE `", inOldDB, "`.`", v_tname, "`;"); + PREPARE clone_table FROM @sql; + EXECUTE clone_table; + DEALLOCATE PREPARE clone_table; + + -- Optionally clone some data + -- maybe change this to a for each in kind of thing besides config + IF (copyTables > 0 && v_tname = 'config') THEN + SET @data = CONCAT("INSERT INTO `", inNewDB, "`.`", v_tname, "` SELECT * FROM `", inOldDB, "`.`", v_tname, "`;"); + PREPARE clone_data FROM @data; + EXECUTE clone_data; + DEALLOCATE PREPARE clone_data; + END IF; + + -- Give the user some result message + SET @result = IFNULL(CONCAT(@result,',',v_tname),v_tname); + + END LOOP clone_tables; + + -- Close cursor + + CLOSE currentTable; + SET FOREIGN_KEY_CHECKS = 1; + -- Print result message + SELECT CONCAT("Copied schema structure from ", inOldDB, " to ", inNewDB, ": ", @result); + + END; diff --git a/sql/snippets/db_logging.sql b/sql/snippets/db_logging.sql @@ -0,0 +1,29 @@ +USE some_schema; + +CREATE TABLE some_schema.sp_logging ( + `id` BIGINT(20) UNSIGNED NOT NULL AUTO_INCREMENT, + `runtime` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP(), + `user` VARCHAR(255) NOT NULL, + `server_id` INT UNSIGNED NOT NULL, + `sp` VARCHAR(255) NOT NULL, + `db` VARCHAR(255) NOT NULL, + `comment` VARCHAR(512) NOT NULL, + `processlist` TEXT DEFAULT NULL, + PRIMARY KEY (`id`, server_id), + KEY sp_logging_idx1 (`sp`, `runtime`) +) ; + +DELIMITER $$ +CREATE PROCEDURE some_schema.log(IN i_sp VARCHAR(255), IN i_db VARCHAR(255), i_comment VARCHAR(512)) + MODIFIES SQL DATA +BEGIN + INSERT INTO some_schema.sp_logging VALUES (null, null, CURRENT_USER(), (SELECT VARIABLE_VALUE FROM information_schema.GLOBAL_VARIABLES WHERE VARIABLE_NAME = 'server_id'), i_sp, i_db, i_comment, (SELECT GROUP_CONCAT('[', CONCAT('USER: ', user, 'DB: ', db, 'TIME: ', NOW(), 'SQL: ', LEFT(info,512)), ']') FROM information_schema.processlist WHERE info IS NOT NULL AND id<>connection_id())); + +END $$ + +DELIMITER ; + +-- THE DEFINER OF THIS SP SHOULD HAVE EXECUTE RIGHTS TO IT, AS WELL AS SELECT AND INSERT to this some_schema.sp_logging table!!! +-- GIVE EXEC RIGHTS TO WHICHEVER USERS NEED TO LOG STUFF INTO THEIR TABLE + +-- CALL some_schema.log('the sp', 'the schema', 'some message'); diff --git a/sql/snippets/find_max_date_per_group.sh b/sql/snippets/find_max_date_per_group.sh @@ -0,0 +1,17 @@ +#!/bin/bash +#Select id and datetime field, convert datetime to unix timestamp, find max value per group and print. + +echo "SELECT id,date FROM <schema>.<table>;" | mysql -u root -A | \ + sed 's/\t/,/g' | \ + awk 'BEGIN{FS=","; OFS=","} (NR>1) { + split($2, a, "[- :]"); + print $1 "," mktime(a[1]" "a[2]" "a[3]" "a[4]" "a[5]" "a[6]); + }' | \ + awk 'BEGIN{FS=","; OFS=","} { + c[$1]++; + max[$1] = (!($1 in max) || $2 > max[$1]) ? $2 : max[$1]; + } END { + for(i in c) { + print i, max[i]; + } + }' diff --git a/sql/snippets/insert_range_of_integers.sql b/sql/snippets/insert_range_of_integers.sql @@ -0,0 +1,8 @@ +INSERT INTO some_table +SELECT @ROW := @ROW + 1 AS ROW, 'whatever', NOW(), NOW() +FROM some_big_table t + JOIN (SELECT @ROW := 1000) t2 +LIMIT 100; +--insert values 1000 to 1100. As long as some_big_table has at least that many rows, this method will work. 
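+
+-- Alternative sketch, assuming MariaDB with the SEQUENCE engine available: the virtual table
+-- seq_1001_to_1100 yields the integers 1001..1100 in a column named seq, so no big table is required.
+-- INSERT INTO some_table
+-- SELECT seq, 'whatever', NOW(), NOW() FROM seq_1001_to_1100;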
+ +-- Assume some_table has columns id INT, name VARCHAR(100), stamp_created DATETIME, stamp_modified DATETIME diff --git a/sql/snippets/locking_status_monitor.sql b/sql/snippets/locking_status_monitor.sql @@ -0,0 +1,14 @@ +SELECT RPAD(concat(t.trx_id,':',t.trx_mysql_thread_id),30,' ') as 'trx_id:trx_mysql_thread_id ', + ifnull(concat('by:',l.blocking_lock_id),'no') as locked, + RPAD(IFNULL(CONCAT(processlist_user,'@',processlist_host,':',PROCESSLIST_DB),''),50,' ') AS 'user_db + ', + RPAD(t.trx_state,10,' ') as 'state ', + RPAD(t.trx_started,20,' ') as 'started ', + RPAD(timediff(now(), t.trx_started),10,' ') as 'runtime ', + RPAD(t.trx_weight,10,' ') as 'trx_weight', + t.trx_query +FROM information_schema.innodb_trx t + INNER JOIN performance_schema.threads p ON p.processlist_id=t.trx_mysql_thread_id + LEFT JOIN information_schema.innodb_lock_waits l ON l.requesting_trx_id=t.trx_id + #WHERE t.trx_query is NULL +ORDER by t.trx_started; diff --git a/sql/snippets/read_binlog.sh b/sql/snippets/read_binlog.sh @@ -0,0 +1 @@ +mysqlbinlog --base64-output=decode-rows -v bin-mariadb.000001 diff --git a/sql/snippets/table_sizes.sql b/sql/snippets/table_sizes.sql @@ -0,0 +1,4 @@ +SELECT table_name AS `Table`, ROUND(((data_length + index_length) / 1024 / 1024), 2) `Size in MB` +FROM information_schema.TABLES +WHERE table_schema = "db_name" +ORDER BY `Size in MB`; diff --git a/sql/snippets/trim_table_from_old_data.sql b/sql/snippets/trim_table_from_old_data.sql @@ -0,0 +1,22 @@ +-- add to crontab so that it can clean up from time to time +DROP procedure IF EXISTS `sp_super_delete_table_x`; + +DELIMITER $$ + +CREATE PROCEDURE `sp_super_delete_table_x`() +MODIFIES SQL DATA +BEGIN + +REPEAT + DO sleep(2); + DELETE FROM `some_schema`.`x` + WHERE date_entered < DATE_SUB(CURRENT_DATE(), INTERVAL 3 month) + ORDER BY date_entered + LIMIT 20000; -- or whatever makes sense + UNTIL row_count() = 0 +END REPEAT; + + +END $$ + +DELIMITER ; diff --git a/sql/snippets/update_ranks.sql b/sql/snippets/update_ranks.sql @@ -0,0 +1,11 @@ +SET @ROW := 1; + +UPDATE some_table t +JOIN ( + SELECT id + FROM some_other_table + WHERE some_value IN (111,123) -- Optional filtering + ORDER BY id -- Important: define row order + LIMIT 10000 +) ordered ON t.id = ordered.id +SET t.rank = (@ROW := @ROW + 1);
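+
+-- Window-function sketch of the same ranking, assuming MariaDB 10.2+ and the same hypothetical
+-- tables and columns as above; ROW_NUMBER() starts at 1 and avoids the session variable:
+-- UPDATE some_table t
+-- JOIN (
+--     SELECT id, ROW_NUMBER() OVER (ORDER BY id) AS rn
+--     FROM some_other_table
+--     WHERE some_value IN (111,123)
+--     ORDER BY id
+--     LIMIT 10000
+-- ) ordered ON t.id = ordered.id
+-- SET t.rank = ordered.rn;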