reorganize_groups.awk (2226B)
#!/bin/awk
#backstory: data format from dataprovider came in a weird way, made it necessary to compress
#some rows before adding them to the database, to reduce table sizes and boost JOIN speeds
#this part expands the compressed rows back to their original number of rows
#not the nicest looking code, but it did cut a table by ~60% which resulted in a decent boost on the DB side
#also see group_by_column.awk
# Define helper function to push a value to an array
function push(arr, value) {
arr[length(arr)+1] = value;
}
# Helper to print the array in semicolon-delimited format
function parray(arr) {
for (i = 0; i < length(arr); i++) {
printf arr[i] ";";
}
printf "\n";
}
# Field delimiter = comma, Record delimiter = newline
BEGIN {
FS = ",";
RS = "\n";
}
{
t = $1; # First column is the key
for (x = 2; x <= NF; x++) { # Loop over remaining columns
z = y[0]; # Store previous group prefix for comparison
match($x, /[^@]+/, y); # Extract prefix before @ (the group)
match($x, /@[0-9]+/, w); # Extract value after @
w_ = substr(w[0], 2); # Remove '@' prefix
if (x == 2) {
z = y[0]; # On first loop, set z to current group
}
# If same group, keep appending
if (z == y[0]) {
arr[0] = t; # First element: top-level identifier
arr[1] = z; # Second: group
push(arr, w_); # Then all group values
}
# Group changes: flush and reset
if (z != y[0]) {
parray(arr);
split("", arr); # Clear array
arr[0] = t; arr[1] = y[0];
push(arr, w_);
}
# Final column — always flush and reset
if (x == NF) {
parray(arr);
split("", arr);
next;
}
}
}
END {
printf "\n"; # Clean newline at end
}
#HOW IT WORKS
#cat file1
#dsfs,2@44,2@55,3@66,3@88,3@89,3@90,3@91,4@56,4@777,4@65,5@776,5@545
#afsf,1@00,1@99
#bdsa,2@323,2@212,6@432,6@2188
#awk -f reorganize_groups.awk <FILE>
#dsfs;2;;44;55;
#dsfs;3;;66;88;89;90;91;
#dsfs;4;;56;777;65;
#dsfs;5;;776;545;
#afsf;1;;00;99;
#bdsa;2;;323;212;
#bdsa;6;;432;2188;