bash - Group partial matches in each column side by side in csv file - TagMerge
2Group partial matches in each column side by side in csv fileGroup partial matches in each column side by side in csv file

Group partial matches in each column side by side in csv file

Asked 1 year ago
0
2 answers

It may be described as "Fuzzy Matching".

Python comes with a simple difflib.get_close_matches() function.

#!/usr/bin/python3
import csv, difflib, operator, sys, tabulate, datetime
from   collections import defaultdict

# Input is today's dated diff export; the merged result goes to output_file.
date = datetime.datetime.now().strftime('%m-%d-%Y')
input_file = "torrance-austin-diff-%s.csv" % date
output_file = 'output.csv'


def _split_entry(entry):
    """Split "name:version" at the first colon.

    Returns (name, version); version is '' when the entry has no colon.
    """
    name, _, version = entry.partition(':')
    return name, version


with open(input_file, newline='') as csv_in:
    reader = csv.reader(csv_in)

    # read first line containing column names
    colnames = next(reader, None)
    if colnames is None:
        # an empty file would otherwise surface as a bare StopIteration
        raise SystemExit("%s: empty file, no header row found" % input_file)

    # name -> set of versions seen in each column
    torrance = defaultdict(set)
    austin   = defaultdict(set)

    # split up into "name":"version"
    for row in reader:
        # csv.reader yields [] for blank lines and rows may be short or long;
        # pad to two cells and ignore any extra columns instead of crashing
        tor, aus = (row + ['', ''])[:2]
        if tor:
            name, version = _split_entry(tor)
            torrance[name].add(version)
        if aus:
            name, version = _split_entry(aus)
            austin[name].add(version)

def _fullname(name, version):
    """Rebuild the "name:version" label; a blank version yields just the name."""
    return ':'.join([name, version]) if version else name


# rows are [torrance_cell, austin_cell]
matched   = []
unmatched = []

for name in torrance:
    # sorted() keeps the pairing reproducible; set order varies between runs
    for version in sorted(torrance[name]):
        # exact match on "name": candidates are the austin versions of the
        # same name that have not been paired yet
        candidates = sorted(austin[name]) if name in austin else []
        if not candidates:
            # name absent from austin, or austin has fewer versions than
            # torrance for this name (previously an IndexError)
            unmatched.append([_fullname(name, version), ''])
            continue

        # fuzzy match on "version"; if nothing is close enough, fall back to
        # the first remaining candidate so same-name entries still line up
        close = difflib.get_close_matches(version, candidates, n=1)
        chosen = close[0] if close else candidates[0]

        # remove chosen match from search list
        austin[name].remove(chosen)
        matched.append([
            _fullname(name, version),  # torrance
            _fullname(name, chosen),   # austin
        ])

# add unmatched items remaining in austin
for name, versions in austin.items():
    for version in sorted(versions):
        unmatched.append(['', _fullname(name, version)])

# Matched pairs come first, followed by entries present on only one side.
rows = [*matched, *unmatched]

# Show the comparison on the terminal as a psql-style table.
table = tabulate.tabulate(rows, headers=colnames, tablefmt='psql')
print(table)

# Persist the same data, header row first, to the output CSV.
with open(output_file, mode='w', newline='') as csv_out:
    csv.writer(csv_out).writerows([colnames, *rows])

Result (the table printed by the script):

+╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌+╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌+
| TORRANCE                                                        | AUSTIN                                                         |
|╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌+╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌|
| dsiconfig-init:fm1-051021191732                                 | dsiconfig-init:fm1-051021200915                                |
| lmconsole-ui:6.20-67c59a7-090321115651                          | lmconsole-ui:LoanMagic-6.19-5b7a123                            |
| reporting-service:3.9.6-9291dc5-020122104141                    | reporting-service:3.9.1-dac3d50-120321111212                   |
| customer-expiration-notification-cli:1.2.2-c80493c-011822135312 |                                                                |
| eeligibility-service-cli:master-251852f-042721181924            |                                                                |
| providerservices-mysql-cronjob:master-baddfe9-101121133600      |                                                                |
|                                                                 | cascustomer-service:4.8.0-SNAPSHOT-20210623172337-062321102337 |
|                                                                 | dm-nginx:latest                                                |
+╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌+╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌+

Source: link

0

The simplest approach for achieving that would be typing the following command:
cat *csv > combined.csv
awk '(NR == 1) || (FNR > 1)' *.csv > 1000Plus5years_companies_data.csv
Use csvstack from csvkit:
csvstack *.csv  > out.csv
Use paste:
paste -d ',' file1.csv file2.csv ... fileN.csv
WARNING: This script assumes that all input files have the same number of lines. Output will likely be unusable if any file has a different number of lines from any of the others.
#!/usr/bin/perl

# Paste CSV files side by side: output line N is line N of every input file
# (in command-line order) joined with commas.

use strict;
use warnings;

# $rows[$line][$file] holds the text of line $line from input file $file
my @rows = ();

for my $file_idx (0 .. $#ARGV) {
  my $path = $ARGV[$file_idx];
  my $linenum = 0;

  # Lexical handle and named loop variables: reading into the implicit $_
  # here would overwrite the @ARGV element a plain foreach aliases to $_.
  open(my $fh, "<", $path) or die "couldn't open $path for read: $!\n";

  while (my $line = <$fh>) {
    chomp $line;
    $rows[$linenum++][$file_idx] = $line;
  }

  close($fh);
}

foreach my $row (@rows) {
  # A file that ran out of lines contributes one empty field, so the text of
  # the other files is not shifted left (same behaviour as paste -d ',').
  print join(",", map { defined $_ ? $_ : "" } @{$row}[0 .. $#ARGV]), "\n";
}

Source: link

Recent Questions on bash

    Programming Languages