Merge and aggregate datasets

From Rosetta Code
Revision as of 07:19, 8 December 2020 by Petelomax (talk | contribs) (added related tasks)
Task
Merge and aggregate datasets
You are encouraged to solve this task according to the task description, using any language you may know.

Merge and aggregate datasets

Task

Merge and aggregate two datasets as provided in .csv files into a new resuling dataset. Use the appropriate methods and data structures depending on the programming language. Use the most common libraries only when built-in functionality is not sufficient. Either load the data from the .csv files or create the required data structures hard-coded.

patients.csv file contents:

PATIENT_ID,LASTNAME
1001,Hopper
4004,Wirth
3003,Kemeny
2002,Gosling
5005,Kurtz

visits.csv file contents:

PATIENT_ID,VISIT_DATE,SCORE
2002,2020-09-10,6.8
1001,2020-09-17,5.5
4004,2020-09-24,8.4
2002,2020-10-08,
1001,,6.6
3003,2020-11-12,
4004,2020-11-05,7.0
1001,2020-11-19,5.3

Create a resulting dataset in-memory or output it to screen or file, whichever is appropriate for the programming language at hand. Merge and group per patient id and last name, get the maximum visit date and get the sum and average of the scores per patient to get the resulting datasset. Note that the visit date is purposefully provided as ISO format, so that it could also be processed as text and sorted alphabetically to determine the maximum date.

| PATIENT_ID | LASTNAME | LAST_VISIT |  SCORE_SUM | SCORE_AVG |
| 1001       |  Hopper  | 2020-11-19 |  17.4      | 5.80      |
| 2002       |  Gosling | 2020-10-08 |   6.8      | 6.80      |
| 3003       |  Kemeny  | 2020-11-12 |            |           |
| 4004       |  Wirth   | 2020-11-05 |  15.4      | 7.70      |
| 5005       |  Kurtz   |            |            |           |

Note

this task is aimed in particular at programming languages that are used in data science and data processing, suchs as Python, R, SPSS, MATLAB etc. General programming languages such as BASIC, C++, Java etc. may not be suited as well for this task, although everyone is free to add examples as they see fit.

Related tasks


Go

Translation of: Wren

<lang go>package main

import (

   "fmt"
   "math"
   "sort"

)

type Patient struct {

   id       int
   lastName string

}

// maps an id to a lastname var patientDir = make(map[int]string)

// maintains a sorted list of ids var patientIds []int

func patientNew(id int, lastName string) Patient {

   patientDir[id] = lastName
   patientIds = append(patientIds, id)
   sort.Ints(patientIds)
   return Patient{id, lastName}

}

type DS struct {

   dates  []string
   scores []float64

}

type Visit struct {

   id    int
   date  string
   score float64

}

// maps an id to lists of dates and scores var visitDir = make(map[int]DS)

func visitNew(id int, date string, score float64) Visit {

   if date == "" {
       date = "0000-00-00"
   }
   v, ok := visitDir[id]
   if ok {
       v.dates = append(v.dates, date)
       v.scores = append(v.scores, score)
       visitDir[id] = DS{v.dates, v.scores}
   } else {
       visitDir[id] = DS{[]string{date}, []float64{score}}
   }
   return Visit{id, date, score}

}

type Merge struct{ id int }

func (m Merge) lastName() string { return patientDir[m.id] } func (m Merge) dates() []string { return visitDir[m.id].dates } func (m Merge) scores() []float64 { return visitDir[m.id].scores }

func (m Merge) lastVisit() string {

   dates := m.dates()
   dates2 := make([]string, len(dates))
   copy(dates2, dates)
   sort.Strings(dates2)
   return dates2[len(dates2)-1]

}

func (m Merge) scoreSum() float64 {

   sum := 0.0
   for _, score := range m.scores() {
       if score != -1 {
           sum += score
       }
   }
   return sum

}

func (m Merge) scoreAvg() float64 {

   count := 0
   for _, score := range m.scores() {
       if score != -1 {
           count++
       }
   }
   return m.scoreSum() / float64(count)

}

func mergePrint(merges []Merge) {

   fmt.Println("| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG |")
   f := "| %d       | %-7s  | %s | %4s      | %4s      |\n"
   for _, m := range merges {
       _, ok := visitDir[m.id]
       if ok {
           lv := m.lastVisit()
           if lv == "0000-00-00" {
               lv = "          "
           }
           scoreSum := m.scoreSum()
           ss := fmt.Sprintf("%4.1f", scoreSum)
           if scoreSum == 0 {
               ss = "    "
           }
           scoreAvg := m.scoreAvg()
           sa := "    "
           if !math.IsNaN(scoreAvg) {
               sa = fmt.Sprintf("%4.2f", scoreAvg)
           }
           fmt.Printf(f, m.id, m.lastName(), lv, ss, sa)
       } else {
           fmt.Printf(f, m.id, m.lastName(), "          ", "    ", "    ")
       }
   }

}

func main() {

   patientNew(1001, "Hopper")
   patientNew(4004, "Wirth")
   patientNew(3003, "Kemeny")
   patientNew(2002, "Gosling")
   patientNew(5005, "Kurtz")
   visitNew(2002, "2020-09-10", 6.8)
   visitNew(1001, "2020-09-17", 5.5)
   visitNew(4004, "2020-09-24", 8.4)
   visitNew(2002, "2020-10-08", -1) // -1 signifies no score
   visitNew(1001, "", 6.6)          // "" signifies no date
   visitNew(3003, "2020-11-12", -1)
   visitNew(4004, "2020-11-05", 7.0)
   visitNew(1001, "2020-11-19", 5.3)
   merges := make([]Merge, len(patientIds))
   for i, id := range patientIds {
       merges[i] = Merge{id}
   }
   mergePrint(merges)

}</lang>

Output:
| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG |
| 1001       | Hopper   | 2020-11-19 | 17.4      | 5.80      |
| 2002       | Gosling  | 2020-10-08 |  6.8      | 6.80      |
| 3003       | Kemeny   | 2020-11-12 |           |           |
| 4004       | Wirth    | 2020-11-05 | 15.4      | 7.70      |
| 5005       | Kurtz    |            |           |           |

Julia

Translation of: Python

<lang julia>using CSV, DataFrames, Statistics

  1. load data from csv files
  2. df_patients = CSV.read("patients.csv", DataFrame)
  3. df_visits = CSV.read("visits.csv", DataFrame)
  1. create DataFrames from text that is hard coded, so use IOBuffer(String) as input

str_patients = IOBuffer("""PATIENT_ID,LASTNAME 1001,Hopper 4004,Wirth 3003,Kemeny 2002,Gosling 5005,Kurtz """) df_patients = CSV.read(str_patients, DataFrame) str_visits = IOBuffer("""PATIENT_ID,VISIT_DATE,SCORE 2002,2020-09-10,6.8 1001,2020-09-17,5.5 4004,2020-09-24,8.4 2002,2020-10-08, 1001,,6.6 3003,2020-11-12, 4004,2020-11-05,7.0 1001,2020-11-19,5.3 """) df_visits = CSV.read(str_visits, DataFrame)

  1. merge on PATIENT_ID, using an :outer join or we lose Kurtz, who has no data, sort by name

df_merge = sort(join(df_patients, df_visits, on="PATIENT_ID", kind=:outer), (:LASTNAME,))

fnonmissing(a, f) = isempty(a) ? [] : isempty(skipmissing(a)) ? a[1] : f(skipmissing(a))

  1. group by patient id / last name and then aggregate to get latest visit and mean score

df_result = by(df_merge, [:PATIENT_ID, :LASTNAME]) do df

   DataFrame(LATEST_VISIT = fnonmissing(df[:VISIT_DATE], maximum),
             SUM_SCORE = fnonmissing(df[:SCORE], sum),
             MEAN_SCORE = fnonmissing(df[:SCORE], mean))

end println(df_result)

</lang>

Output:
5×5 DataFrame
│ Row │ PATIENT_ID │ LASTNAME │ LATEST_VISIT │ SUM_SCORE │ MEAN_SCORE │
│     │ Int64?     │ String?  │ Dates.Date?  │ Float64?  │ Float64?   │
├─────┼────────────┼──────────┼──────────────┼───────────┼────────────┤
│ 1   │ 2002       │ Gosling  │ 2020-10-08   │ 6.8       │ 6.8        │
│ 2   │ 1001       │ Hopper   │ 2020-11-19   │ 17.4      │ 5.8        │
│ 3   │ 3003       │ Kemeny   │ 2020-11-12   │ missing   │ missing    │
│ 4   │ 5005       │ Kurtz    │ missing      │ missing   │ missing    │
│ 5   │ 4004       │ Wirth    │ 2020-11-05   │ 15.4      │ 7.7        │

Phix

<lang Phix>constant patients_txt = split(""" PATIENT_ID,LASTNAME 1001,Hopper 4004,Wirth 3003,Kemeny 2002,Gosling 5005,Kurtz""",'\n'),

        visits_txt = split("""

PATIENT_ID,VISIT_DATE,SCORE 2002,2020-09-10,6.8 1001,2020-09-17,5.5 4004,2020-09-24,8.4 2002,2020-10-08, 1001,,6.6 3003,2020-11-12, 4004,2020-11-05,7.0 1001,2020-11-19,5.3""",'\n') --or, assuming these files contain exactly the same actual raw text, -- and the use of GT_LF_STRIPPED is instead of above split('\n'): --constant patients_txt = get_text("patients.csv",GT_LF_STRIPPED), -- visits_txt = get_text("visits.csv",GT_LF_STRIPPED)

function sap(string s) return split(s,',')&{"",0,0} end function function svp(string s) return split(s,',') end function sequence patient_data = sort(apply(patients_txt[2..$],sap)),

        visit_data = sort_columns(apply(visits_txt[2..$],svp),{1,-2})

visit_data = append(visit_data,{"","","0"}) -- (add a sentinel) string last_id = "",id,name,dt,scstr,avstr atom score,score_total,average integer visit_count = 0, pdx = 1 for i=1 to length(visit_data) do

   {id,dt,scstr} = visit_data[i]
   score = iff(scstr=""?0:scanf(scstr,"%f")[1][1])
   if id!=last_id then
       if visit_count then
           average = score_total/visit_count
           patient_data[pdx][4..5] = {score_total,average}
       end if
       if i=length(visit_data) then exit end if -- (sentinel)
       score_total = score
       visit_count = 1
       while id!=patient_data[pdx][1] do pdx += 1 end while
       patient_data[pdx][3] = dt
       last_id = id
   else
       score_total += score
       visit_count += 1
   end if

end for printf(1,"| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG |\n") for i=1 to length(patient_data) do

   {id,name,dt,score,average} = patient_data[i]
   scstr = iff(score=0?"":sprintf("%4.1f",score))
   avstr = iff(average=0?"":sprintf("%4.2f",average))
   printf(1,"| %-10s |  %-7s | %10s |  %-9s | %-9s |\n",
           {id,name,dt,scstr,avstr})

end for</lang>

Output:
| PATIENT_ID | LASTNAME | LAST_VISIT |  SCORE_SUM | SCORE_AVG |
| 1001       |  Hopper  | 2020-11-19 |  17.4      | 5.80      |
| 2002       |  Gosling | 2020-10-08 |   6.8      | 3.40      |
| 3003       |  Kemeny  | 2020-11-12 |            |           |
| 4004       |  Wirth   | 2020-11-05 |  15.4      | 7.70      |
| 5005       |  Kurtz   |            |            |           |

Perl

Weird requirement: "Use the most common libraries only when built-in functionality is not sufficient." Not even a "use strict;" :) <lang perl>#!/usr/bin/perl my $fmt = '| %-11s' x 5 . "|\n"; printf $fmt, qw( PATIENT_ID LASTNAME LAST_VISIT SCORE_SUM SCORE_AVG); my ($names, $visits) = do { local $/; split /^\n/m, }; my %score; for ( $visits =~ /^\d.*/gm )

 {
 my ($id, undef, $score) = split /,/;
 $score{$id} //= [, ];
 $score and $score{$id}[0]++, $score{$id}[1] += $score;
 }

for ( sort $names =~ /^\d.*/gm )

 {
 my ($id, $name) = split /,/;
 printf $fmt, $id, $name, ( sort $visits =~ /^$id,(.*?),/gm,  )[-1],
   $score{$id}[0]
     ? ( $score{$id}[1], $score{$id}[1] / $score{$id}[0])
     : (, );
 }

__DATA__ PATIENT_ID,LASTNAME 1001,Hopper 4004,Wirth 3003,Kemeny 2002,Gosling 5005,Kurtz

PATIENT_ID,VISIT_DATE,SCORE 2002,2020-09-10,6.8 1001,2020-09-17,5.5 4004,2020-09-24,8.4 2002,2020-10-08, 1001,,6.6 3003,2020-11-12, 4004,2020-11-05,7.0 1001,2020-11-19,5.3</lang>

Output:
| PATIENT_ID | LASTNAME   | LAST_VISIT | SCORE_SUM  | SCORE_AVG  |
| 1001       | Hopper     | 2020-11-19 | 17.4       | 5.8        |
| 2002       | Gosling    | 2020-10-08 | 6.8        | 6.8        |
| 3003       | Kemeny     | 2020-11-12 |            |            |
| 4004       | Wirth      | 2020-11-05 | 15.4       | 7.7        |
| 5005       | Kurtz      |            |            |            |

Python

<lang Python># to install pandas library goto cmd prompt and type:

  1. cd %USERPROFILE%\AppData\Local\Programs\Python\Python38-32\Scripts\
  2. pip install pandas

import pandas as pd

  1. load data from csv files

df_patients = pd.read_csv (r'patients.csv', sep = ",", decimal=".") df_visits = pd.read_csv (r'visits.csv', sep = ",", decimal=".")

# load data hard coded, create data frames import io str_patients = """PATIENT_ID,LASTNAME 1001,Hopper 4004,Wirth 3003,Kemeny 2002,Gosling 5005,Kurtz """ df_patients = pd.read_csv(io.StringIO(str_patients), sep = ",", decimal=".") str_visits = """PATIENT_ID,VISIT_DATE,SCORE 2002,2020-09-10,6.8 1001,2020-09-17,5.5 4004,2020-09-24,8.4 2002,2020-10-08, 1001,,6.6 3003,2020-11-12, 4004,2020-11-05,7.0 1001,2020-11-19,5.3 """ df_visits = pd.read_csv(io.StringIO(str_visits), sep = ",", decimal=".")

  1. typecast from string to datetime so .agg can 'max' it

df_visits['VISIT_DATE'] = pd.to_datetime(df_visits['VISIT_DATE'])

  1. merge on PATIENT_ID

df_merge = df_patients.merge(df_visits, on='PATIENT_ID', how='left')

  1. groupby is an intermediate object

df_group = df_merge.groupby(['PATIENT_ID','LASTNAME'], as_index=False)

  1. note: you can use 'sum' instead of the lambda function but that returns NaN as 0 (zero)

df_result = df_group.agg({'VISIT_DATE': 'max', 'SCORE': [lambda x: x.sum(min_count=1),'mean']})

print(df_result)</lang>

  PATIENT_ID LASTNAME LAST_VISIT      SCORE
                             max <lambda_0> mean
0       1001   Hopper 2020-11-19       17.4  5.8
1       2002  Gosling 2020-10-08        6.8  6.8
2       3003   Kemeny 2020-11-12        NaN  NaN
3       4004    Wirth 2020-11-05       15.4  7.7
4       5005    Kurtz        NaT        NaN  NaN

Python: Stdlib only

Using only standard libraries and input from csv files.

<lang python>import csv

fnames = 'patients.csv patients_visits.csv'.split()

def csv2list(fname):

   with open(fname) as f:
       rows = list(csv.reader(f))
   return rows

patients, visits = data = [csv2list(fname) for fname in fnames] result = [record.copy() for record in patients] result[1:] = sorted(result[1:])

  1. %%

result[0].append('LAST_VISIT') last = {p: vis for p, vis, *score in visits[1:]} for record in result[1:]:

   p = record[0]
   record.append(last.get(p, ))
  1. %%

result[0] += ['SCORE_SUM', 'SCORE_AVG'] n = {p: 0 for p, *_ in patients[1:]} tot = n.copy() for record in visits[1:]:

   p, _, score = record
   if score:
       n[p] += 1
       tot[p] += float(score)

for record in result[1:]:

   p = record[0]
   if n[p]:
       record += [f"{tot[p]:5.1f}", f"{tot[p] / n[p]:5.2f}"]
   else:
       record += [, ]
  1. %%

for record in result:

   print(f"| {' | '.join(f'{r:^10}' for r in record)} |")</lang>
Output:
| PATIENT_ID |  LASTNAME  | LAST_VISIT | SCORE_SUM  | SCORE_AVG  |
|    1001    |   Hopper   | 2020-11-19 |    17.4    |    5.80    |
|    2002    |  Gosling   | 2020-10-08 |     6.8    |    6.80    |
|    3003    |   Kemeny   | 2020-11-12 |            |            |
|    4004    |   Wirth    | 2020-11-05 |    15.4    |    7.70    |
|    5005    |   Kurtz    |            |            |            |

R

Example is incomplete, last visit date is cast as a number. <lang R># load data from csv files

  1. setwd("C:\Temp\csv\")
  2. df_patient <- read.csv(file="patients.csv", header = TRUE, sep = ",")
  3. df_visits <- read.csv(file="visits.csv", header = TRUE, sep = ",", dec = ".", colClasses=c("character","Date","numeric"))
  1. load data hard coded, create data frames

df_patient <- read.table(text = " PATIENT_ID,LASTNAME 1001,Hopper 4004,Wirth 3003,Kemeny 2002,Gosling 5005,Kurtz ", header = TRUE, sep = ",") # character fields so no need for extra parameters colClasses etc.

df_visits <- read.table(text = " PATIENT_ID,VISIT_DATE,SCORE 2002,2020-09-10,6.8 1001,2020-09-17,5.5 4004,2020-09-24,8.4 2002,2020-10-08, 1001,,6.6 3003,2020-11-12, 4004,2020-11-05,7.0 1001,2020-11-19,5.3 ", header = TRUE, dec = ".", sep = ",", colClasses=c("character","Date","numeric"))

  1. aggregate visit date and scores

df_agg <- data.frame(

 cbind(
   PATIENT_ID = names(tapply(df_visits$VISIT_DATE, list(df_visits$PATIENT_ID), max, na.rm=TRUE)),
   last_visit = tapply(df_visits$VISIT_DATE, list(df_visits$PATIENT_ID), max, na.rm=TRUE),
   score_sum = tapply(df_visits$SCORE, list(df_visits$PATIENT_ID), sum, na.rm=TRUE),
   score_avg = tapply(df_visits$SCORE, list(df_visits$PATIENT_ID), mean, na.rm=TRUE)
 )

)

  1. merge patients and aggregate dataset
  2. all.x = all the non matching cases of df_patient are appended to the result as well (i.e. 'left join')

df_result <- merge(df_patient, df_agg, by = 'PATIENT_ID', all.x = TRUE)

print(df_result)</lang>

Output:
  PATIENT_ID LASTNAME last_visit score_sum score_avg
1       1001   Hopper      18585      17.4       5.8
2       2002  Gosling      18543       6.8       6.8
3       3003   Kemeny      18578         0       NaN
4       4004    Wirth      18571      15.4       7.7
5       5005    Kurtz       <NA>      <NA>      <NA>

Raku

<lang perl6>my @names = map { ( <PATIENT_ID LASTNAME> Z=> .list ).hash },

   ( 1001, 'Hopper'  ),
   ( 4004, 'Wirth'   ),
   ( 3003, 'Kemeny'  ),
   ( 2002, 'Gosling' ),
   ( 5005, 'Kurtz'   ),

my @visits = map { ( <PATIENT_ID VISIT_DATE SCORE> Z=> .list ).hash },

   ( 2002, '2020-09-10', 6.8 ),
   ( 1001, '2020-09-17', 5.5 ),
   ( 4004, '2020-09-24', 8.4 ),
   ( 2002, '2020-10-08', Nil ),
   ( 1001,         Nil , 6.6 ),
   ( 3003, '2020-11-12', Nil ),
   ( 4004, '2020-11-05', 7.0 ),
   ( 1001, '2020-11-19', 5.3 ),

my %v = @visits.classify: *.<PATIENT_ID>;

my @result = gather for @names -> %n {

   my @p = %v{ %n.<PATIENT_ID> }<>;
   my @dates  = @p».<VISIT_DATE>.grep: *.defined;
   my @scores = @p».<     SCORE>.grep: *.defined;
   take {
       %n,
       LAST_VISIT => ( @dates.max          if @dates  ),
       SCORE_AVG  => ( @scores.sum/@scores if @scores ),
       SCORE_SUM  => ( @scores.sum         if @scores ),
   };

}

my @out_field_names = <PATIENT_ID LASTNAME LAST_VISIT SCORE_SUM SCORE_AVG>; my @rows = @result.sort(*.<PATIENT_ID>).map(*.{@out_field_names}); say .map({$_ // }).fmt('%-10s', ' | ') for @out_field_names, |@rows;</lang>

Output:
PATIENT_ID | LASTNAME   | LAST_VISIT | SCORE_SUM  | SCORE_AVG 
1001       | Hopper     | 2020-11-19 | 17.4       | 5.8       
2002       | Gosling    | 2020-10-08 | 6.8        | 6.8       
3003       | Kemeny     | 2020-11-12 |            |           
4004       | Wirth      | 2020-11-05 | 15.4       | 7.7       
5005       | Kurtz      |            |            |           

SPSS

Note: the PATIENT_ID 5005 is missing from the result, because MATCH FILES cannot add rows for missing keys, i.e. AFAIK a "left join" is not possible is SPSS.

<lang SPSS>* set working directory to location of .csv files

CD 'C:\Temp\csv\'.

  • load patients csv data

GET DATA /TYPE=TXT

 /FILE="patients.csv"
 /ENCODING='UTF8'
 /DELCASE=LINE
 /DELIMITERS=","
 /QUALIFIER='"'
 /ARRANGEMENT=DELIMITED
 /FIRSTCASE=2
 /IMPORTCASE=ALL
 /VARIABLES=

PATIENT_ID F5.0 LASTNAME A20 . CACHE. EXECUTE.

  • sort cases is needed to match files

SORT CASES BY PATIENT_ID (A). DATASET NAME Patients WINDOW=FRONT.

  • load visits csv data

GET DATA /TYPE=TXT

 /FILE="visit.csv"
 /ENCODING='UTF8'
 /DELCASE=LINE
 /DELIMITERS=","
 /QUALIFIER='"'
 /ARRANGEMENT=DELIMITED
 /FIRSTCASE=2
 /IMPORTCASE=ALL
 /VARIABLES=

PATIENT_ID F5.0 VISIT_DATE SDATE10 SCORE F4.1 . CACHE. EXECUTE.

  • sort cases is needed, else match files will raise error "Files out of order"

SORT CASES BY PATIENT_ID (A) VISIT_DATE (A). DATASET NAME Visits WINDOW=FRONT.

  • load visits csv data
  • merge datasets, one to many, FILE is the 'one', TABLE is 'many'

MATCH FILES TABLE = Patients / FILE = Visits /BY PATIENT_ID. EXECUTE.

  • aggregate visit date and scores, group by and order (A)=ascending or (D)=descending

AGGREGATE OUTFILE * /BREAK=PATIENT_ID(A) /last_visit = MAX(VISIT_DATE) /score_avg = MEAN(SCORE) /score_sum = SUM(SCORE).</lang>

Output:
| PATIENT_ID | LASTNAME | last_visit | score_sum | score_avg |
| 1001       |  Hopper  | 2020-11-19 |      17.4 |       5.8 |
| 2002       |  Gosling | 2020-10-08 |       6.8 |       6.8 |
| 3003       |  Kemeny  | 2020-11-12 |         . |         . |
| 4004       |  Wirth   | 2020-11-05 |      15.4 |       7.7 |

SQL

<lang SQL>-- drop tables DROP TABLE IF EXISTS tmp_patients; DROP TABLE IF EXISTS tmp_visits;

-- create tables CREATE TABLE tmp_patients( PATIENT_ID INT, LASTNAME VARCHAR(20) );

CREATE TABLE tmp_visits( PATIENT_ID INT, VISIT_DATE DATE, SCORE NUMERIC(4,1) );

-- load data from csv files /* -- Note: LOAD DATA LOCAL requires `local-infile` enabled on both the client and server else you get error "#1148 command is not allowed.." LOAD DATA LOCAL INFILE '/home/csv/patients.csv' INTO TABLE `tmp_patients` FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' IGNORE 1 LINES; LOAD DATA LOCAL INFILE '/home/csv/visits.csv' INTO TABLE `tmp_visits` FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' IGNORE 1 LINES;

  • /

-- load data hard coded INSERT INTO tmp_patients(PATIENT_ID, LASTNAME) VALUES (1001, 'Hopper'), (4004, 'Wirth'), (3003, 'Kemeny'), (2002, 'Gosling'), (5005, 'Kurtz');

INSERT INTO tmp_visits(PATIENT_ID, VISIT_DATE, SCORE) VALUES (2002, '2020-09-10', 6.8), (1001, '2020-09-17', 5.5), (4004, '2020-09-24', 8.4), (2002, '2020-10-08', NULL), (1001, NULL, 6.6), (3003, '2020-11-12', NULL), (4004, '2020-11-05', 7.0), (1001, '2020-11-19', 5.3);

-- join tables and group SELECT p.PATIENT_ID, p.LASTNAME, MAX(VISIT_DATE) AS LAST_VISIT, SUM(SCORE) AS SCORE_SUM, CAST(AVG(SCORE) AS DECIMAL(10,2)) AS SCORE_AVG FROM tmp_patients p LEFT JOIN tmp_visits v ON v.PATIENT_ID = p.PATIENT_ID GROUP BY p.PATIENT_ID, p.LASTNAME ORDER BY p.PATIENT_ID;</lang>

Output:
PATIENT_ID  LASTNAME  LAST_VISIT  SCORE_SUM  SCORE_AVG
1001        Hopper    2020-11-19       17.4       5.80
2002        Gosling   2020-10-08        6.8       6.80
3003        Kemeny    2020-11-12       NULL       NULL
4004        Wirth     2020-11-05       15.4       7.70
5005        Kurtz     NULL             NULL       NULL

Wren

Library: Wren-sort
Library: Wren-fmt

<lang ecmascript>import "/fmt" for Fmt import "/sort" for Sort

class Patient {

   construct new(id, lastName) {
       _id = id
       _lastName = lastName
       if (!__dir) __dir = {}
       __dir[id] = lastName
       if (!__ids) {
           __ids = [id]
       } else {
           __ids.add(id)
           Sort.insertion(__ids)
       }
   }
   id       { _id }
   lastName { _lastName }
   // maps an id to a lastname
   static dir { __dir }
   // maintains a sorted list of ids
   static ids { __ids }

}

class Visit {

   construct new(id, date, score) {
       _id = id
       _date = date || "0000-00-00"
       _score = score
       if (!__dir) __dir = {}
       if (!__dir[id]) {
           __dir[id] = [ [_date], [score] ]
       } else {
           __dir[id][0].add(_date)
           __dir[id][1].add(score)
       }
   }
   id    { _id }
   date  { _date }
   score { _score }
    // maps an id to lists of dates and scores
   static dir { __dir }

}

class Merge {

   construct new(id) {
       _id = id
   }
   id        { _id }
   lastName  { Patient.dir[_id] }
   dates     { Visit.dir[_id][0] }
   scores    { Visit.dir[_id][1] }
   lastVisit { Sort.merge(dates)[-1] }
   scoreSum  { scores.reduce(0) { |acc, s| s ? acc + s : acc } }
   scoreAvg  { scoreSum / scores.count { |s| s } }
   static print(merges) {
       System.print("| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG |")
       var fmt = "| $d       | $-7s  | $s | $4s      | $4s      |"
       for (m in merges) {
           if (Visit.dir[m.id]) {
               var lv = (m.lastVisit != "0000-00-00") ? m.lastVisit : "          "
               var ss = (m.scoreSum > 0) ? Fmt.f(4, m.scoreSum, 1) : "    "
               var sa = (!m.scoreAvg.isNan) ? Fmt.f(4, m.scoreAvg, 2) : "    " 
               Fmt.print(fmt, m.id, m.lastName, lv, ss, sa)
           } else {
               Fmt.print(fmt, m.id, m.lastName, "          ", "    ", "    ")
           }
       }
   }

}

Patient.new(1001, "Hopper") Patient.new(4004, "Wirth") Patient.new(3003, "Kemeny") Patient.new(2002, "Gosling") Patient.new(5005, "Kurtz")

Visit.new(2002, "2020-09-10", 6.8) Visit.new(1001, "2020-09-17", 5.5) Visit.new(4004, "2020-09-24", 8.4) Visit.new(2002, "2020-10-08", null) Visit.new(1001, null , 6.6) Visit.new(3003, "2020-11-12", null) Visit.new(4004, "2020-11-05", 7.0) Visit.new(1001, "2020-11-19", 5.3)

var merges = Patient.ids.map { |id| Merge.new(id) }.toList Merge.print(merges)</lang>

Output:
| PATIENT_ID | LASTNAME | LAST_VISIT | SCORE_SUM | SCORE_AVG |
| 1001       | Hopper   | 2020-11-19 | 17.4      | 5.80      |
| 2002       | Gosling  | 2020-10-08 |  6.8      | 6.80      |
| 3003       | Kemeny   | 2020-11-12 |           |           |
| 4004       | Wirth    | 2020-11-05 | 15.4      | 7.70      |
| 5005       | Kurtz    |            |           |           |