Merge and aggregate datasets: Difference between revisions

→‎{{header|Haskell}}: added solution
(Add Transd)
(→‎{{header|Haskell}}: added solution)
Line 583:
| 5005 | Kurtz | | | |
</pre>
 
=={{header|Haskell}}==
 
===Reading and merging===
 
Merging of fields and databases is defined as a monoid operation for corresponding types.
 
<lang haskell>import Data.List
import Data.Maybe (mapMaybe, maybeToList)
import System.IO (readFile)
import Text.Read (readMaybe)
import Control.Applicative ((<|>))
 
------------------------------------------------------------
 
newtype DB = DB { entries :: [Patient] }
deriving Show
 
instance Semigroup DB where
DB a <> DB b = normalize $ a <> b
 
instance Monoid DB where
mempty = DB []
 
normalize :: [Patient] -> DB
normalize = DB
. map mconcat
. groupBy (\x y -> pid x == pid y)
. sortOn pid
------------------------------------------------------------
 
data Patient = Patient { pid :: String
, name :: Maybe String
, visits :: [String]
, scores :: [Float] }
deriving Show
 
instance Semigroup Patient where
p1 <> p2 = Patient
(if null (pid p1) then pid p2 else pid p1)
(name p1 <|> name p2)
(visits p1 <|> visits p2)
(scores p2 <|> scores p1)
 
instance Monoid Patient where
mempty = Patient mempty mempty mempty mempty
------------------------------------------------------------
 
readDB :: String -> DB
readDB = normalize
. mapMaybe readPatient
. readCSV
 
readPatient r = do
i <- lookup "PATIENT_ID" r
let n = lookup "LASTNAME" r
let d = lookup "VISIT_DATE" r >>= readDate
let s = lookup "SCORE" r >>= readMaybe
return $ Patient i n (maybeToList d) (maybeToList s)
where
readDate [] = Nothing
readDate d = Just d
 
readCSV :: String -> [(String, String)]
readCSV txt = zip header <$> body
where
header:body = splitBy ',' <$> lines txt
splitBy ch = unfoldr go
where
go [] = Nothing
go s = Just $ drop 1 <$> span (/= ch) s</lang>
 
<pre>let patients = readDB <$> readFile "patients.csv"
*Main> let visits = readDB <$> readFile "visits.csv"
 
*Main> mapM_ print . entries =<< patients
Patient {pid = "1001", name = Just "Hopper", visits = [], scores = []}
Patient {pid = "2002", name = Just "Gosling", visits = [], scores = []}
Patient {pid = "3003", name = Just "Kemeny", visits = [], scores = []}
Patient {pid = "4004", name = Just "Wirth", visits = [], scores = []}
Patient {pid = "5005", name = Just "Kurtz", visits = [], scores = []}
 
*Main> mapM_ print . entries =<< visits
Patient {pid = "1001", name = Nothing, visits = ["2020-09-17","2020-11-19"], scores = [5.3,6.6,5.5]}
Patient {pid = "2002", name = Nothing, visits = ["2020-09-10","2020-10-08"], scores = [6.8]}
Patient {pid = "3003", name = Nothing, visits = ["2020-11-12"], scores = []}
Patient {pid = "4004", name = Nothing, visits = ["2020-09-24","2020-11-05"], scores = [7.0,8.4]}
 
*Main> mapM_ print . entries =<< patients <> visits
Patient {pid = "1001", name = Just "Hopper", visits = ["2020-09-17","2020-11-19"], scores = [5.3,6.6,5.5]}
Patient {pid = "2002", name = Just "Gosling", visits = ["2020-09-10","2020-10-08"], scores = [6.8]}
Patient {pid = "3003", name = Just "Kemeny", visits = ["2020-11-12"], scores = []}
Patient {pid = "4004", name = Just "Wirth", visits = ["2020-09-24","2020-11-05"], scores = [7.0,8.4]}
Patient {pid = "5005", name = Just "Kurtz", visits = [], scores = []}</pre>
 
===Pretty tabulation===
 
<lang haskell>
tabulateDB (DB ps) header cols = intercalate "|" <$> body
where
body = transpose $ zipWith pad width table
table = transpose $ header : map showPatient ps
showPatient p = sequence cols p
width = maximum . map length <$> table
pad n col = (' ' :) . take (n+1) . (++ repeat ' ') <$> col
 
main = do
a <- readDB <$> readFile "patients.csv"
b <- readDB <$> readFile "visits.csv"
mapM_ putStrLn $ tabulateDB (a <> b) header fields
where
header = [ "PATIENT_ID", "LASTNAME", "VISIT_DATE"
, "SCORES SUM","SCORES AVG"]
fields = [ pid
, \p -> case name p of {Nothing -> []; Just n -> n}
, \p -> case visits p of {[] -> []; l -> last l}
, \p -> case scores p of {[] -> []; s -> show (sum s)}
, \p -> case scores p of {[] -> []; s -> show (mean s)} ]
 
mean lst = sum lst / genericLength lst</lang>
 
<pre>*Main> main
PATIENT_ID | LASTNAME | VISIT_DATE | SCORES SUM | SCORES AVG
1001 | Hopper | 2020-11-19 | 17.4 | 5.7999997
2002 | Gosling | 2020-10-08 | 6.8 | 6.8
3003 | Kemeny | 2020-11-12 | |
4004 | Wirth | 2020-11-05 | 15.4 | 7.7
5005 | Kurtz | | | </pre>
 
=={{header|jq}}==
Anonymous user