diff options
Diffstat (limited to 'examples')
-rw-r--r-- | examples/example-radon/ExampleRadon.hs | 169 | ||||
-rw-r--r-- | examples/example-radon/Plotting.hs | 79 | ||||
-rw-r--r-- | examples/example0.1/Example0_1.hs | 36 | ||||
-rw-r--r-- | examples/example0/Example0.hs | 30 | ||||
-rw-r--r-- | examples/example1/Example1.hs | 75 | ||||
-rw-r--r-- | examples/example2/Example2.hs | 78 |
6 files changed, 467 insertions, 0 deletions
diff --git a/examples/example-radon/ExampleRadon.hs b/examples/example-radon/ExampleRadon.hs new file mode 100644 index 0000000..1695e6b --- /dev/null +++ b/examples/example-radon/ExampleRadon.hs @@ -0,0 +1,169 @@ +{-# LANGUAGE DataKinds #-} +{-# LANGUAGE DeriveAnyClass #-} +{-# LANGUAGE DeriveGeneric #-} +{-# LANGUAGE DuplicateRecordFields #-} +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE OverloadedLabels #-} +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE ScopedTypeVariables #-} +{-# LANGUAGE TypeApplications #-} +{-# LANGUAGE TupleSections #-} +{-# LANGUAGE Arrows #-} +{-# OPTIONS_GHC -Wwarn -Wno-missing-signatures -Wno-name-shadowing #-} + +-- This example is loosely based on the series of blog posts by Thomas Wiecki +-- https://twiecki.io/blog/2014/03/17/bayesian-glms-3/ . + +import Control.Monad +import Data.Aeson +import qualified Data.Csv as Csv +import Data.DocRecord +import qualified Data.Text as T +import qualified Data.Vector as V +import Data.Functor +import GHC.Generics +import Porcupine +import Prelude hiding (id, (.)) +import qualified Control.Foldl as L +import Graphics.Vega.VegaLite as VL +import Control.Monad.Bayes.Class +import Control.Monad.Bayes.Sampler +import Control.Monad.Bayes.Weighted +import Control.Monad.Bayes.Traced +import Numeric.Log + +import Plotting -- In the same folder + + +data RadonObservation = RadonObservation + { state :: !T.Text + , county :: !T.Text + , basement :: !T.Text + , log_radon :: !Double } + deriving (Generic, FromJSON, ToJSON + ,Csv.FromNamedRecord, Csv.ToNamedRecord, Csv.DefaultOrdered) + +-- | We want to read each RadonObservation as a set of Records. This supports +-- reading from CSV files with headers and from JSON files. The Vector cannot +-- directly be read from the CSV, as we would not known whether the columns are +-- positional or nominal. This is why we use the 'Records' wrapper here (for +-- nominal columns). This requires our datatype to instantiate +-- Csv.From/ToNamedRecord +radonObsSerials :: BidirSerials (V.Vector RadonObservation) +radonObsSerials = dimap Records fromRecords $ -- We wrap/unwrap the Records + someBidirSerial (CSVSerial "csv" True ',') + <> + someBidirSerial JSONSerial + +radonObsFile :: DataSource (V.Vector RadonObservation) +radonObsFile = dataSource ["data", "radon"] radonObsSerials + +filteredCsvFile :: DataSink (V.Vector RadonObservation) +filteredCsvFile = dataSink ["debug", "radon-filtered"] radonObsSerials + +vegaliteSerials :: PureSerials VegaLite +vegaliteSerials = + lmap VL.toHtml (somePureSerial $ PlainTextSerial $ Just "html") + <> lmap VL.fromVL (somePureSerial JSONSerial) + +writeViz name = writeData (dataSink ["viz", name] vegaliteSerials) + +data Summary = Summary { numRows :: Int + , uniqueStates :: [T.Text] + , numUniqueCounties :: Int } + deriving (Show) + +foldSummary :: L.Fold RadonObservation Summary +foldSummary = Summary <$> L.length + <*> L.premap state L.nub + <*> (L.premap county L.nub <&> length) + +data ModelParams = ModelParams + { rateWithB :: Double -- ^ ratio of houses with and without basement + , radonWithB :: Double -- ^ radon level in houses with basement + , radonWithoutB :: Double -- ^ radon level in houses without basement + , noiseWithB :: Double -- ^ variation around radonWithB + , noiseWithoutB :: Double -- ^ variation around radonWithoutB + } deriving (Eq, Show, Generic, ToJSON, FromJSON) + +priorModel :: MonadSample m => m ModelParams +priorModel = + ModelParams <$> uniform 0 1 + <*> uniform 0 10 + <*> uniform 0 10 + <*> uniform 0 10 + <*> uniform 0 10 + +likelihood :: ModelParams -> (Bool, Double) -> Log Double +likelihood params (hasBasement, radonObserved) = + case hasBasement of + True -> let radonModel = radonWithB params + noiseModel = noiseWithB params + rate = realToFrac $ rateWithB params + in rate * normalPdf radonModel noiseModel radonObserved + False -> let radonModel = radonWithoutB params + noiseModel = noiseWithoutB params + rate = realToFrac $ 1 - rateWithB params + in rate * normalPdf radonModel noiseModel radonObserved + +model :: MonadInfer m => [(Bool, Double)] -> m ModelParams +model observations = do + params <- priorModel + mapM_ (score . likelihood params) observations + return params + +posteriorForward :: MonadSample m => m ModelParams -> m (Bool, Double) +posteriorForward model = do + params <- model + hasBasement <- bernoulli (rateWithB params) + value <- case hasBasement of + True -> normal (radonWithB params) (noiseWithB params) + False -> normal (radonWithoutB params) (noiseWithoutB params) + return (hasBasement, value) + +sampleFlatLinRegModel :: (LogThrow m) => PTask m () () +sampleFlatLinRegModel = proc () -> do + radonObs <- loadData radonObsFile -< () + writeData filteredCsvFile -< radonObs + let (summary,xs,ys) = flip L.fold radonObs $ + (,,) <$> foldSummary + <*> L.premap ((== "Y") . basement) L.list + <*> L.premap log_radon L.list + xLbl = "has basement" + yLbl = "log radon" + logInfo -< show summary + + vizSize <- getOption ["viz", "options"] + (docField @"vizSize" (400,400) "Width & height of visualisations") -< () + writeViz "1" -< plot vizSize + (S $ scatter2 xLbl yLbl (-3,5)) + (Cols [(xLbl, VL.Booleans xs) + ,(yLbl, VL.Numbers ys)]) + nsamples <- getOption ["sampling", "options"] + (docField @"nsamples" 5000 "Number of samples to draw") -< () + samples <- ioTask -< + sampleIOfixed $ prior $ mh nsamples $ model (zip xs ys) + writeViz "2" -< plot vizSize + (H [[density2DPlot "radonWithB" "radonWithoutB" (0,2) (0,2)] + ,[density2DPlot "noiseWithB" "noiseWithoutB" (0,2) (0,2)]]) + (J samples) + + samples <- ioTask -< + sampleIOfixed $ prior $ mh nsamples $ posteriorForward $ model (zip xs ys) + let (xModel, yModel) = unzip samples + writeViz "3" -< plot vizSize + (S $ scatter2 xLbl yLbl (-3,5)) + (Cols [(xLbl, VL.Booleans xModel) + ,(yLbl, VL.Numbers yModel)]) + + +runIn topdir = runPipelineTask + (FullConfig "example-radon" -- Name of the executable (for --help) + "example-radon.yaml" -- Default config file path + topdir -- Default root directory for mappings + ()) + (baseContexts "") + sampleFlatLinRegModel () + +main :: IO () +main = runIn "examples/example-radon" diff --git a/examples/example-radon/Plotting.hs b/examples/example-radon/Plotting.hs new file mode 100644 index 0000000..f4522da --- /dev/null +++ b/examples/example-radon/Plotting.hs @@ -0,0 +1,79 @@ +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE ExistentialQuantification #-} + +module Plotting where + +import Data.Aeson +import Graphics.Vega.VegaLite +import Data.Text (Text, pack) + + +barPlot :: Text -> VLSpec +barPlot xName = + let enc = encoding + . position X [PName xName, PmType Nominal, PAxis [AxGrid True, AxTitle xName]] + . position Y [PName "binnedData", PAggregate Count, PmType Quantitative, PAxis [AxGrid False, AxTitle "count"]] + in asSpec $ [mark Bar [MOpacity 1.0, MColor "#a3c6de"], enc []] + +barPlot2 :: Text -> Text -> VLSpec +barPlot2 xName yName = + let enc = encoding + . position X [PName xName, PmType Nominal, PAxis [AxGrid True, AxTitle xName]] + . position Y [PName yName, PmType Quantitative, PAxis [AxGrid False, AxTitle yName]] + in asSpec $ [mark Bar [MOpacity 1.0, MColor "#a3c6de"], enc []] + +linePlot :: Text -> Text -> VLSpec +linePlot xName yName = + let enc = encoding + . position X [PName xName, PmType Quantitative, PAxis [AxGrid True, AxTitle xName]] + . position Y [PName yName, PmType Quantitative, PAxis [AxGrid False, AxTitle yName]] + in asSpec $ [mark Line [MColor "green"], enc []] + +density2DPlot :: Text -> Text -> (Double, Double) -> (Double, Double) -> VLSpec +density2DPlot xName yName (xmin, xmax) (ymin, ymax) = + let enc = encoding + . position X [PName xName + ,PScale [SDomain (DNumbers [xmin, xmax])] + ,PBin [Step 0.1], PmType Quantitative, PAxis [AxGrid True, AxTitle xName]] + . position Y [PName yName + ,PScale [SDomain (DNumbers [ymin, ymax])] + ,PBin [Step 0.1] + ,PmType Quantitative + ,PAxis [AxGrid True, AxTitle yName]] + . color [ MAggregate Count, MName "col", MmType Quantitative + , MScale [{-SReverse False,-} SScheme "blues" [0.0, 1.0]]] + in asSpec $ [mark Rect [MClip True], enc []] + +scatter2 :: Text -> Text -> (Double, Double) -> VLSpec +scatter2 xName yName (ymin, ymax) = + let enc = encoding + . position X [PName xName, PmType Nominal, PAxis [AxGrid True, AxTitle xName]] + . position Y [PName yName + , PScale [SDomain (DNumbers [ymin, ymax])] + , PmType Quantitative + , PAxis [AxGrid True, AxTitle yName]] + in asSpec $ [mark Tick [MClip True], enc []] + +data SpecGrid = H [[VLSpec]] | V [[VLSpec]] | L [VLSpec] | S VLSpec + +data InputData = Cols [(Text, DataValues)] + | forall j. (ToJSON j) => J j + | File FilePath + +plot :: (Double, Double) -> SpecGrid -> InputData -> VegaLite +plot (figw,figh) gridOfLayers dat = + let desc = description "Plot" + dat' = case dat of + Cols cols -> foldl (.) (dataFromColumns []) (map (uncurry dataColumn) cols) [] + J o -> dataFromJson (toJSON o) [] + File fp -> dataFromSource (pack fp) [] + conf = configure + -- . configuration (Axis [ DomainWidth 1 ]) + . configuration (SelectionStyle [ ( Single, [ On "dblclick" ] ) ]) + . configuration (View [ViewStroke (Just "transparent")]) + spec = case gridOfLayers of + S l -> layer [l] + L ls -> layer ls + H lss -> hConcat (map (asSpec . (:[]) . layer) lss) + V lss -> vConcat (map (asSpec . (:[]) . layer) lss) + in toVegaLite [width figw, height figh, conf [], desc, dat', spec] diff --git a/examples/example0.1/Example0_1.hs b/examples/example0.1/Example0_1.hs new file mode 100644 index 0000000..517921f --- /dev/null +++ b/examples/example0.1/Example0_1.hs @@ -0,0 +1,36 @@ +{-# LANGUAGE DataKinds #-} +{-# LANGUAGE TypeApplications #-} +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE GADTs #-} +{-# LANGUAGE Arrows #-} + +import Data.DocRecord +import qualified Data.Text.Lazy as T +import Porcupine +import Prelude hiding (id, (.)) + + +yzCompress :: T.Text -> T.Text +yzCompress = T.concat . map counts . T.group + where + counts s = T.pack (show (T.length s)) <> T.take 1 s <> "," + +resultFile :: DataSink T.Text +resultFile = dataSink ["result"] $ + somePureSerial (PlainTextSerial (Just "txt")) + <> lmap yzCompress + (somePureSerial (PlainTextSerial (Just "yz"))) + +myTask :: (LogThrow m) => PTask m () () +myTask = proc () -> do + (FV chars :& FV nums :& _) <- + getOptions ["options"] + ( docField @"chars" "a" "The chars to repeat" + :& docField @"replications" [10::Int] "The numbers of replications" + :& RNil) -< () + let txt = T.concat $ + zipWith (\s n -> T.replicate (fromIntegral n) (T.singleton s)) chars nums + writeData resultFile -< txt + +main :: IO () +main = runLocalPipelineTask (FullConfig "example0.1" "example0_1.yaml" "." ()) myTask () diff --git a/examples/example0/Example0.hs b/examples/example0/Example0.hs new file mode 100644 index 0000000..a8e74c3 --- /dev/null +++ b/examples/example0/Example0.hs @@ -0,0 +1,30 @@ +{-# LANGUAGE DataKinds #-} +{-# LANGUAGE TypeApplications #-} +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE GADTs #-} +{-# LANGUAGE Arrows #-} + +import Data.DocRecord +import qualified Data.Text.Lazy as T +import Porcupine + + +resultFile :: DataSink T.Text +resultFile = dataSink ["result"] $ + somePureSerial (PlainTextSerial (Just "txt")) + +myTask :: (LogThrow m) => PTask m () () +myTask = proc () -> do + (FV char :& FV num :& _) <- getMyOptions -< () + let txt = T.replicate (fromIntegral num) (T.singleton char) + writeData resultFile -< txt + where + getMyOptions = + getOptions ["options"] + ( docField @"char" 'a' "The character to repeat" + :& docField @"replications" (10::Int) "The number of replications" + :& RNil) + +main :: IO () +-- main = simpleRunPTask myTask () +main = runLocalPipelineTask (FullConfig "example0" "example0.yaml" "." ()) myTask () diff --git a/examples/example1/Example1.hs b/examples/example1/Example1.hs new file mode 100644 index 0000000..b81df38 --- /dev/null +++ b/examples/example1/Example1.hs @@ -0,0 +1,75 @@ +{-# LANGUAGE DataKinds #-} +{-# LANGUAGE DeriveGeneric #-} +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE ScopedTypeVariables #-} +{-# LANGUAGE TypeApplications #-} + +import Data.Aeson +import Data.DocRecord +import qualified Data.HashMap.Strict as HM +import qualified Data.Text as T +import GHC.Generics +import Porcupine + + +data User = User { userName :: T.Text + , userSurname :: T.Text + , userAge :: Int } + deriving (Generic) +instance FromJSON User + +newtype Analysis = Analysis { numLetters :: HM.HashMap Char Int } + deriving (Generic) +instance ToJSON Analysis + +-- | How to load users +userFile :: DataSource User +userFile = dataSource ["Inputs", "User"] + (somePureDeserial JSONSerial) + +-- | How to write analysis +analysisFile :: DataSink Analysis +analysisFile = dataSink ["Outputs", "Analysis"] + (somePureSerial JSONSerial) + +-- | The simple computation we want to perform +computeAnalysis :: User -> Analysis +computeAnalysis (User name surname _) = Analysis $ + HM.fromListWith (+) $ [(c,1) | c <- T.unpack name] + ++ [(c,1) | c <- T.unpack surname] + +-- | The task combining the three previous operations. +-- +-- This task may look very opaque from the outside, having no parameters and no +-- return value. But we will be able to reuse it over different users without +-- having to change it at all. +analyseOneUser :: (LogThrow m) => PTask m () () +analyseOneUser = + loadData userFile >>> arr computeAnalysis >>> writeData analysisFile + +mainTask :: (LogThrow m) => PTask m () () +mainTask = + -- First we get the ids of the users that we want to analyse. We need only one + -- field that will contain a range of values, see IndexRange. By default, this + -- range contains just one value, zero. + getOption ["Settings"] (docField @"users" (oneIndex (0::Int)) "The user ids to load") + -- We turn the range we read into a full lazy list: + >>> arr enumTRIndices + -- Then we just map over these ids and call analyseOneUser each time: + >>> parMapTask_ "userId" analyseOneUser + +main :: IO () +main = runPipelineTask (FullConfig "example1" "porcupine-example1.yaml" "porcupine-core/examples/example1/data" ()) + -- The CLI/Yaml configuration to use (prog name, + -- default config file to create, and default root to + -- use for the porcupine tree) + (baseContexts "") + -- The contexts to use. 'baseContexts' is the + -- minimum. It gives out katip logging and local files + -- access (through ResourceT). The string param is the + -- top namespace for the logger. When we use + -- FullConfig (and therefore CLI), the progName for + -- the CLI given above ("example1") will be inherited + -- by the logger, so we can leave it blank + mainTask () diff --git a/examples/example2/Example2.hs b/examples/example2/Example2.hs new file mode 100644 index 0000000..e9e043c --- /dev/null +++ b/examples/example2/Example2.hs @@ -0,0 +1,78 @@ +{-# LANGUAGE DataKinds #-} +{-# LANGUAGE DeriveGeneric #-} +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE ScopedTypeVariables #-} +{-# LANGUAGE TypeApplications #-} + +import Data.Aeson +import qualified Data.Text as T +import GHC.Generics +import Porcupine + +import Prelude hiding (id, (.)) + + +-- This example uses the porcupine to read a data that represents the evloution of a given stock in given data and +-- gives back the average and standard deviation of the stock on that date. + +data Stockdaily = Stockdaily {date :: String , close :: Double} + deriving (Generic) +instance FromJSON Stockdaily + +newtype Stock = Stock { chart :: [Stockdaily] } + deriving (Generic) +instance FromJSON Stock + +getCloseStock :: Stock -> [Double] +getCloseStock s = map close (chart s) + +-- | How to load Stock prices +stockFile :: DataSource Stock +stockFile = dataSource ["Inputs", "Stock"] + (somePureDeserial JSONSerial) + +-- | How to write the smoothed stock prices +globalMatrix :: DataSink (Tabular [[Double]]) +globalMatrix = dataSink ["Outputs" , "globalData"] + (somePureSerial (CSVSerial (T.pack "csv") False ',')) + +avg :: [Double] -> Double +avg list = let s = sum list + n = fromIntegral (length list) + in s/n + +msliding :: Int -> [a] -> [[a]] +msliding n p = case p of + [] -> [] + (_:xs) -> [take n p] ++ (msliding n xs) + +-- | The simple computation we want to perform +computeSmoothedCurve :: Stock -> [Double] +computeSmoothedCurve s = curve + where + price = getCloseStock s + curve = map avg (msliding 10 price) + +analyseStocks :: (LogThrow m) => PTask m () () +analyseStocks = + arr (const ["aapl"::TRIndex, "fb" , "googl"]) -- We want the stocks for some + -- fixed set of companies + >>> loadDataList "company" stockFile + >>> arr (Tabular Nothing . map (\(_idx,stock) -> computeSmoothedCurve stock)) + >>> writeData globalMatrix + +main :: IO () +main = runPipelineTask (FullConfig "example2" "porcupine-example2.yaml" "porcupine-core/examples/example2/data" ()) + -- The CLI/Yaml configuration to use (prog name, + -- default config file to create, and default root to + -- use for the porcupine tree) + (baseContexts "") + -- The contexts to use. 'baseContexts' is the + -- minimum. It gives out katip logging and local files + -- access (through ResourceT). The string param is the + -- top namespace for the logger. When we use + -- FullConfig (and therefore CLI), the progName for + -- the CLI given above ("example1") will be inherited + -- by the logger, so we can leave it blank + analyseStocks () |