1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
|
{-# LANGUAGE DataKinds #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TypeApplications #-}
import Data.Aeson
import Data.DocRecord
import qualified Data.HashMap.Strict as HM
import qualified Data.Text as T
import GHC.Generics
import Porcupine
-- | A user record as decoded from the JSON input.
--
-- The 'FromJSON' instance is derived generically, so (per aeson's
-- documented default) the JSON object keys must match the field names:
-- @userName@, @userSurname@, @userAge@.
data User = User { userName :: T.Text    -- ^ First name
                 , userSurname :: T.Text -- ^ Last name
                 , userAge :: Int }      -- ^ Age (unused by the analysis below)
  deriving (Generic)
instance FromJSON User
-- | The result of analysing one user: how many times each character
-- occurs in the user's name and surname. Serialized to JSON via the
-- generically derived 'ToJSON' instance.
newtype Analysis = Analysis { numLetters :: HM.HashMap Char Int }
  deriving (Generic)
instance ToJSON Analysis
-- | Declares where users come from: the virtual location
-- @["Inputs", "User"]@ in the porcupine tree, read as JSON.
userFile :: DataSource User
userFile = dataSource location deserializer
  where
    location     = ["Inputs", "User"]
    deserializer = somePureDeserial JSONSerial
-- | Declares where the analysis result goes: the virtual location
-- @["Outputs", "Analysis"]@ in the porcupine tree, written as JSON.
analysisFile :: DataSink Analysis
analysisFile = dataSink location serializer
  where
    location   = ["Outputs", "Analysis"]
    serializer = somePureSerial JSONSerial
-- | The pure computation at the heart of the pipeline: count the
-- occurrences of every character appearing in the user's name and
-- surname (the age is ignored).
computeAnalysis :: User -> Analysis
computeAnalysis user = Analysis (HM.fromListWith (+) counted)
  where
    -- Concatenating before counting is equivalent to counting each
    -- part separately, since (+) merges duplicate keys.
    allChars = T.unpack (userName user) ++ T.unpack (userSurname user)
    counted  = [ (c, 1) | c <- allChars ]
-- | The task combining the three previous operations: load a user,
-- analyse it, write the result.
--
-- It looks opaque from the outside — no parameters, no return value —
-- but that is exactly what lets us reuse it unchanged over different
-- users (see 'mainTask').
analyseOneUser :: (LogThrow m) => PTask m () ()
analyseOneUser = load >>> analyse >>> save
  where
    load    = loadData userFile
    analyse = arr computeAnalysis
    save    = writeData analysisFile
-- | The top-level task: read which user ids to process from the
-- configuration, expand that range into a list, and run
-- 'analyseOneUser' over each id.
mainTask :: (LogThrow m) => PTask m () ()
mainTask =
      askUserIds
  >>> arr enumTRIndices  -- turn the IndexRange into a full lazy list of ids
  >>> parMapTask_ "userId" analyseOneUser
  where
    -- A single option field under "Settings" holding a range of user
    -- ids (an IndexRange); by default the range is just the index 0.
    askUserIds =
      getOption ["Settings"]
                (docField @"users" (oneIndex (0::Int)) "The user ids to load")
-- | Entry point: run 'mainTask' under a full CLI/YAML configuration
-- with the base execution contexts.
main :: IO ()
main = runPipelineTask config contexts mainTask ()
  where
    -- Program name for the CLI, default config file to create, and the
    -- default root for the porcupine tree.
    config =
      FullConfig "example1" "porcupine-example1.yaml" "porcupine-core/examples/example1/data" ()
    -- 'baseContexts' is the minimum context set: katip logging and
    -- local file access (through ResourceT). Its string argument is
    -- the logger's top namespace; with FullConfig (and therefore a
    -- CLI) the prog name above ("example1") is inherited by the
    -- logger, so it can stay blank.
    contexts = baseContexts ""
|