summaryrefslogtreecommitdiff
path: root/src/Index.hs
blob: 5486692438c9acbcf86cdd5c0d86a6a2dd604d33 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
module Index (
  Index,
  initIndex,
  indexGetEventsLinear,
  indexGetEventsDay,
  indexNumEvents,
  indexCalendar,
) where

import Data.Time.Calendar
import Control.Monad (forM)
import Data.ByteString qualified as BS
import Data.ByteString (ByteString)
import Data.Char (isDigit)
import Data.List (sort, scanl')
import Data.Map.Strict (Map)
import Data.Map.Strict qualified as Map
import Data.Text qualified as T
import Data.Vector.Storable qualified as VS
import Data.Word (Word32)
import System.Clock qualified as Clock
import System.Directory
import System.FilePath
import Text.Read (readMaybe)

import Cache
import Config (Channel(..))
import Mmap
import Util
import ZNC


data ChanIndex = ChanIndex
  { ciStartDay :: !Day
  , ciEndDay :: !Day  -- ^ inclusive
  , ciCountUntil :: !(VS.Vector Int)  -- ^ number of events up to and /including/ this day
  , ciTotal :: !Int }
  deriving (Show)

data Index = Index !FilePath
                   !(Map Channel ChanIndex)
                   !(Cache (Channel, YMD) (ByteString, VS.Vector Word32))

-- init

initIndex :: FilePath -> [Channel] -> IO Index
initIndex basedir toimport = do
  cache <- cacheNew 100

  c_start <- Clock.getTime Clock.Realtime
  items <-
    fmap concat . forM (map chanNetwork toimport) $ \nwT -> do
      let nw = T.unpack nwT
      forM [ch | Channel nwT' ch <- toimport, nwT' == nwT] $ \chT -> do
        let ch = T.unpack chT
        files <- listDirectory (basedir </> nw </> ch)
        days <- fmap sort . forM files $ \fn -> do
          let path = basedir </> nw </> ch </> fn
          -- putStrLn $ "Parsing " ++ path ++ " (" ++ show (parseFileName fn) ++ " -> " ++ show (dateToDay (parseFileName fn)) ++ ")"
          events <- parseLog <$> BS.readFile path
          return (uncurry3 fromGregorian (parseFileName fn), length events)
        let minday = minimum (map fst days)
            maxday = maximum (map fst days)
            ndays = fromIntegral @Integer @Int (diffDays maxday minday + 1)
        -- traceM $ nw ++ "/" ++ ch ++ ": days = " ++ show [(toFileName (dayToYMD d), i) | (d, i) <- days]
        let countScan = VS.fromListN ndays (drop 1 $ scanl' (+) 0 (makeCounts [minday..maxday] days))
        let ntotal = sum (map snd days)
        return (Channel nwT chT
               ,ChanIndex
                  { ciStartDay = minday
                  , ciEndDay = maxday
                  , ciCountUntil = countScan
                  , ciTotal = ntotal})
  c_end <- Clock.getTime Clock.Realtime
  let timetakenSecs = fromIntegral @_ @Double (Clock.toNanoSecs (Clock.diffTimeSpec c_start c_end)) / 1e9
  putStrLn $ "Parsing/indexing logs in " ++ show basedir ++ " took " ++ show timetakenSecs ++ " secs"

  return (Index basedir (Map.fromList items) cache)

makeCounts :: [Day] -> [(Day, Int)] -> [Int]
makeCounts [] [] = []
makeCounts [] _ = error "makeCounts: more entries than days in range"
makeCounts (d:ds) ents@((d',n):ents')
  | d == d' = n : makeCounts ds ents'
  | d < d'  = 0 : makeCounts ds ents
  | otherwise = error $ "makeCounts: duplicate entry? " ++ show (d, toGregorian d, d', toGregorian d')
makeCounts (_:ds) [] = 0 : makeCounts ds []

-- search

-- | Returns proper lazy list of events. Reading the files happens strictly,
-- but parsing the events happens lazily.
indexGetEventsLinear :: Index -> Channel -> Int -> Int -> IO [(YMDHMS, Event)]
indexGetEventsLinear index@(Index _ mp _) chan from count
  | from + count < 0 = return []
  | from >= ciTotal ci = return []
  | otherwise = do
      let scan = ciCountUntil ci
          day1idx = binSearch scan from
          day1 = day1idx `addDays'` ciStartDay ci
          neventBeforeDay1 | day1idx == 0 = 0
                           | otherwise = scan VS.! (day1idx - 1)
          neventInclDay1 = scan VS.! day1idx
          neventOnDay1 = neventInclDay1 - neventBeforeDay1
          off1 = from - neventBeforeDay1
          -- day2 is inclusive, off2 is exclusive
          (day2, off2)
            | from + count <= neventInclDay1 = (day1, off1 + count)
            | day1idx == VS.length scan - 1 = (day1, neventOnDay1)
            | otherwise =
                let loop day2idx nbefore nseen 
                      | nseen + nOnDay2 >= count =
                          (day2idx `addDays'` ciStartDay ci, count - nseen)
                      | day2idx == VS.length scan - 1 =
                          (day2idx `addDays'` ciStartDay ci, nOnDay2)
                      | otherwise =
                          loop (day2idx + 1) (scan VS.! day2idx) (nseen + nOnDay2)
                      where
                        nOnDay2 = scan VS.! day2idx - nbefore
                in loop (day1idx + 1) (neventBeforeDay1 + neventOnDay1) (neventOnDay1 - off1)

      -- traceM ("ci = " ++ show ci)
      -- traceM ("binSearch " ++ show from ++ " =")
      -- traceM ("  " ++ show day1idx)
      -- traceM ("day1 = " ++ show day1)
      -- traceM ("off1 = " ++ show off1)
      -- traceM ("neventOnDay1 = " ++ show neventOnDay1)
      -- traceM ("count = " ++ show count)
      -- traceM ("day2 = " ++ show day2)
      -- traceM ("off2 = " ++ show off2)
      evs <- forM (zip [day1 .. day2] [day1idx..]) $ \(day, dayidx) ->
               let neventsOnDay | dayidx == 0 = scan VS.! 0
                                | otherwise = scan VS.! dayidx - scan VS.! (dayidx - 1)
                   parse
                     | day == day1 =
                         if day1 == day2
                           then parseLogRange (off1, Just (off2 - off1))
                           else parseLogRange (off1, Nothing)
                     | day == day2 = parseLogRange (0, Just off2)
                     | otherwise = \_lineStarts -> parseLog
                   (y, month, d) = toGregorian day
                   ymd = YMD (fromIntegral y) (fromIntegral month) (fromIntegral d)
                   fixDate = map $ \(tod, ev) -> (YMDHMS ymd tod, ev)
               in if neventsOnDay > 0
                    then do (bs, lineStarts) <- loadDay index chan ymd
                            return (fixDate (parse lineStarts bs))
                    else return []

      return (concat evs)
  where
    ci = mp Map.! chan

-- | The vector must be sorted.
-- Returns index of the first element x such that needle < x. If there is no
-- such element (i.e. needle is greater-equal the last element of vec), returns
-- the length of vec.
--
-- TODO: proportional binary search
binSearch :: VS.Vector Int -> Int -> Int
binSearch vec needle
  | veclen == 0 || vec VS.! (veclen - 1) < needle = veclen
  | needle < vec VS.! 0 = 0
  | otherwise = go 0 (veclen - 1)
  where
    veclen = VS.length vec

    -- Invariant: vec[lo] <= needle < vec[hi]
    go :: Int -> Int -> Int
    go lo hi
      | lo + 1 == hi = hi
      | otherwise =
          let mid = lo + (hi - lo) `div` 2
          in if vec VS.! mid <= needle
               then go mid hi else go lo mid

-- other methods

indexNumEvents :: Index -> Channel -> Int
indexNumEvents (Index _ mp _) chan = ciTotal (mp Map.! chan)

indexCalendar :: Index -> Channel -> ((Day, Day), [Int])
indexCalendar (Index _ mp _) chan =
  let ci = mp Map.! chan
  in ((ciStartDay ci, ciEndDay ci)
     ,[if i == 0
         then ciCountUntil ci VS.! 0
         else ciCountUntil ci VS.! i - ciCountUntil ci VS.! (i - 1)
      | i <- [0 .. VS.length (ciCountUntil ci) - 1]])

indexGetEventsDay :: Index -> Channel -> Day -> IO [(HMS, Event)]
indexGetEventsDay index@(Index _ mp _) chan day
  | day < ciStartDay ci || day > ciEndDay ci = return []
  | otherwise = do
      (bs, _lineStarts) <- loadDay index chan (dayToYMD day)
      return (parseLog bs)
  where
    ci = mp Map.! chan

-- utilities

loadDay :: Index -> Channel -> YMD -> IO (ByteString, VS.Vector Word32)
loadDay (Index basedir _ cache) chan@(Channel network channel) ymd = do
  cacheLookup cache (chan, ymd) >>= \case
    Nothing -> do
      bs <- mapFile (basedir </> T.unpack network </> T.unpack channel </> toFileName ymd)
      let lineStarts = preparseLog bs
      cacheAdd cache (chan, ymd) (bs, lineStarts)
      return (bs, lineStarts)
    Just (bs, lineStarts) -> return (bs, lineStarts)

parseFileName :: String -> (Year, MonthOfYear, DayOfMonth)
parseFileName name
  | (y, '-' : s1) <- splitAt 4 name
  , (m, '-' : s2) <- splitAt 2 s1
  , (d, ".log") <- splitAt 2 s2
  , all isDigit y, all isDigit m, all isDigit d
  , let y' = read' y ; m' = read' m ; d' = read' d
  , 1 <= m', m' <= 12
  , 1 <= d', d' <= gregorianMonthLength y' m'
  = (y', m', d')
  | otherwise
  = error $ "Invalid ZNC log file name: " ++ name
  where
    read' :: Read a => String -> a
    read' s = case readMaybe s of
                Just r -> r
                Nothing -> error $ "No read: " ++ show s

toFileName :: YMD -> String
toFileName ymd = ymdToString ymd ++ ".log"

addDays' :: Int -> Day -> Day
addDays' = addDays . fromIntegral