summaryrefslogtreecommitdiff
path: root/src/Index.hs
diff options
context:
space:
mode:
authorTom Smeding <tom@tomsmeding.com>2026-03-29 23:25:10 +0200
committerTom Smeding <tom@tomsmeding.com>2026-03-29 23:25:10 +0200
commitf21dcde54b09913550036e6501cca935278597d9 (patch)
tree505f373b1bce690f0bafc2038636721126d9bcad /src/Index.hs
Initial
Diffstat (limited to 'src/Index.hs')
-rw-r--r--src/Index.hs193
1 files changed, 193 insertions, 0 deletions
diff --git a/src/Index.hs b/src/Index.hs
new file mode 100644
index 0000000..4de4c6c
--- /dev/null
+++ b/src/Index.hs
@@ -0,0 +1,193 @@
+module Index (
+ Index,
+ initIndex,
+ indexGetEventsLinear,
+ indexNumEvents,
+) where
+
+import Chronos hiding (day)
+import Control.Monad (forM)
+import Data.ByteString qualified as BS
+import Data.Char (isDigit)
+import Data.List (sort, scanl')
+import Data.Map.Strict (Map)
+import Data.Map.Strict qualified as Map
+import Data.Text qualified as T
+import Data.Vector.Storable qualified as VS
+import System.Directory
+import System.FilePath
+import Text.Read (readMaybe)
+import Torsor (add, difference)
+
+import Debug.Trace
+
+import Config (Channel(..))
+import Mmap
+import Util
+import ZNC
+
+
+data ChanIndex = ChanIndex
+ { ciStartDay :: Day
+ , ciEndDay :: Day
+ , ciCountUntil :: VS.Vector Int -- ^ number of events up to and /including/ this day
+ , ciTotal :: Int }
+ deriving (Show)
+
+data Index = Index FilePath (Map Channel ChanIndex)
+ deriving (Show)
+
+-- init
+
+initIndex :: FilePath -> [Channel] -> IO Index
+initIndex basedir toimport = do
+ items <-
+ fmap concat . forM (map chanNetwork toimport) $ \nwT -> do
+ let nw = T.unpack nwT
+ forM [ch | Channel nwT' ch <- toimport, nwT' == nwT] $ \chT -> do
+ let ch = T.unpack chT
+ files <- listDirectory (basedir </> nw </> ch)
+ days <- fmap sort . forM files $ \fn -> do
+ let path = basedir </> nw </> ch </> fn
+ -- putStrLn $ "Parsing " ++ path ++ " (" ++ show (parseFileName fn) ++ " -> " ++ show (dateToDay (parseFileName fn)) ++ ")"
+ events <- parseLog <$> BS.readFile path
+ return (dateToDay (parseFileName fn), length events)
+ let minday = minimum (map fst days)
+ maxday = maximum (map fst days)
+ ndays = difference maxday minday + 1
+ -- traceM $ nw ++ "/" ++ ch ++ ": days = " ++ show [(toFileName (dayToDate d), i) | (d, i) <- days]
+ let countScan = VS.fromListN ndays (drop 1 $ scanl' (+) 0 (makeCounts [minday..maxday] days))
+ let ntotal = sum (map snd days)
+ return (Channel nwT chT
+ ,ChanIndex
+ { ciStartDay = minday
+ , ciEndDay = maxday
+ , ciCountUntil = countScan
+ , ciTotal = ntotal})
+ return (Index basedir (Map.fromList items))
+
+makeCounts :: [Day] -> [(Day, Int)] -> [Int]
+makeCounts [] [] = []
+makeCounts [] _ = error "makeCounts: more entries than days in range"
+makeCounts (d:ds) ents@((d',n):ents')
+ | d == d' = n : makeCounts ds ents'
+ | d < d' = 0 : makeCounts ds ents
+ | otherwise = error $ "makeCounts: duplicate entry? " ++ show (d, dayToDate d, d', dayToDate d')
+makeCounts (_:ds) [] = 0 : makeCounts ds []
+
+-- search
+
+-- | Returns proper lazy list of events. Reading the files happens strictly,
+-- but parsing the events happens lazily.
+indexGetEventsLinear :: Index -> Channel -> Int -> Int -> IO [(YMDHMS, Event)]
+indexGetEventsLinear (Index basedir mp) chan@(Channel network channel) from count
+ | from + count < 0 = return []
+ | from >= ciTotal ci = return []
+ | otherwise = do
+ let scan = ciCountUntil ci
+ day1idx = binSearch scan from
+ day1 = day1idx `add` ciStartDay ci
+ neventBeforeDay1 | day1idx == 0 = 0
+ | otherwise = scan VS.! (day1idx - 1)
+ neventInclDay1 = scan VS.! day1idx
+ neventOnDay1 = neventInclDay1 - neventBeforeDay1
+ off1 = from - neventBeforeDay1
+ -- day2 is inclusive, off2 is exclusive
+ (day2, off2)
+ | from + count <= neventInclDay1 = (day1, off1 + count)
+ | day1idx == VS.length scan - 1 = (day1, neventOnDay1)
+ | otherwise =
+ let loop day2idx nbefore nseen
+ | nseen + nOnDay2 >= count =
+ (day2idx `add` ciStartDay ci, count - nseen)
+ | day2idx == VS.length scan - 1 =
+ (day2idx `add` ciStartDay ci, nOnDay2)
+ | otherwise =
+ loop (day2idx + 1) (scan VS.! day2idx) (nseen + nOnDay2)
+ where
+ nOnDay2 = scan VS.! day2idx - nbefore
+ in loop (day1idx + 1) (neventBeforeDay1 + neventOnDay1) (neventOnDay1 - off1)
+
+ -- traceM ("ci = " ++ show ci)
+ -- traceM ("binSearch " ++ show from ++ " =")
+ -- traceM (" " ++ show day1idx)
+ -- traceM ("day1 = " ++ show day1)
+ -- traceM ("off1 = " ++ show off1)
+ -- traceM ("neventOnDay1 = " ++ show neventOnDay1)
+ -- traceM ("count = " ++ show count)
+ -- traceM ("day2 = " ++ show day2)
+ -- traceM ("off2 = " ++ show off2)
+ evs <- forM (zip [day1 .. day2] [day1idx..]) $ \(day, dayidx) ->
+ let neventsOnDay | dayidx == 0 = scan VS.! 0
+ | otherwise = scan VS.! dayidx - scan VS.! (dayidx - 1)
+ parse
+ | day == day1 =
+ if day1 == day2
+ then parseLogRange (off1, Just (off2 - off1))
+ else parseLogRange (off1, Nothing)
+ | day == day2 = parseLogRange (0, Just off2)
+ | otherwise = parseLog
+ Date (Year y) (Month monthm1) (DayOfMonth d) = dayToDate day
+ fixDate = map $ \(tod, ev) ->
+ (YMDHMS (YMD y (fromIntegral monthm1 + 1) (fromIntegral d)) tod
+ ,ev)
+ in if neventsOnDay > 0
+ then fixDate . parse <$>
+ mapFile (basedir </> T.unpack network </> T.unpack channel </> toFileName (dayToDate day))
+ else return []
+
+ return (concat evs)
+ where
+ ci = mp Map.! chan
+
+-- | The vector must be sorted.
+-- Returns index of the first element x such that needle < x. If there is no
+-- such element (i.e. needle is greater-equal the last element of vec), returns
+-- the length of vec.
+--
+-- TODO: proportional binary search
+binSearch :: VS.Vector Int -> Int -> Int
+binSearch vec needle
+ | veclen == 0 || vec VS.! (veclen - 1) < needle = veclen
+ | needle < vec VS.! 0 = 0
+ | otherwise = go 0 (veclen - 1)
+ where
+ veclen = VS.length vec
+
+ -- Invariant: vec[lo] <= needle < vec[hi]
+ go :: Int -> Int -> Int
+ go lo hi
+ | lo + 1 == hi = hi
+ | otherwise =
+ let mid = lo + (hi - lo) `div` 2
+ in if vec VS.! mid <= needle
+ then go mid hi else go lo mid
+
+-- other methods
+
+indexNumEvents :: Index -> Channel -> Int
+indexNumEvents (Index _ mp) chan = ciTotal (mp Map.! chan)
+
+-- utilities
+
+parseFileName :: String -> Date
+parseFileName name
+ | (y, '-' : s1) <- splitAt 4 name
+ , (m, '-' : s2) <- splitAt 2 s1
+ , (d, ".log") <- splitAt 2 s2
+ , all isDigit y, all isDigit m, all isDigit d
+ , let y' = read' y ; m' = read' m - 1 ; d' = read' d
+ , 0 <= m', m' < 12
+ , 1 <= d', d' <= daysInMonth (isLeapYear (Year y')) (Month m')
+ = Date (Year y') (Month m') (DayOfMonth d')
+ | otherwise
+ = error $ "Invalid ZNC log file name: " ++ name
+ where
+ read' :: Read a => String -> a
+ read' s = case readMaybe s of
+ Just r -> r
+ Nothing -> error $ "No read: " ++ show s
+
+toFileName :: Date -> String
+toFileName (Date (Year y) (Month m) (DayOfMonth d)) =
+ pad '0' 4 y ++ '-' : pad '0' 2 (m + 1) ++ '-' : pad '0' 2 d ++ ".log"