From f21dcde54b09913550036e6501cca935278597d9 Mon Sep 17 00:00:00 2001 From: Tom Smeding Date: Sun, 29 Mar 2026 23:25:10 +0200 Subject: Initial --- src/Index.hs | 193 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 193 insertions(+) create mode 100644 src/Index.hs (limited to 'src/Index.hs') diff --git a/src/Index.hs b/src/Index.hs new file mode 100644 index 0000000..4de4c6c --- /dev/null +++ b/src/Index.hs @@ -0,0 +1,193 @@ +module Index ( + Index, + initIndex, + indexGetEventsLinear, + indexNumEvents, +) where + +import Chronos hiding (day) +import Control.Monad (forM) +import Data.ByteString qualified as BS +import Data.Char (isDigit) +import Data.List (sort, scanl') +import Data.Map.Strict (Map) +import Data.Map.Strict qualified as Map +import Data.Text qualified as T +import Data.Vector.Storable qualified as VS +import System.Directory +import System.FilePath +import Text.Read (readMaybe) +import Torsor (add, difference) + +import Debug.Trace + +import Config (Channel(..)) +import Mmap +import Util +import ZNC + + +data ChanIndex = ChanIndex + { ciStartDay :: Day + , ciEndDay :: Day + , ciCountUntil :: VS.Vector Int -- ^ number of events up to and /including/ this day + , ciTotal :: Int } + deriving (Show) + +data Index = Index FilePath (Map Channel ChanIndex) + deriving (Show) + +-- init + +initIndex :: FilePath -> [Channel] -> IO Index +initIndex basedir toimport = do + items <- + fmap concat . forM (map chanNetwork toimport) $ \nwT -> do + let nw = T.unpack nwT + forM [ch | Channel nwT' ch <- toimport, nwT' == nwT] $ \chT -> do + let ch = T.unpack chT + files <- listDirectory (basedir nw ch) + days <- fmap sort . forM files $ \fn -> do + let path = basedir nw ch fn + -- putStrLn $ "Parsing " ++ path ++ " (" ++ show (parseFileName fn) ++ " -> " ++ show (dateToDay (parseFileName fn)) ++ ")" + events <- parseLog <$> BS.readFile path + return (dateToDay (parseFileName fn), length events) + let minday = minimum (map fst days) + maxday = maximum (map fst days) + ndays = difference maxday minday + 1 + -- traceM $ nw ++ "/" ++ ch ++ ": days = " ++ show [(toFileName (dayToDate d), i) | (d, i) <- days] + let countScan = VS.fromListN ndays (drop 1 $ scanl' (+) 0 (makeCounts [minday..maxday] days)) + let ntotal = sum (map snd days) + return (Channel nwT chT + ,ChanIndex + { ciStartDay = minday + , ciEndDay = maxday + , ciCountUntil = countScan + , ciTotal = ntotal}) + return (Index basedir (Map.fromList items)) + +makeCounts :: [Day] -> [(Day, Int)] -> [Int] +makeCounts [] [] = [] +makeCounts [] _ = error "makeCounts: more entries than days in range" +makeCounts (d:ds) ents@((d',n):ents') + | d == d' = n : makeCounts ds ents' + | d < d' = 0 : makeCounts ds ents + | otherwise = error $ "makeCounts: duplicate entry? " ++ show (d, dayToDate d, d', dayToDate d') +makeCounts (_:ds) [] = 0 : makeCounts ds [] + +-- search + +-- | Returns proper lazy list of events. Reading the files happens strictly, +-- but parsing the events happens lazily. +indexGetEventsLinear :: Index -> Channel -> Int -> Int -> IO [(YMDHMS, Event)] +indexGetEventsLinear (Index basedir mp) chan@(Channel network channel) from count + | from + count < 0 = return [] + | from >= ciTotal ci = return [] + | otherwise = do + let scan = ciCountUntil ci + day1idx = binSearch scan from + day1 = day1idx `add` ciStartDay ci + neventBeforeDay1 | day1idx == 0 = 0 + | otherwise = scan VS.! (day1idx - 1) + neventInclDay1 = scan VS.! day1idx + neventOnDay1 = neventInclDay1 - neventBeforeDay1 + off1 = from - neventBeforeDay1 + -- day2 is inclusive, off2 is exclusive + (day2, off2) + | from + count <= neventInclDay1 = (day1, off1 + count) + | day1idx == VS.length scan - 1 = (day1, neventOnDay1) + | otherwise = + let loop day2idx nbefore nseen + | nseen + nOnDay2 >= count = + (day2idx `add` ciStartDay ci, count - nseen) + | day2idx == VS.length scan - 1 = + (day2idx `add` ciStartDay ci, nOnDay2) + | otherwise = + loop (day2idx + 1) (scan VS.! day2idx) (nseen + nOnDay2) + where + nOnDay2 = scan VS.! day2idx - nbefore + in loop (day1idx + 1) (neventBeforeDay1 + neventOnDay1) (neventOnDay1 - off1) + + -- traceM ("ci = " ++ show ci) + -- traceM ("binSearch " ++ show from ++ " =") + -- traceM (" " ++ show day1idx) + -- traceM ("day1 = " ++ show day1) + -- traceM ("off1 = " ++ show off1) + -- traceM ("neventOnDay1 = " ++ show neventOnDay1) + -- traceM ("count = " ++ show count) + -- traceM ("day2 = " ++ show day2) + -- traceM ("off2 = " ++ show off2) + evs <- forM (zip [day1 .. day2] [day1idx..]) $ \(day, dayidx) -> + let neventsOnDay | dayidx == 0 = scan VS.! 0 + | otherwise = scan VS.! dayidx - scan VS.! (dayidx - 1) + parse + | day == day1 = + if day1 == day2 + then parseLogRange (off1, Just (off2 - off1)) + else parseLogRange (off1, Nothing) + | day == day2 = parseLogRange (0, Just off2) + | otherwise = parseLog + Date (Year y) (Month monthm1) (DayOfMonth d) = dayToDate day + fixDate = map $ \(tod, ev) -> + (YMDHMS (YMD y (fromIntegral monthm1 + 1) (fromIntegral d)) tod + ,ev) + in if neventsOnDay > 0 + then fixDate . parse <$> + mapFile (basedir T.unpack network T.unpack channel toFileName (dayToDate day)) + else return [] + + return (concat evs) + where + ci = mp Map.! chan + +-- | The vector must be sorted. +-- Returns index of the first element x such that needle < x. If there is no +-- such element (i.e. needle is greater-equal the last element of vec), returns +-- the length of vec. +-- +-- TODO: proportional binary search +binSearch :: VS.Vector Int -> Int -> Int +binSearch vec needle + | veclen == 0 || vec VS.! (veclen - 1) < needle = veclen + | needle < vec VS.! 0 = 0 + | otherwise = go 0 (veclen - 1) + where + veclen = VS.length vec + + -- Invariant: vec[lo] <= needle < vec[hi] + go :: Int -> Int -> Int + go lo hi + | lo + 1 == hi = hi + | otherwise = + let mid = lo + (hi - lo) `div` 2 + in if vec VS.! mid <= needle + then go mid hi else go lo mid + +-- other methods + +indexNumEvents :: Index -> Channel -> Int +indexNumEvents (Index _ mp) chan = ciTotal (mp Map.! chan) + +-- utilities + +parseFileName :: String -> Date +parseFileName name + | (y, '-' : s1) <- splitAt 4 name + , (m, '-' : s2) <- splitAt 2 s1 + , (d, ".log") <- splitAt 2 s2 + , all isDigit y, all isDigit m, all isDigit d + , let y' = read' y ; m' = read' m - 1 ; d' = read' d + , 0 <= m', m' < 12 + , 1 <= d', d' <= daysInMonth (isLeapYear (Year y')) (Month m') + = Date (Year y') (Month m') (DayOfMonth d') + | otherwise + = error $ "Invalid ZNC log file name: " ++ name + where + read' :: Read a => String -> a + read' s = case readMaybe s of + Just r -> r + Nothing -> error $ "No read: " ++ show s + +toFileName :: Date -> String +toFileName (Date (Year y) (Month m) (DayOfMonth d)) = + pad '0' 4 y ++ '-' : pad '0' 2 (m + 1) ++ '-' : pad '0' 2 d ++ ".log" -- cgit v1.3