summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTom Smeding <tom@tomsmeding.com>2026-05-10 22:05:54 +0200
committerTom Smeding <tom@tomsmeding.com>2026-05-10 22:05:54 +0200
commite0e6b516b9dd132e067a226ff7fdf56d3e556559 (patch)
tree73e93c4f41e441dbb1d06393ab0faab680b598fe
parent50e74b214133adddf252a7fb7962272508d6d03d (diff)
meta-externalagent, stop crawling all the ?eid= URLs
-rw-r--r--src/Main.hs43
1 files changed, 30 insertions, 13 deletions
diff --git a/src/Main.hs b/src/Main.hs
index 77dfa29..7cd2c65 100644
--- a/src/Main.hs
+++ b/src/Main.hs
@@ -426,6 +426,20 @@ page404 message = return $ responseLBS
[("Content-Type", "text/plain")]
message
+prefilter :: Request -> Maybe Response
+prefilter req
+ -- Facebook should stop crawling all the individual ?eid=... URLs
+ | case lookup "user-agent" (reqHeaders req) of
+ Just ua | BS.take 18 ua == "meta-externalagent" -> True
+ _ -> False
+ , any ((== "eid") . fst) (reqQuery req)
+ = Just $ responseBS (Status 303 "See Other")
+ [("Location", BS.takeWhile (/= fromIntegral (ord '?')) (reqURI req))]
+ BS.empty
+
+ | otherwise
+ = Nothing
+
mainServe :: FilePath -> IO ()
mainServe confpath = do
config <- enrichConfig <$> readConfig confpath
@@ -437,19 +451,22 @@ mainServe confpath = do
let settings = defaultSettings { setPort = confPort config }
atomicPrintS $ "Listening on port " ++ show (confPort config)
- run settings $ \req -> case reqPath req of
- [] ->
- pageIndex config
- ["log", TE.decodeUtf8' -> Right alias] ->
- pageLog config index req alias
- ["cal", TE.decodeUtf8' -> Right alias] ->
- pageCalendar config index alias
- ["cal", TE.decodeUtf8' -> Right alias, TE.decodeUtf8' -> Right date] ->
- pageCalendarDay config index req alias date
- [fname] | fname `elem` staticFiles ->
- return $ responseFile status200 [] ("pages/" ++ BS8.unpack fname)
- _ ->
- page404 "URL not found"
+ run settings $ \req ->
+ case prefilter req of
+ Just res -> return res
+ Nothing -> case reqPath req of
+ [] ->
+ pageIndex config
+ ["log", TE.decodeUtf8' -> Right alias] ->
+ pageLog config index req alias
+ ["cal", TE.decodeUtf8' -> Right alias] ->
+ pageCalendar config index alias
+ ["cal", TE.decodeUtf8' -> Right alias, TE.decodeUtf8' -> Right date] ->
+ pageCalendarDay config index req alias date
+ [fname] | fname `elem` staticFiles ->
+ return $ responseFile status200 [] ("pages/" ++ BS8.unpack fname)
+ _ ->
+ page404 "URL not found"
testParseLog :: FilePath -> IO ()
testParseLog fname = print =<< parseLog <$> BS.readFile fname