summary | refs | log | tree | commit | diff
path: root/modules/vacancies/getter-helpers
diff options
context:
space:
mode:
author Tom Smeding <tom@tomsmeding.com> 2026-02-27 22:54:50 +0100
committer Tom Smeding <tom@tomsmeding.com> 2026-02-27 22:54:50 +0100
commit 90efd48e11f9ae6f6db64e2b101015d5b69f2fb5 (patch)
tree e79732a88daeaa77370e3431a9c5c25ed25cacf7 /modules/vacancies/getter-helpers
parent efc5a694762ea88724f24e82eef24840d79819be (diff)
vacancies: Links in umega and elc getters
Diffstat (limited to 'modules/vacancies/getter-helpers')
-rwxr-xr-x modules/vacancies/getter-helpers/umega-like.sh 83
1 files changed, 83 insertions, 0 deletions
diff --git a/modules/vacancies/getter-helpers/umega-like.sh b/modules/vacancies/getter-helpers/umega-like.sh
new file mode 100755
index 0000000..061476b
--- /dev/null
+++ b/modules/vacancies/getter-helpers/umega-like.sh
@@ -0,0 +1,83 @@
#!/usr/bin/env bash
#
# Vacancy getter helper: renders a listing page with elinks and prints one
# line per advertisement found in the rendered text.
# (Judging by the file name, the target is pages laid out like Umega's;
# confirm against the getters that call this helper.)
#
# Usage: umega-like.sh URL
set -euo pipefail

URL=${1:?usage: umega-like.sh URL}

# Fetch in a plain assignment instead of nesting the command substitution in
# the here-string: there the exit status of elinks is discarded without a
# trace. Parsing still goes ahead with whatever text was produced, so the
# consistency check at the end of the script keeps reporting the problem on
# stdout exactly as before; the only addition is the diagnostic on stderr.
page=$(elinks "$URL") || echo >&2 "umega-like.sh: elinks failed for $URL"

# No -t here: every element keeps its trailing newline, and the code that
# consumes inputlines strips it itself.
mapfile inputlines <<<"$page"
+
+
## Collect links

# Link number -> link text, taken from the numbered entries that follow the
# "Visible links" heading in the dump.
declare -A links

# Fill `links` from the global `inputlines` array.
# Globals:   inputlines (read), links (written)
# Arguments: none
# Matching is done with bash's own regex engine; the previous version started
# tr, grep and sed once per dump line, which is needless work for a check on a
# single string.
collect_links() {
  local heading_re='^[[:space:]]*Visible links$'
  local entry_re='^[[:space:]]*([0-9]+)\. '
  local in_links_section=0
  local line i

  for ((i = 0; i < ${#inputlines[@]}; i++)); do
    # Elements may still carry the newline mapfile left on them.
    line=${inputlines[i]//$'\n'/}

    if [[ $line =~ $heading_re ]]; then
      in_links_section=1
    elif ((in_links_section)) && [[ $line =~ $entry_re ]]; then
      # BASH_REMATCH[0] is the whole "  N. " prefix; what follows it is the
      # link text, BASH_REMATCH[1] is the bare number.
      links[${BASH_REMATCH[1]}]=${line#"${BASH_REMATCH[0]}"}
    fi
  done
}

collect_links
+
+
## Collect ads

totalnum= # ad count announced by the "N properties for rent" line
counted=0 # number of ads actually written out

accum= # text of the ad currently being assembled

# 0: outside
# 1: in overview images section
# 2: in body
state=0

adlink= # link for the ad currently being collected

# Print the ad assembled so far, then reset the per-ad buffers.
# Globals:   state, accum, adlink (read); counted, accum, adlink (written)
# Outputs:   "<text> (<link>)" or just "<text>" on stdout, but only when a
#            body is in progress (state 2) and some text was gathered.
writeout() {
  if [[ $state -eq 2 && -n $accum ]]; then
    if [[ -n $adlink ]]; then
      printf '%s (%s)\n' "$accum" "$adlink"
    else
      printf '%s\n' "$accum"
    fi
    counted=$((counted + 1))
  fi
  accum=
  # Forget the link together with the text. Otherwise an ad whose link number
  # cannot be resolved would be printed with the link of the ad before it.
  adlink=
}
+
# Walk the dump and hand every completed ad to writeout.
# Globals:   inputlines, links (read); totalnum, state, accum, adlink (written)
# Arguments: none
# An ad starts at the first "[..." line after an "Overview image" line; the
# lines that follow are appended to it, separated by "; ". Scanning stops at
# the first blank line reached once at least one ad has been written.
collect_ads() {
  local body_start_re='^[[:space:]]*\[([0-9]*)'
  local link_marker_re='^[[:space:]]*\[[0-9]*\]'
  local line m linknumber piece i

  for ((i = 0; i < ${#inputlines[@]}; i++)); do
    # Elements may still carry the newline mapfile left on them.
    line=${inputlines[i]//$'\n'/}

    # The line announcing "<N> properties for rent" gives the expected count.
    # sed can only match when that phrase is present, so it is started for
    # those lines alone instead of for every line of the dump.
    if [[ $line == *[Pp]'roperties for rent'* ]]; then
      m=$(sed -n 's/[^0-9]*\([0-9]*\) *[Pp]roperties for rent.*/\1/p' <<<"$line")
      if [[ -n $m ]]; then
        totalnum=$m
        continue
      fi
    fi

    if [[ $line == *'Overview image'* ]]; then
      writeout
      state=1
    elif [[ -z ${line//[[:space:]]/} ]]; then
      writeout
      state=0
      if ((counted > 0)); then break; fi
    elif ((state == 1)) && [[ $line =~ $body_start_re ]]; then
      state=2
      linknumber=${BASH_REMATCH[1]}
      # Start every ad without a link. Previously adlink was only ever
      # overwritten on a successful lookup, so an ad whose number was missing
      # from `links` silently inherited the link of the preceding ad.
      adlink=
      if [[ -n $linknumber && -n ${links[$linknumber]:+1} ]]; then
        adlink=${links[$linknumber]}
      fi
      # Drop a leading "[N]" marker; anything else (e.g. "[IMG]") is kept.
      if [[ $line =~ $link_marker_re ]]; then
        accum=${line#"${BASH_REMATCH[0]}"}
      else
        accum=$line
      fi
    elif ((state == 2)); then
      # Trim leading whitespace, then one leading non-breaking space
      # (c2 a0 is UTF-8 for a non-breaking space), then trailing whitespace.
      piece=${line#"${line%%[![:space:]]*}"}
      piece=${piece#$'\xc2\xa0'}
      piece=${piece%"${piece##*[![:space:]]}"}
      accum="$accum; $piece"
    fi
  done
}

collect_ads
+
# Flush the ad that was still being assembled when the input ran out.
writeout

# Compare the number of ads written with the count parsed from the page; a
# mismatch (including no count having been found at all) means the parser and
# the page layout no longer agree.
[[ $counted == "$totalnum" ]] || echo "SCRIPT BROKEN, OFFERS MAY BE MISSED"