#!/usr/bin/env bash
#
# modules/vacancies/getter-helpers/umega-like.sh
#
# Provenance: added by commit 90efd48e11f9ae6f6db64e2b101015d5b69f2fb5
# ("vacancies: Links in umega and elc getters", Tom Smeding,
# Fri, 27 Feb 2026 22:54:50 +0100).
#
# Usage: umega-like.sh URL
#
# Renders URL to text with `elinks` and prints one line per advertisement on
# stdout. An advertisement starts at the first line beginning with "[" after
# an "Overview image" line; its following lines are joined with "; ". When the
# number inside the leading "[N]" marker appears in the page's "Visible links"
# list, the link target is appended as " (<target>)".
#
# If the number of printed ads differs from the "<N> Properties for rent"
# count found on the page, "SCRIPT BROKEN, OFFERS MAY BE MISSED" is printed
# on stdout as well.
set -euo pipefail

url=${1:?usage: umega-like.sh URL}

# NOTE(review): elinks is invoked with the URL only, as before; it is assumed
# to write a text rendering of the page (including a "Visible links" list) to
# stdout -- confirm how it is configured/wrapped in this repository.
# A failing fetch is intentionally not fatal: with no ads parsed, the count
# check at the bottom prints the "SCRIPT BROKEN" notice instead.
page=$(elinks "$url") || true

# -t drops the trailing newline of every line, so no per-line `tr` is needed.
mapfile -t inputlines <<<"$page"


## Collect links

# Matches the heading that opens the link list.
re_links_header='^[[:space:]]*Visible links$'
# Matches "<N>. " at the start of a link list entry; group 1 is N.
re_link_entry='^[[:space:]]*([0-9]+)\. '

# links[N] = target text of link number N from the "Visible links" list.
declare -A links=()
in_links_section=0
for line in "${inputlines[@]}"; do
  if [[ $line =~ $re_links_header ]]; then
    in_links_section=1
  elif ((in_links_section)) && [[ $line =~ $re_link_entry ]]; then
    # Everything after the matched "<N>. " prefix is the link target.
    links[${BASH_REMATCH[1]}]=${line:${#BASH_REMATCH[0]}}
  fi
done


## Collect ads

# The run of digits directly in front of "Properties for rent" is the total.
re_total='([0-9]+) *[Pp]roperties for rent'
# First non-blank character is "["; group 1 is the (possibly empty) number.
re_body_start='^[[:space:]]*\[([0-9]*)'
# c2 a0 is UTF-8 for a non-breaking space
readonly nbsp=$'\xc2\xa0'

totalnum=   # advertised number of properties, as found on the page
counted=0   # number of ads actually printed

accum=      # text of the ad currently being collected
adlink=     # link for the ad currently being collected

# 0: outside
# 1: in overview images section
# 2: in body
state=0

# Print the ad collected so far (if any) and reset the collector.
# Globals: state, accum, adlink (read); counted, accum, adlink (written)
# Outputs: one line on stdout when an ad body with non-empty text is pending
writeout() {
  if ((state == 2)) && [[ -n $accum ]]; then
    if [[ -n $adlink ]]; then
      printf '%s (%s)\n' "$accum" "$adlink"
    else
      printf '%s\n' "$accum"
    fi
    counted=$((counted + 1))
  fi
  accum=
  adlink=
}

for line in "${inputlines[@]}"; do
  if [[ $line =~ $re_total ]]; then
    totalnum=${BASH_REMATCH[1]}
    continue
  fi

  if [[ $line == *'Overview image'* ]]; then
    # A new ad begins: flush the previous one.
    writeout
    state=1
  elif [[ -z ${line//[[:space:]]/} ]]; then
    # Blank line: flush, and stop at the first blank line once at least one
    # ad has been printed.
    writeout
    state=0
    if ((counted > 0)); then break; fi
  elif ((state == 1)) && [[ $line =~ $re_body_start ]]; then
    state=2
    linknumber=${BASH_REMATCH[1]}
    rest=${line:${#BASH_REMATCH[0]}}

    # Always start from an empty link so that an ad without a known link does
    # not inherit the previous ad's link. The lookup is skipped for an empty
    # number because an empty string is not a valid associative-array key.
    adlink=
    if [[ -n $linknumber ]]; then
      adlink=${links[$linknumber]:-}
    fi

    # Drop a well-formed "[N]" marker; keep the line untouched otherwise
    # (e.g. "[IMG] ...").
    if [[ $rest == ']'* ]]; then
      accum=${rest#']'}
    else
      accum=$line
    fi
  elif ((state == 2)); then
    # Strip leading whitespace, then one leading non-breaking space, then
    # trailing whitespace, and append to the current ad.
    text=${line#"${line%%[![:space:]]*}"}
    text=${text#"$nbsp"}
    text=${text%"${text##*[![:space:]]}"}
    accum="$accum; $text"
  fi
done

writeout

if [[ $counted != "$totalnum" ]]; then
  printf '%s\n' "SCRIPT BROKEN, OFFERS MAY BE MISSED"
fi