summaryrefslogtreecommitdiff
path: root/modules/vacancies/getter-helpers/umega-like.sh
blob: 061476b191d4b0336d2740c4c58effe398b3576f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
#!/usr/bin/env bash
set -euo pipefail

# URL of the listing page to scrape (first positional argument).
URL=$1

# Render the page to text with elinks. The dump is captured in its own
# assignment so that a failing elinks is noticed: a command substitution
# nested inside the here-string would discard its exit status and the script
# would carry on with an empty page.
if ! pagetext=$(elinks "$URL"); then
  printf 'elinks failed to fetch %s\n' "$URL" >&2
  exit 1
fi

# One array element per line of the dump (line terminators are kept).
mapfile inputlines <<<"$pagetext"


## Collect links

# Map from reference number to link target, filled from the "Visible links"
# list found in the page dump.
declare -A links

#######################################
# Fill the links map from the numbered reference list in the page dump.
# Entries are only accepted once the "Visible links" heading has been seen.
# Uses shell pattern matching only, so no process is spawned per line and no
# GNU-specific regex escapes are needed.
# Globals:   inputlines (read), links (written)
# Arguments: none
#######################################
collect_links() {
  local header_re='^[[:space:]]*Visible links$'
  local entry_re='^[[:space:]]*([0-9]+)\. '
  local in_section=0 raw line

  for raw in "${inputlines[@]}"; do
    line=${raw%$'\n'}  # drop the line terminator, if the element has one
    if [[ $line =~ $header_re ]]; then
      in_section=1
    elif ((in_section)) && [[ $line =~ $entry_re ]]; then
      # Only blanks and digits precede the first ". ", so stripping the
      # shortest prefix that ends in ". " leaves exactly the link target.
      links[${BASH_REMATCH[1]}]=${line#*'. '}
    fi
  done
}

collect_links


## Collect ads

# Scanner position while collecting ads:
#   0 = outside of any ad
#   1 = inside the overview images section
#   2 = inside an ad body
state=0

# Text of the ad being assembled; body lines get appended with "; ".
accum=''

# Link belonging to the ad being assembled (empty when there is none).
adlink=''

# How many ads have been written out so far.
counted=0

# Total the page itself announces ("<N> Properties for rent"); empty until
# that line has been seen.
totalnum=''

#######################################
# Emit the ad collected so far, if there is one, and start afresh.
# An ad is printed only while the scanner is in an ad body (state 2) and some
# text has been accumulated; its link, when known, is appended in
# parentheses. The accumulator is cleared in every case.
# Globals:   state, adlink (read); accum, counted (written)
# Arguments: none
# Outputs:   at most one line on stdout
#######################################
writeout() {
  if ((state == 2)) && [[ -n $accum ]]; then
    local entry=$accum
    [[ -z $adlink ]] || entry+=" ($adlink)"
    echo "$entry"
    counted=$((counted + 1))
  fi
  accum=''
}

#######################################
# Walk the page dump and hand every ad to writeout.
#
# A line containing "Overview image" closes the previous ad and opens the
# overview section (state 1). The first line there that starts with "["
# begins the ad body (state 2): its leading "[N]" marker is removed and N is
# looked up in links. Further lines are appended to the ad, joined by "; ".
# A blank line closes the current ad; once at least one ad has been written,
# the blank line also ends the scan.
#
# Globals:   inputlines, links, counted (read);
#            totalnum, state, accum, adlink (written)
# Arguments: none
# Outputs:   ads on stdout, via writeout
#######################################
collect_ads() {
  local bodystart_re='^[[:space:]]*\[([0-9]*)'
  local marker_re='^[[:space:]]*\[[0-9]*\]'
  local raw line m linknumber

  for raw in "${inputlines[@]}"; do
    line=${raw%$'\n'}  # drop the line terminator, if the element has one

    # echo >&2 "?$state <$line>"
    # The "<N> Properties for rent" line carries the total the page claims.
    m=$(sed -n 's/[^0-9]*\([0-9]*\) *[Pp]roperties for rent.*/\1/p' <<<"$line")
    if [[ -n $m ]]; then
      totalnum=$m
      continue
    fi

    if [[ $line == *'Overview image'* ]]; then
      # echo >&2 "overview <$line>"
      writeout
      state=1
    elif [[ -z ${line//[[:space:]]/} ]]; then
      # echo >&2 "empty <$line>"
      writeout
      state=0
      if ((counted > 0)); then break; fi
    elif ((state == 1)) && [[ $line =~ $bodystart_re ]]; then
      # echo >&2 "bodystart <$line>"
      state=2
      linknumber=${BASH_REMATCH[1]}
      # Start without a link so that an ad whose reference is unknown does
      # not inherit the link of the previous ad.
      adlink=
      # A marker without digits (e.g. "[IMG]") yields an empty number; an
      # empty key is not a valid associative-array subscript, so skip the
      # lookup in that case.
      if [[ -n $linknumber && -n ${links[$linknumber]:+1} ]]; then
        adlink=${links[$linknumber]}
      fi
      # Strip a leading "[N]" marker; anything else is kept verbatim.
      if [[ $line =~ $marker_re ]]; then
        accum=${line:${#BASH_REMATCH[0]}}
      else
        accum=$line
      fi
    elif ((state == 2)); then
      # echo >&2 "bodycont <$line>"
      # c2 a0 is UTF-8 for a non-breaking space
      accum="$accum; $(sed 's/^\s*//; s/^\xc2\xa0//; s/\s*$//' <<<"$line")"
    fi
  done
}

collect_ads

# Flush an ad that was still open when the input ran out.
writeout

# The number of ads written must equal the total announced by the page;
# otherwise the page layout probably changed and the parsing above is off.
if [[ $counted != "$totalnum" ]]; then
  echo "SCRIPT BROKEN, OFFERS MAY BE MISSED"
fi