summaryrefslogtreecommitdiff
path: root/log-tz-convert-clog-PST.py
blob: c58561f1d5cf5b5749dc909f4e8ddf38a87beb40 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
#!/usr/bin/env python3
import re, sys, os, calendar, time

# Command-line handling: exactly two positional arguments are required.
# Anything else (or an explicit -h/--help) prints the usage text and exits.
if len(sys.argv) != 3 or sys.argv[1] in ("-h", "--help"):
    print(f"Usage: {sys.argv[0]} <logdir> <outdir>")
    print("The <logdir> is expected to contain yy.mm.dd files in clog format (tunes.org/~nef/logs).")
    print("These will be converted from US/Pacific (i.e. PST/PDT) to UTC and to ZNC log format; the")
    print("output is written to <outdir>. The <logdir> is not changed.")
    # Only an explicit help request counts as success; a wrong number of
    # arguments (including a single non-help argument) is a usage error.
    wants_help = len(sys.argv) > 1 and sys.argv[1] in ("-h", "--help")
    sys.exit(0 if wants_help else 1)

logdir = sys.argv[1]
outdir = sys.argv[2]
# os.mkdir raises FileExistsError if <outdir> already exists, so an earlier
# conversion result is never silently overwritten.
os.mkdir(outdir)

class Logger:
    """Buffer the events of one day and write them out as a ZNC-style log file.

    Events are collected per date; as soon as an event for a different date
    arrives, the buffered day is written to "<destdir>/YYYY-MM-DD.log" and a
    new buffer is started. Call writeout() once at the end to flush the last
    day.
    """

    def __init__(self, destdir):
        """Create a logger that writes its day files into directory `destdir`."""
        self.destdir = destdir
        self.curdate = None  # (Y, m, d) of the buffered day, None if nothing is buffered
        self.events = []  # [((H, M, S), text)] with text as bytes

    def addevent(self, ymd, hms, text):
        """Append an event; flushes the buffered day first if `ymd` differs from it.

        ymd:  (Y, m, d) tuple of ints
        hms:  (H, M, S) tuple of ints
        text: event text as bytes, without timestamp or trailing newline
        """
        if self.curdate is not None and self.curdate != ymd:
            self.writeout()

        if self.curdate is None:
            self.curdate = ymd
            self.events = []
        self.events.append((hms, text))

    def writeout(self):
        """Write the buffered day to its log file and reset the buffer.

        Does nothing when no event is buffered, so flushing a logger that
        never received an event (e.g. an empty input directory) is safe.
        An existing file for the same date is overwritten.
        """
        if self.curdate is None:
            return

        (Y, m, d) = self.curdate
        with open(os.path.join(self.destdir, f"{Y:04}-{m:02}-{d:02}.log"), "wb") as f:
            for (H, M, S), text in self.events:
                f.write(f"[{H:02}:{M:02}:{S:02}] ".encode("ascii") + text + b"\n")
        self.curdate = None
        self.events = []

# takes line from the '---' bit, i.e. after the HMS time
# returns None if this is a meta-marker
def convert_clog_to_znc(fname, line):
    """Translate one clog event into the text of the matching ZNC log line.

    fname: name of the file being converted; only used in the diagnostic
           printed for an unrecognised line.
    line:  the event as bytes, with the leading "HH:MM:SS " already removed
           and without a trailing newline.

    Returns the ZNC-style text as bytes (without timestamp), or None for
    lines that carry no event: log started/ended/stopped markers, the topic
    and "topic: set by" lines, and names lists. A line that matches no known
    pattern is reported on stdout and returned as b"#parseerror <repr>".
    """
    # Meta-markers: these have no counterpart in the output and are dropped.
    m = re.match(b"^--- log: (started|ended|stopped) haskell/[0-9.]{8}$", line)
    if m is not None: return None
    m = re.match(b"^--- topic: '.*", line)
    if m is not None: return None
    m = re.match(b"^--- topic: set by .*", line)
    if m is not None: return None
    m = re.match(b"^--- names: list \\(.*", line)
    if m is not None: return None

    m = re.match(b"^--- join: ([^ ]*) \\(([^)]*)\\) joined #haskell$", line)
    if m is not None:
        return b"*** Joins: " + m[1] + b" (" + m[2] + b")"

    m = re.match(b"^--- join: ([^ ]*) joined #haskell$", line)  # join without host
    if m is not None:
        return b"*** Joins: " + m[1] + b" ()"

    # clog quit/part lines carry no host, hence the empty "()" in the output
    m = re.match(b"^--- quit: ([^ ]*) \\((.*)\\)$", line)
    if m is not None:
        return b"*** Quits: " + m[1] + b" () (" + m[2] + b")"

    m = re.match(b"^--- part: ([^ ]*) left #haskell$", line)
    if m is not None:
        return b"*** Parts: " + m[1] + b" () ()"

    m = re.match(b"^--- topic: set to '(.*)' by ([^ ]*)$", line)
    if m is not None:
        return b"*** " + m[2] + b" changes topic to '" + m[1] + b"'"

    m = re.match(b"^--- nick: ([^ ]*) -> ([^ ]*)$", line)
    if m is not None:
        return b"*** " + m[1] + b" is now known as " + m[2]

    m = re.match(b"^--- mode: ([^ ]*) set mode: (.*)$", line)
    if m is not None:
        return b"*** " + m[1] + b" sets mode: " + m[2]

    m = re.match(b"^--- mode: ([^ ]*) set ([+-].*)$", line)
    if m is not None:
        return b"*** " + m[1] + b" sets mode: " + m[2]

    m = re.match(b"^--- kick: ([^ ]*) was kicked by ([^ ]*) \\((.*)\\)$", line)
    if m is not None:
        # groups: 1 = kicked nick, 2 = kicker, 3 = reason (m[0] is the whole match)
        return b"*** " + m[1] + b" was kicked by " + m[2] + b" (" + m[3] + b")"

    m = re.match(b"^<([^>]*)> (.*)", line)
    if m is not None:
        return line  # ordinary message: already in the target format

    m = re.match(b"^-([^(]*)\\(([^)]*)\\)- (.*)", line)  # notice
    if m is not None:
        # groups: 1 = sender, 2 = parenthesised part (dropped), 3 = text
        return b"-" + m[1] + b"- " + m[3]

    m = re.match(b"^\\* ([^ ]*) (.*)", line)
    if m is not None:
        return b"* " + m[1] + b" " + m[2]

    m = re.match(b"^\\* ([^ ]*)$", line)  # empty action
    if m is not None:
        return b"* " + m[1] + b" "

    print(f"Cannot parse: {line!r} ({fname})")
    return b"#parseerror " + repr(line).encode("utf-8")

# returned text is None if this is no event but a meta-marker (log started or ended)
def parse_line(fname, line):
    """Split a raw clog line (bytes) into its time of day and converted text.

    Returns ((H, M, S), text) with integer time fields. The text comes from
    convert_clog_to_znc and is None for meta-markers. A line whose first
    space is not at offset 8 is reported on stdout and returned as a
    "#parseerror" event at time (0, 0, 0). Raises Exception when the first
    eight characters are not of the form HH:MM:SS.
    """
    # The timestamp "HH:MM:SS" is exactly 8 bytes, so the first space must follow it.
    if line.find(b" ") != 8:
        print(f"Unparseable time: {line!r} ({fname})")
        return ((0, 0, 0), b"#parseerror " + repr(line).encode("utf-8"))

    stamp = line[:8].decode("ascii")
    found = re.match(r"([0-9]{2}):([0-9]{2}):([0-9]{2})$", stamp)
    if found is None:
        raise Exception(f"Could not parse time marker on line: {line!r}")

    converted = convert_clog_to_znc(fname, line[9:])
    hours, minutes, seconds = found.groups()
    return ((int(hours), int(minutes), int(seconds)), converted)

def last_sunday_in(Y, m):
    """Return the day of the month (1-based) of the last Sunday in month m of year Y."""
    final_day = calendar.monthrange(Y, m)[1]  # length of the month == its last date
    # How many days the month's final day lies past a Sunday (0 if it is one).
    overshoot = (calendar.weekday(Y, m, final_day) - calendar.SUNDAY) % 7
    return final_day - overshoot

def nth_sunday_in(n, Y, m):
    """Return the date of the n-th Sunday of month m in year Y.

    n == -1 selects the last Sunday (delegating to last_sunday_in); otherwise
    n must be >= 1 (enforced by an assert). The result is not checked against
    the length of the month.
    """
    if n == -1:
        return last_sunday_in(Y, m)

    assert n >= 1
    # Days between the 1st of the month and its first Sunday (0 if the 1st is one).
    gap = (calendar.SUNDAY - calendar.weekday(Y, m, 1)) % 7
    return 1 + gap + 7 * (n - 1)

# Self-checks of the Sunday helpers against known 2026 dates; they run every
# time the script starts, before any file is converted.
assert last_sunday_in(2026, 3) == 29
assert last_sunday_in(2026, 10) == 25

# March 1st 2026 is itself a Sunday, so the second Sunday is the 8th.
assert nth_sunday_in(2, 2026, 3) == 8
assert nth_sunday_in(1, 2026, 10) == 4

def convert_logfile(filename, logger):
    """Convert one clog day file from US/Pacific wall-clock time to UTC events.

    filename: path whose last component is the date of the log as yy.mm.dd
              (read as 20yy-mm-dd).
    logger:   object with an addevent(ymd, hms, text) method; it receives every
              converted event together with its UTC date and time.

    Lines for which parse_line returns None as text (meta-markers) are skipped.
    An event whose UTC time would lie before the previously emitted event of
    this file is reported on stdout and stored at the previous event's time,
    prefixed with "#non-monotonic-time". If the repeated 01:00-02:00 hour on
    the day DST ends is entered but no backwards jump of the logged time is
    seen, a warning is printed because the switch point is ambiguous.

    Raises Exception if the file name is not a yy.mm.dd date, or if a logged
    time falls into the hour that is skipped when DST starts.
    """
    # Match only the final path component so the check does not depend on the
    # path separator; the dots are literal.
    name_match = re.fullmatch(r"([0-9]{2})\.([0-9]{2})\.([0-9]{2})",
                              os.path.basename(filename))
    if name_match is None:
        raise Exception(f"Could not parse log file name {filename!r}")
    Y, m, d = (int(part) for part in name_match.groups())
    Y += 2000

    if Y <= 2006:
        # DST starts on the first Sunday in April at 02:00 PST
        enterdst = (4, nth_sunday_in(1, Y, 4))
        # DST ends on the last Sunday in October at 02:00 PDT, which isn't reached
        # and instead we continue from 01:00 PST
        leavedst = (10, last_sunday_in(Y, 10))
    else:
        # DST starts on the second Sunday in March at 02:00 PST
        enterdst = (3, nth_sunday_in(2, Y, 3))
        # DST ends on the first Sunday in November at 02:00 PDT, which isn't reached
        # and instead we continue from 01:00 PST
        leavedst = (11, nth_sunday_in(1, Y, 11))

    with open(filename, "rb") as f:
        prevHMS = None  # logged time
        prevYmdHMS2 = None  # UTC time
        switchedToWinter = None  # only ever used on 'leavedst' day

        for line in f:
            # endswith() rather than line[-1]: a blank line is b"" once its
            # newline is stripped, and indexing it would raise IndexError
            if line.endswith(b"\n"): line = line[:-1]
            if line.endswith(b"\r"): line = line[:-1]
            (H, M, S), text = parse_line(filename, line)
            if text is None: continue  # meta-marker

            if (m, d) < enterdst or \
                    ((m, d) == enterdst and (H, M, S) < (2, 0, 0)) or \
                    ((m, d) == leavedst and (H, M, S) >= (2, 0, 0)) or \
                    (m, d) > leavedst:
                # PST = UTC-8; add because interpreting a PST time as UTC
                # leaves you 8 hours in the past of what it should be
                timestamp = calendar.timegm((Y, m, d, H, M, S)) + 8*3600

            elif ((m, d) == enterdst and (H, M, S) >= (3, 0, 0)) or \
                    ((m, d) > enterdst and (m, d) < leavedst) or \
                    ((m, d) == leavedst and (H, M, S) < (1, 0, 0)):
                # PDT = UTC-7
                timestamp = calendar.timegm((Y, m, d, H, M, S)) + 7*3600

            elif (m, d) == enterdst and (2, 0, 0) <= (H, M, S) < (3, 0, 0):
                raise Exception(f"Invalid time in limbo between PST and PDT: {filename!r}: {line!r}")

            elif (m, d) == leavedst and (1, 0, 0) <= (H, M, S) < (2, 0, 0):
                # ambiguous interval: either before or after the PDT->PST switch
                if switchedToWinter is None: switchedToWinter = False  # mark that we entered here
                if prevHMS is not None and (H, M, S) < prevHMS:  # time warp means that's the switch
                    switchedToWinter = True
                if not switchedToWinter:
                    timestamp = calendar.timegm((Y, m, d, H, M, S)) + 7*3600  # PDT
                else:
                    timestamp = calendar.timegm((Y, m, d, H, M, S)) + 8*3600  # PST

            Y2, m2, d2, H2, M2, S2, *_ = time.gmtime(timestamp)
            if prevYmdHMS2 is not None and ((Y2, m2, d2), (H2, M2, S2)) < prevYmdHMS2:
                print(f"Time reversal: {line!r} ({filename})")
                # keep the event, but at the last good time so the output stays ordered
                logger.addevent(*prevYmdHMS2, b"#non-monotonic-time " + line)
            else:
                logger.addevent((Y2, m2, d2), (H2, M2, S2), text)
                prevYmdHMS2 = ((Y2, m2, d2), (H2, M2, S2))

            prevHMS = (H, M, S)

        # False (as opposed to None) means the ambiguous hour was entered but
        # the jump back to 01:00 was never observed
        if switchedToWinter is False:
            print(f"{filename!r}: ambiguous PDT->PST switch")

# Convert every file in <logdir>. sorted() puts the zero-padded yy.mm.dd names
# in chronological order; the Logger writes a day's file as soon as the date
# changes, so events have to arrive in time order.
print("Converting")
logger = Logger(outdir)
for filename in sorted(os.listdir(logdir)):
    convert_logfile(os.path.join(logdir, filename), logger)
# flush the events of the final day, which no later date change will write out
logger.writeout()

#  print("Checking")
#  def enumerate_channel(chandir):
#      events = []
#      for filename in sorted(os.listdir(chandir)):
#          with open(os.path.join(chandir, filename), "rb") as f:
#              lines = f.read().split(b"\n")
#          if lines[-1] == b"": lines = lines[:-1]

#          def compute_stamp(line):
#              timepart = line[:10].decode("ascii")
#              Y, m, d, H, M, S, *_ = re.match(r"^(....)-(..)-(..)\.log\|\[(..):(..):(..)\]$", filename + "|" + timepart).groups()
#              return calendar.timegm((int(Y), int(m), int(d), int(H), int(M), int(S)))

#          events += [(compute_stamp(line), line[line.index(b" "[0])+1:])
#                     for line in lines]
#      return events

#  for network in os.listdir(logdir):
#      for channel in os.listdir(os.path.join(logdir, network)):
#          src_events = enumerate_channel(os.path.join(logdir, network, channel))
#          out_events = enumerate_channel(os.path.join(outdir, network, channel))

#          assert len(src_events) == len(out_events)
#          assert [ev[1] for ev in src_events] == [ev[1] for ev in out_events]
#          out_prev_stamp = 0
#          for i in range(len(src_events)):
#              assert abs(src_events[i][0] - out_events[i][0]) in [3600, 7200]
#              if out_events[i][0] < out_prev_stamp:
#                  print(src_events[i])
#                  print(out_events[i])
#                  sys.exit(1)
#              out_prev_stamp = out_events[i][0]

#  print("OK")