summaryrefslogtreecommitdiff
path: root/log-tz-convert.py
blob: 85d3f54fd421ff01dfa21cac42aaae6884cd0d53 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
#!/usr/bin/env python3
import re, sys, os, calendar, time

# Command-line handling. Usage is printed when help is requested or when the
# argument count is wrong. Only an explicit, lone -h/--help counts as success;
# every other usage error exits with status 1.
_help_requested = len(sys.argv) > 1 and sys.argv[1] in ("-h", "--help")
if _help_requested or len(sys.argv) != 3:
    print(f"Usage: {sys.argv[0]} <logdir> <outdir>")
    print("The <logdir> is expected to contain network/#channel/YYYY-mm-dd.log files.")
    print("These will be converted from Europe/Amsterdam (i.e. CET/CEST) to UTC; the")
    print("output is written to <outdir>. The <logdir> is not changed.")
    sys.exit(0 if _help_requested and len(sys.argv) == 2 else 1)

logdir = sys.argv[1]
outdir = sys.argv[2]
# os.mkdir raises if <outdir> already exists, so a previous output tree is
# never silently overwritten.
os.mkdir(outdir)

class Logger:
    """Buffer events for one date at a time and write them to per-day log files.

    Events are collected via addevent(); whenever the date of an incoming
    event differs from the buffered date, the buffer is flushed to
    "<destdir>/YYYY-mm-dd.log". The caller must call writeout() once at the
    end to flush the last buffered day.
    """

    def __init__(self, destdir):
        """Create a logger that writes its day files into the directory *destdir*."""
        self.destdir = destdir
        self.curdate = None  # (Y, m, d) of the buffered events; None when the buffer is empty
        self.events = []  # [((H, M, S), text)], text being bytes without a trailing newline

    def addevent(self, ymd, hms, text):
        """Buffer one event, flushing the previous day first if the date changed.

        ymd is a (Y, m, d) tuple, hms a (H, M, S) tuple and text the line
        content as bytes.
        """
        if self.curdate is not None and self.curdate != ymd:
            self.writeout()

        if self.curdate is None:
            # The buffer is empty here (writeout() resets both fields together).
            self.curdate = ymd
        self.events.append((hms, text))

    def writeout(self):
        """Write the buffered day to its log file and reset the buffer.

        Does nothing when no events are buffered, so calling it for a channel
        that produced no events is safe. The file is opened in "wb" mode, i.e.
        an existing file for the same date is replaced.
        """
        if self.curdate is None:
            return

        (Y, m, d) = self.curdate
        path = os.path.join(self.destdir, f"{Y:04}-{m:02}-{d:02}.log")
        with open(path, "wb") as f:
            f.writelines(
                f"[{H:02}:{M:02}:{S:02}] ".encode("ascii") + text + b"\n"
                for (H, M, S), text in self.events
            )
        self.curdate = None
        self.events = []

def parse_line(line):
    """Split one raw log line into its time marker and its text.

    line is a bytes object of the form b"[HH:MM:SS] text", optionally ending
    in a single b"\\n". Returns ((H, M, S), text) where H, M and S are ints
    and text is the bytes following the separating space.

    Raises Exception if the first space is not at offset 10 (this includes
    empty lines) or if the first ten bytes are not a "[HH:MM:SS]" marker.
    """
    # endswith() is safe on empty input, unlike indexing line[-1].
    if line.endswith(b"\n"):
        line = line[:-1]
    if line.find(b" ") != 10:
        raise Exception(f"No space found in right spot on line: {line!r}")

    # Match on the raw bytes so that non-ASCII garbage in the marker is
    # reported as a parse failure instead of escaping as a decode error.
    marker = re.fullmatch(rb"\[([0-9]{2}):([0-9]{2}):([0-9]{2})\]", line[:10])
    if marker is None:
        raise Exception(f"Could not parse time marker on line: {line!r}")

    H, M, S = (int(part) for part in marker.groups())
    return ((H, M, S), line[11:])

def last_sunday_in(Y, m):
    """Return the day of the month on which the last Sunday of month m in year Y falls."""
    final_day = calendar.monthrange(Y, m)[1]  # second item is the number of days in the month
    # Number of days (0..6) to step back from the final day to land on a Sunday.
    days_past_sunday = (calendar.weekday(Y, m, final_day) - calendar.SUNDAY) % 7
    return final_day - days_past_sunday

# Import-time sanity checks: the last Sundays of March and October 2026 are
# the 29th and the 25th respectively.
assert last_sunday_in(2026, 3) == 29
assert last_sunday_in(2026, 10) == 25

def convert_logfile(filename, logger):
    """Convert one day's log from Europe/Amsterdam local time to UTC.

    filename must be a path ending in "/YYYY-mm-dd.log"; the date is taken
    from the file name and the time of day from each line's "[HH:MM:SS]"
    marker. Every line is passed to logger.addevent() with its UTC date and
    time, so lines may end up on the previous UTC day.

    Raises Exception if the file name cannot be parsed, if a line cannot be
    parsed, or if a line carries a local time that does not exist (the hour
    skipped when DST starts).

    Prints a warning when the file contains events in the repeated hour at
    the end of DST but no backwards time jump was seen, since it is then
    unknown where CEST ended; such events are treated as CEST.
    """
    name_match = re.match(r".*/([0-9]{4})-([0-9]{2})-([0-9]{2})\.log$", filename)
    if name_match is None:
        raise Exception(f"Could not parse log file name {filename!r}")
    Y, m, d = (int(part) for part in name_match.groups())
    md = (m, d)

    # DST starts on the last Sunday in March at 02:00 CET
    enterdst = (3, last_sunday_in(Y, 3))
    # DST ends on the last Sunday in October at 03:00 CEST, which isn't reached
    # and instead we continue from 02:00 CET
    leavedst = (10, last_sunday_in(Y, 10))

    # Seconds to subtract from a local time (read as if it were UTC) to get
    # real UTC: interpreting a CET/CEST time as UTC leaves you too far in the
    # future.
    cet_offset = 3600  # CET = UTC+1
    cest_offset = 2 * 3600  # CEST = UTC+2

    with open(filename, "rb") as f:
        prevHMS = None
        switchedToCET = None  # only ever used on 'leavedst' day

        for line in f:
            (H, M, S), text = parse_line(line)
            hms = (H, M, S)

            if md < enterdst or \
                    (md == enterdst and hms < (2, 0, 0)) or \
                    (md == leavedst and hms >= (3, 0, 0)) or \
                    md > leavedst:
                offset = cet_offset

            elif (md == enterdst and hms >= (3, 0, 0)) or \
                    (md > enterdst and md < leavedst) or \
                    (md == leavedst and hms < (2, 0, 0)):
                offset = cest_offset

            elif md == enterdst and (2, 0, 0) <= hms < (3, 0, 0):
                # This local hour is skipped when the clocks go forward.
                raise Exception(
                    f"Invalid time in limbo between CET and CEST: {filename!r}: {line!r}")

            elif md == leavedst and (2, 0, 0) <= hms < (3, 0, 0):
                # ambiguous interval: either before or after the CEST->CET switch
                if switchedToCET is None:
                    switchedToCET = False  # mark that we entered here
                # A time warp means that's the switch. prevHMS is None when
                # this is the first line of the file; there is nothing to
                # compare against then.
                if prevHMS is not None and hms < prevHMS:
                    switchedToCET = True
                offset = cet_offset if switchedToCET else cest_offset

            else:
                # The four cases above cover every (date, time) combination.
                raise AssertionError(f"Unclassified time in {filename!r}: {line!r}")

            timestamp = calendar.timegm((Y, m, d, H, M, S)) - offset
            Y2, m2, d2, H2, M2, S2, *_ = time.gmtime(timestamp)
            logger.addevent((Y2, m2, d2), (H2, M2, S2), text)

            prevHMS = hms

        # False (as opposed to None or True) means the repeated hour was
        # entered but no backwards jump was ever observed.
        if switchedToCET is False:
            print(f"{filename!r}: ambiguous CEST->CET switch")

# Conversion phase: mirror the <logdir>/network/channel tree under <outdir>,
# feeding each channel's day files in sorted (chronological) order through a
# single Logger so that events crossing a day boundary end up in the right file.
print("Converting")
for network in os.listdir(logdir):
    src_network_dir = os.path.join(logdir, network)
    dst_network_dir = os.path.join(outdir, network)
    os.mkdir(dst_network_dir)
    for channel in os.listdir(src_network_dir):
        src_channel_dir = os.path.join(src_network_dir, channel)
        dst_channel_dir = os.path.join(dst_network_dir, channel)
        os.mkdir(dst_channel_dir)
        logger = Logger(dst_channel_dir)
        for filename in sorted(os.listdir(src_channel_dir)):
            convert_logfile(os.path.join(src_channel_dir, filename), logger)
        # Flush the last buffered day of this channel.
        logger.writeout()

# Verification phase: the code below re-reads the source and output trees and
# compares them event by event.
print("Checking")
def enumerate_channel(chandir):
    """Read every log file in chandir and return its events in file order.

    Files are visited in sorted name order. The result is a list of
    (stamp, text) tuples, where stamp is calendar.timegm() of the date from
    the file name combined with the line's "[HH:MM:SS]" marker (i.e. the
    written time interpreted as UTC) and text is the bytes after the first
    space on the line.
    """
    stamp_pattern = r"^(....)-(..)-(..)\.log\|\[(..):(..):(..)\]$"
    space_byte = b" "[0]
    collected = []

    for filename in sorted(os.listdir(chandir)):
        with open(os.path.join(chandir, filename), "rb") as handle:
            rows = handle.read().split(b"\n")
        # A trailing newline leaves one empty chunk after the split; drop it.
        if rows[-1] == b"":
            rows.pop()

        for row in rows:
            marker = row[:10].decode("ascii")
            # Match file name and time marker in one go: "YYYY-mm-dd.log|[HH:MM:SS]".
            fields = re.match(stamp_pattern, filename + "|" + marker).groups()
            stamp = calendar.timegm(tuple(int(field) for field in fields))
            collected.append((stamp, row[row.index(space_byte) + 1:]))

    return collected

# For every channel, compare the source events with the converted events:
# same number of events, identical texts in the same order, every timestamp
# shifted by exactly one or two hours, and output timestamps never decreasing.
# Failures are raised explicitly (rather than via `assert` statements) so the
# verification still runs when Python is started with -O.
for network in os.listdir(logdir):
    for channel in os.listdir(os.path.join(logdir, network)):
        src_events = enumerate_channel(os.path.join(logdir, network, channel))
        out_events = enumerate_channel(os.path.join(outdir, network, channel))
        where = f"{network}/{channel}"

        if len(src_events) != len(out_events):
            raise AssertionError(
                f"{where}: {len(src_events)} source events but {len(out_events)} output events")

        for index, (src_event, out_event) in enumerate(zip(src_events, out_events)):
            if src_event[1] != out_event[1]:
                raise AssertionError(
                    f"{where}: text of event {index} differs: {src_event[1]!r} != {out_event[1]!r}")

        out_prev_stamp = 0
        for index, (src_event, out_event) in enumerate(zip(src_events, out_events)):
            if abs(src_event[0] - out_event[0]) not in (3600, 7200):
                raise AssertionError(
                    f"{where}: event {index} shifted by {src_event[0] - out_event[0]} seconds: "
                    f"{src_event!r} -> {out_event!r}")
            if out_event[0] < out_prev_stamp:
                # Output went back in time; show the offending pair and fail.
                print(src_event)
                print(out_event)
                sys.exit(1)
            out_prev_stamp = out_event[0]

print("OK")