summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.gitignore5
-rw-r--r--.gitmodules3
-rw-r--r--Makefile28
m---------cpp-window0
-rw-r--r--main.cpp261
5 files changed, 297 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..66746e6
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+obj/
+compile_commands.json
+*.mp4
+*.wav
+recog
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..b04a97b
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "cpp-window"]
+ path = cpp-window
+ url = git@tomsmeding.com:cpp-window
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..5e22976
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,28 @@
+# Build script for the `recog` binary: compiles every top-level *.cpp file
+# into $(OBJDIR) and links the resulting objects together.
+
+# Toolchain and base flags. libsndfile and FFTW3 are linked by name; the SDL2
+# compile and link flags are obtained from pkg-config below.
+CXX = g++
+CXXFLAGS = -Wall -Wextra -std=c++17 -O2 -g
+LDFLAGS = -lsndfile -lfftw3
+TARGET = recog
+
+CXXFLAGS += $(shell pkg-config --cflags sdl2)
+LDFLAGS += $(shell pkg-config --libs sdl2)
+
+# Directory that receives the object files (removed again by `make clean`).
+OBJDIR = obj
+
+.PHONY: all clean
+
+all: $(TARGET)
+
+# Remove the linked binary and all object files.
+clean:
+ @echo "Cleaning"
+ @rm -f $(TARGET)
+ @rm -rf $(OBJDIR)
+
+
+# Compile one source file. Every object depends on all top-level headers, so
+# touching any *.h in this directory rebuilds every object.
+# NOTE(review): the wildcard only matches headers in this directory; headers
+# in subdirectories (e.g. cpp-window/window.h, which main.cpp includes) are
+# not tracked as dependencies.
+$(OBJDIR)/%.o: %.cpp $(wildcard *.h)
+ @mkdir -p $(OBJDIR)
+ @echo "CXX $<"
+ @$(CXX) $(CXXFLAGS) -c -o $@ $<
+
+# Link the objects of all top-level *.cpp files; the libraries come after the
+# objects on the command line.
+$(TARGET): $(patsubst %.cpp,$(OBJDIR)/%.o,$(wildcard *.cpp))
+ @echo "LD -o $@"
+ @$(CXX) -o $@ $^ $(LDFLAGS)
diff --git a/cpp-window b/cpp-window
new file mode 160000
+Subproject dccf7bf421ca62ec8f1c470be4aa72d60dd7201
diff --git a/main.cpp b/main.cpp
new file mode 100644
index 0000000..3a919c7
--- /dev/null
+++ b/main.cpp
@@ -0,0 +1,261 @@
+#include <iostream>
+#include <vector>
+#include <stdexcept>
+#include <complex>
+#include <cstdlib>
+#include <cstring>
+#include <cmath>
+#include <cassert>
+#include <sndfile.h>
+#include <fftw3.h>
+#include "cpp-window/window.h"
+
+
+// An audio file decoded completely into memory as interleaved double samples:
+// the sample of channel c in frame f is stored at data[nch * f + c].
+class Audio {
+public:
+    // Opens `fname` with libsndfile and reads all of its frames.
+    //
+    // If the file cannot be opened, libsndfile's error text is printed to
+    // stderr and the process exits with status 1. Throws std::runtime_error
+    // when the number of frames read differs from the number announced in the
+    // file header, or when closing the file reports an error.
+    Audio(const char *fname) {
+        // libsndfile requires format == 0 when opening a file for reading;
+        // value-initialising the struct zeroes every field.
+        SF_INFO sf_info{};
+        SNDFILE *sndfile = sf_open(fname, SFM_READ, &sf_info);
+        if (!sndfile) {
+            std::cerr << sf_strerror(nullptr) << std::endl;
+            exit(1);
+        }
+
+        // Scope guard: closes the handle on every exceptional exit below
+        // (short read, allocation failure in resize). It is disarmed on the
+        // success path, where the result of sf_close() is checked explicitly.
+        struct Closer {
+            SNDFILE *handle;
+            ~Closer() { if (handle) sf_close(handle); }
+        } closer{sndfile};
+
+        nch = sf_info.channels;
+        nfr = sf_info.frames;
+        srate = sf_info.samplerate;
+
+        data.resize(nch * nfr);
+
+        const sf_count_t nread = sf_readf_double(sndfile, data.data(), nfr);
+        if (nread < 0 || static_cast<size_t>(nread) != nfr) {
+            throw std::runtime_error("Failed to parse audio file");
+        }
+
+        closer.handle = nullptr;
+        const int err = sf_close(sndfile);
+        if (err != 0) throw std::runtime_error(sf_error_number(err));
+    }
+
+    // Number of channels, number of frames per channel, and frames per second.
+    size_t channels() const { return nch; }
+    size_t frames() const { return nfr; }
+    size_t sample_rate() const { return srate; }
+
+    // Sample of `channel` at `frame`; both indices are checked by assert only.
+    double at(size_t channel, size_t frame) const {
+        assert(channel < nch && frame < nfr);
+        return data[nch * frame + channel];
+    }
+
+    class Channel;
+
+    // Returns a view of a single channel. The view refers to this object and
+    // must not be used after this Audio has been destroyed.
+    Channel channel(size_t channel) const {
+        assert(channel < nch);
+        return Channel(*this, channel);
+    }
+
+    // Non-owning, read-only view of one channel of an Audio object. Instances
+    // can only be obtained through Audio::channel().
+    class Channel {
+    public:
+        double at(size_t frame) const { return audio.at(ch, frame); }
+        size_t frames() const { return audio.frames(); }
+        size_t sample_rate() const { return audio.sample_rate(); }
+
+    private:
+        Channel(const Audio &audio, size_t ch) : audio{audio}, ch{ch} {}
+
+        const Audio &audio;  // borrowed; the Audio must outlive this view
+        const size_t ch;
+
+        friend Channel Audio::channel(size_t) const;
+    };
+
+private:
+    size_t srate, nch, nfr;
+    std::vector<double> data;  // interleaved samples, nch * nfr entries
+};
+
+// Owns one FFTW plan for a real-to-halfcomplex transform (FFTW_R2HC) of
+// length N together with its input and output buffers.
+//
+// The class is move-only: the destructor releases the buffers and the plan,
+// so a copy would free them twice. The move constructor leaves the source
+// empty, which makes returning an instance from plan() safe even when the
+// compiler does not elide the return.
+class FFTW {
+public:
+    // Allocates the buffers and creates the plan for transforms of length N.
+    // Throws std::runtime_error if an allocation or the planning fails.
+    // The plan is created with FFTW_MEASURE, which may overwrite the buffers
+    // while planning, so input() must be filled only after plan() returns.
+    static FFTW plan(size_t N) {
+        FFTW fftw{N};
+        // On any throw below, the destructor of `fftw` releases whatever has
+        // been acquired so far.
+        fftw.in = fftw_alloc_real(N);
+        if (!fftw.in) throw std::runtime_error("FFTW: failed to allocate input buffer");
+        fftw.out = fftw_alloc_real(N);
+        if (!fftw.out) throw std::runtime_error("FFTW: failed to allocate output buffer");
+        // fftw_plan_r2r_1d takes the length as int.
+        fftw.pl = fftw_plan_r2r_1d(static_cast<int>(N), fftw.in, fftw.out, FFTW_R2HC, FFTW_MEASURE);
+        if (!fftw.pl) throw std::runtime_error("FFTW: failed to create plan");
+        return fftw;
+    }
+
+    // Takes over the buffers and the plan of `other`, leaving it empty.
+    FFTW(FFTW &&other) noexcept
+        : N{other.N}, in{other.in}, out{other.out}, pl{other.pl}
+    {
+        other.in = nullptr;
+        other.out = nullptr;
+        other.pl = nullptr;
+    }
+
+    FFTW(const FFTW &) = delete;
+    FFTW &operator=(const FFTW &) = delete;
+    FFTW &operator=(FFTW &&) = delete;  // N is const, so assignment is not possible
+
+    ~FFTW() {
+        if (in) fftw_free(in);
+        if (out) fftw_free(out);
+        if (pl) fftw_destroy_plan(pl);
+    }
+
+    // Transform length, and the buffers read and written by execute().
+    // Each buffer holds length() doubles.
+    size_t length() const { return N; }
+    double* input() { return in; }
+    double* output() { return out; }
+
+    // Transforms the current contents of input() into output().
+    void execute() { fftw_execute(pl); }
+
+private:
+    FFTW(size_t N) : N{N} {}
+
+    const size_t N;
+    double *in = nullptr, *out = nullptr;
+    fftw_plan pl = nullptr;
+};
+
+// Sequence of spectra of one audio channel. The channel is cut into
+// consecutive, non-overlapping windows of `resol` frames (a trailing partial
+// window is dropped) and each window is transformed separately.
+class Spectrogram {
+public:
+    // Computes all spectra up front. Each spectrum has resol / 2 + 1 complex
+    // bins, unpacked from FFTW's halfcomplex output layout.
+    Spectrogram(const Audio::Channel &chan, size_t resol)
+        : resol{resol}
+        , samplerate{chan.sample_rate()}
+        , specs(chan.frames() / resol)
+    {
+        FFTW fftw = FFTW::plan(resol);
+        double *const window = fftw.input();
+        const double *const halfcomplex = fftw.output();
+
+        const size_t middle_bin = resol / 2;
+        // Bins 1 .. paired_end-1 have both a real and an imaginary part.
+        const size_t paired_end = (resol + 1) / 2;
+
+        size_t first_frame = 0;
+        for (std::vector<std::complex<double>> &bins : specs) {
+            for (size_t k = 0; k < resol; k++) {
+                window[k] = chan.at(first_frame + k);
+            }
+            first_frame += resol;
+
+            fftw.execute();
+
+            bins.resize(middle_bin + 1);
+            // Bin 0 is purely real.
+            bins[0] = halfcomplex[0];
+            // Real part at index k, imaginary part at index resol - k.
+            for (size_t k = 1; k < paired_end; k++) {
+                bins[k] = std::complex<double>(halfcomplex[k], halfcomplex[resol - k]);
+            }
+            // For even lengths the middle bin is purely real as well.
+            if (resol % 2 == 0) {
+                bins[middle_bin] = halfcomplex[middle_bin];
+            }
+        }
+    }
+
+    // Number of spectra (windows) and number of frames per window.
+    size_t length() const { return specs.size(); }
+    size_t resolution() const { return resol; }
+
+    // Spectrum of window `index`; the index is not range-checked.
+    const std::vector<std::complex<double>>& at(size_t index) const {
+        return specs[index];
+    }
+
+    // Frequency in Hz that corresponds to bin `spec_idx` of a spectrum.
+    double to_hz(size_t spec_idx) const {
+        const double rate = static_cast<double>(samplerate);
+        return spec_idx * rate / resol;
+    }
+
+    // Bin index for a frequency in Hz, rounded down. The result is not
+    // clamped and may lie beyond the end of a spectrum.
+    size_t from_hz(double hz) const {
+        const double bin = hz * resol / samplerate;
+        return static_cast<size_t>(bin);
+    }
+
+private:
+    const size_t resol;
+    const size_t samplerate;
+    std::vector<std::vector<std::complex<double>>> specs;
+};
+
+// Converts a frequency in Hz to a (fractional) key number: 27.5 Hz maps to
+// key 1 and every doubling of the frequency adds 12 keys.
+// (Presumably 88-key piano numbering; main() draws keys 1..88.)
+double hz_to_key(double hz) {
+    const double octaves_above_key1 = std::log2(hz / 27.5);
+    return octaves_above_key1 * 12 + 1;
+}
+
+// Inverse of hz_to_key: converts a (fractional) key number to a frequency in
+// Hz. Key 1 maps to 27.5 Hz and every 12 keys double the frequency.
+double key_to_hz(double key) {
+    const double octaves_above_key1 = (key - 1) / 12;
+    return std::pow(2.0, octaves_above_key1) * 27.5;
+}
+
+// Entry point. Loads the audio file named on the command line, computes a
+// spectrogram of channel 0 with windows of 4096 frames, and displays the part
+// between start_sec and end_sec in a resizable window. Time runs along the x
+// axis and keys 1..88 along the y axis (key 1 at the bottom). Pressing `q`
+// closes the window.
+//
+// Returns 1 on a usage error or when the audio is too short to yield a single
+// spectrogram window.
+int main(int argc, char **argv) {
+    if (argc != 2) {
+        std::cerr << "Usage: " << argv[0] << " <video.wav>" << std::endl;
+        return 1;
+    }
+
+    const char *audio_fname = argv[1];
+
+    std::cout << "Reading audio file..." << std::flush;
+    const Audio audio{audio_fname};
+    std::cout << " done" << std::endl;
+
+    std::cout << "Channels: " << audio.channels();
+    if (audio.channels() > 1) std::cout << " (choosing channel 0)";
+    std::cout << std::endl;
+
+    Audio::Channel chan = audio.channel(0);
+    std::cout << "Frames: " << chan.frames() << std::endl;
+    std::cout << "Sample rate: " << chan.sample_rate() << std::endl;
+    std::cout << "Duration: " << static_cast<double>(chan.frames()) / chan.sample_rate() << "s" << std::endl;
+
+    std::cout << "Creating spectrogram..." << std::flush;
+    Spectrogram spectro{chan, 4096};
+    std::cout << " done" << std::endl;
+
+    // Everything below indexes into the spectrogram, so there must be at
+    // least one window (the audio must have at least 4096 frames).
+    if (spectro.length() == 0) {
+        std::cerr << "Audio is too short to compute a spectrogram" << std::endl;
+        return 1;
+    }
+
+    const double start_sec = 9, end_sec = 21;
+    // const double start_sec = 6*60+1, end_sec = 6*60+24;
+    const double start_frame = start_sec * chan.sample_rate();
+    const double end_frame = end_sec * chan.sample_rate();
+
+    // Range of keys shown, and the matching range of spectrum bins. These do
+    // not depend on the window size, so they are computed once instead of on
+    // every redraw.
+    const double low_key = 1;
+    const double high_key = 88;
+    const double low_hz = key_to_hz(low_key);
+    const double high_hz = key_to_hz(high_key);
+    // + 1 because of rounding down
+    const size_t low_idx = spectro.from_hz(low_hz) + 1;
+    const size_t high_idx = std::min(spectro.at(0).size(), spectro.from_hz(high_hz));
+
+    Window{"Audio", 640, 480, Window::Opts{}.resizable(true)}.event_loop(
+        [&](const SDL_Event &e) {
+            if (e.type == SDL_KEYDOWN && e.key.keysym.sym == SDLK_q) {
+                return Window::ACT_STOP;
+            }
+            return Window::ACT_OK;
+        },
+        [&](Window::Buffer &buffer) {
+            buffer.clear({0, 0, 0});
+
+            using Clr = Window::Buffer::Clr;
+
+            // Plots a vertical bar in column x from y1 to y2 (y1 <= y2): the
+            // two end points plus every integer row in between.
+            const auto draw_bar = [&buffer](float x, float y1, float y2, Clr clr) {
+                buffer.plotf(x, y1, clr);
+                for (int y = std::ceil(y1); y <= y2; y++) buffer.plotf(x, y, clr);
+                buffer.plotf(x, y2, clr);
+            };
+
+            // Maps low_key to the bottom row and high_key to row 0.
+            const auto key_to_y = [&](double key) -> double {
+                return buffer.height() - 1 - (key - low_key) / (high_key - low_key) * (buffer.height() - 1);
+            };
+
+            // Number of steps between the first and the last column; at least
+            // 1 so that a one-pixel-wide buffer does not divide by zero.
+            const double x_steps = buffer.width() > 1 ? buffer.width() - 1 : 1;
+
+            for (int x = 0; x < buffer.width(); x++) {
+                const double frame = start_frame + x * (end_frame - start_frame) / x_steps;
+                const size_t si = static_cast<size_t>(frame / spectro.resolution());
+
+                // Columns past the end of the audio stay black instead of
+                // reading beyond the last spectrum.
+                if (si < spectro.length()) {
+                    const std::vector<std::complex<double>> &spec = spectro.at(si);
+
+                    for (size_t j = low_idx; j < high_idx; j++) {
+                        const double key1 = hz_to_key(spectro.to_hz(j));
+                        const double key2 = hz_to_key(spectro.to_hz(j+1));
+                        const float y1 = key_to_y(key1);
+                        const float y2 = key_to_y(key2);
+                        const float s = std::abs(spec[j]) / 8;
+                        // Squash the magnitude into [0, 1) for use as intensity.
+                        const float alpha = s / (s + 1);
+                        const Clr clr = Clr{255*alpha, 100*alpha, 100*alpha};
+                        // key2 > key1, so y2 is the upper (smaller) coordinate.
+                        draw_bar(x, y2, y1, clr);
+                    }
+                }
+
+                // White marker dots at every 12th key, starting at key 4, in
+                // every 100th column.
+                if (x % 100 == 0) {
+                    for (int key = 4; key <= 88; key += 12) {
+                        buffer.plotf(x, key_to_y(key), Clr{255, 255, 255});
+                    }
+                }
+            }
+        }
+    );
+}