Correct Implementation of WASAPI?

I'm trying to implement a WASAPI backend for Windows, and I saw this thread from earlier with this code example. But Martins says this isn't the right way to use WASAPI, so my main question is: why not?

--

Here is my attempt at an implementation. Is this a correct way to write a WASAPI backend?

//
// File: wasapi.cpp
//
// Build command:
// cl /MD /Od -nologo -Zo -Z7 wasapi.cpp /link -subsystem:windows -incremental:no -opt:ref -OUT:wasapi.exe
//

#define WIN32_LEAN_AND_MEAN
#define VC_EXTRALEAN
#define NOMINMAX
#include <windows.h>

#include <mmsystem.h>
#include <mmdeviceapi.h>
#include <audioclient.h>

#include <stdint.h>
typedef uint16_t  u16;
typedef int16_t   i16;
typedef uint32_t  u32;
typedef int32_t   i32;
typedef uint64_t  u64;
typedef int64_t   i64;
typedef float     f32;
typedef double    f64;

#define _USE_MATH_DEFINES
#include <math.h>
#include <assert.h>

#define TAU (M_PI * 2)

#pragma comment(lib, "ole32.lib")
#pragma comment(lib, "winmm.lib")

#define SafeRelease(ppT) do { if (*ppT) { (*ppT)->Release(); *ppT = NULL; } } while(0)


static void output_test_sine_wave_f32(i32 samples_per_second, i32 sample_count, f32 *samples, f32 tone_hz = 440, f32 tone_volume = 0.5f) {
  static f64 t_sine = 0;
  // NOTE: compute the phase step directly from the frequency; truncating
  // samples_per_second / tone_hz to an int detunes the output slightly.
  f64 phase_step = TAU * (f64)tone_hz / (f64)samples_per_second;

  f32 *sample_out = samples;
  for (int sample_index = 0; sample_index < sample_count; sample_index++) {
    f32 sine_value = (f32)sin(t_sine);
    f32 sample_value = sine_value * tone_volume;
    *sample_out++ = sample_value; // left channel
    *sample_out++ = sample_value; // right channel

    t_sine += phase_step;
    if (t_sine >= TAU) {
      t_sine -= TAU;
    }
  }
}

DWORD WINAPI wasapi__run(void *Passthrough)
{
    HRESULT hr;
    hr = CoInitializeEx(0, COINIT_SPEED_OVER_MEMORY);
    assert(SUCCEEDED(hr));

    IMMDevice* pDevice = NULL;
    IMMDeviceEnumerator* mmDeviceEnumerator = NULL;

    hr = CoCreateInstance(__uuidof(MMDeviceEnumerator), NULL, CLSCTX_INPROC_SERVER, IID_PPV_ARGS(&mmDeviceEnumerator));
    assert(SUCCEEDED(hr));

    hr = mmDeviceEnumerator->GetDefaultAudioEndpoint(eRender, eConsole, &pDevice);
    assert(SUCCEEDED(hr));

    IAudioClient* pAudioClient = NULL;
    hr = pDevice->Activate(__uuidof(IAudioClient), CLSCTX_INPROC_SERVER, NULL, (void**)(&pAudioClient));
    assert(SUCCEEDED(hr));

    WAVEFORMATEX* pMixFormat = NULL;
    hr = pAudioClient->GetMixFormat(&pMixFormat);
    assert(SUCCEEDED(hr));
    // NOTE: this backend only handles the common shared-mode mix format
    // (stereo, 32-bit float); see the note at the bottom of the post.
    assert(pMixFormat->nChannels == 2);
    assert(pMixFormat->wBitsPerSample == 32);


    REFERENCE_TIME BufferDuration = 30 * 10000; // 30ms, in 100-nanosecond REFERENCE_TIME units
    DWORD flags = AUDCLNT_STREAMFLAGS_EVENTCALLBACK | AUDCLNT_STREAMFLAGS_NOPERSIST;
    hr = pAudioClient->Initialize(AUDCLNT_SHAREMODE_SHARED, flags, BufferDuration, 0, pMixFormat, NULL);
    assert(SUCCEEDED(hr));

    IAudioRenderClient* pRenderClient = NULL;
    hr = pAudioClient->GetService(IID_PPV_ARGS(&pRenderClient));
    assert(SUCCEEDED(hr));

    UINT32 bufferFrameCount = 0;
    hr = pAudioClient->GetBufferSize(&bufferFrameCount);
    assert(SUCCEEDED(hr));

    // NOTE: auto-reset event that WASAPI signals whenever it wants more data.
    HANDLE hRefillEvent = CreateEventEx(NULL, NULL, 0, EVENT_MODIFY_STATE | SYNCHRONIZE);
    assert(hRefillEvent != NULL);
    hr = pAudioClient->SetEventHandle(hRefillEvent);
    assert(SUCCEEDED(hr));

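    // NOTE: pre-fill the whole buffer with silence so the stream starts clean.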
    {
      BYTE *data = NULL;
      hr = pRenderClient->GetBuffer(bufferFrameCount, &data);
      assert(SUCCEEDED(hr));

      hr = pRenderClient->ReleaseBuffer(bufferFrameCount, AUDCLNT_BUFFERFLAGS_SILENT);
      assert(SUCCEEDED(hr));
    }

    u64 total_frames_written = 0;

    hr = pAudioClient->Start();
    assert(SUCCEEDED(hr));
    while (true) {
      auto res = WaitForSingleObject(hRefillEvent, INFINITE);

      if (res == WAIT_OBJECT_0)
      {
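        // NOTE(debug): query IAudioClock so we can compare the stream position against the frames we've written.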
        #if 1
        {
          IAudioClock* pAudioClock = NULL;
          pAudioClient->GetService(IID_PPV_ARGS(&pAudioClock));
          if (pAudioClock)
          {
              UINT64 freq = 0;
              pAudioClock->GetFrequency(&freq);
              UINT64 position = 0;
              pAudioClock->GetPosition(&position, NULL);
              double sec = (double)(position) / (double)(freq);

              u64 total_bytes_written = total_frames_written * pMixFormat->nBlockAlign;
              //printf("AudioClock: freq=%lldHz, pos=%10lld, %10.5fsec -- Total Bytes Written: %10lld \n", freq, position, sec, total_bytes_written);
              //assert(total_bytes_written >= position);
              pAudioClock->Release();
          }
        }
        #endif

        // NOTE(nick): poll for a default-device change by comparing endpoint IDs.
        bool default_device_changed = false;
        {
            IMMDevice* currentDefaultDevice = NULL;
            hr = mmDeviceEnumerator->GetDefaultAudioEndpoint(eRender, eConsole, &currentDefaultDevice);
            if (SUCCEEDED(hr))
            {
                LPWSTR id1 = NULL;
                LPWSTR id2 = NULL;
                pDevice->GetId(&id1);
                currentDefaultDevice->GetId(&id2);

                default_device_changed = wcscmp(id1, id2) != 0;

                CoTaskMemFree(id1);
                CoTaskMemFree(id2);
                currentDefaultDevice->Release();
            }
        }

        // See how much buffer space is available.
        UINT32 numFramesPadding = 0;
        hr = pAudioClient->GetCurrentPadding(&numFramesPadding);

        // NOTE(nick): check for device change
        if (hr == AUDCLNT_E_DEVICE_INVALIDATED || default_device_changed)
        {
            hr = pAudioClient->Stop();
            assert(SUCCEEDED(hr));

            SafeRelease(&pRenderClient);
            SafeRelease(&pAudioClient);
            SafeRelease(&pDevice);

            {
                hr = mmDeviceEnumerator->GetDefaultAudioEndpoint(eRender, eConsole, &pDevice);
                assert(SUCCEEDED(hr));

                hr = pDevice->Activate(__uuidof(IAudioClient), CLSCTX_INPROC_SERVER, NULL, (void**)(&pAudioClient));
                assert(SUCCEEDED(hr));

                // NOTE: free the previous mix format first, otherwise it leaks
                // on every device change.
                CoTaskMemFree(pMixFormat);
                pMixFormat = NULL;
                hr = pAudioClient->GetMixFormat(&pMixFormat);
                assert(SUCCEEDED(hr));
                assert(pMixFormat->nChannels == 2);
                assert(pMixFormat->wBitsPerSample == 32);

                hr = pAudioClient->Initialize(AUDCLNT_SHAREMODE_SHARED, flags, BufferDuration, 0, pMixFormat, NULL);
                assert(SUCCEEDED(hr));

                hr = pAudioClient->GetService(IID_PPV_ARGS(&pRenderClient));
                assert(SUCCEEDED(hr));

                hr = pAudioClient->GetBufferSize(&bufferFrameCount);
                assert(SUCCEEDED(hr));

                hr = pAudioClient->SetEventHandle(hRefillEvent);
                assert(SUCCEEDED(hr));
            } 

            hr = pAudioClient->Start();
            continue;
        }

        // NOTE(nick): output sound. SampleCount is really a frame count:
        // one frame holds one sample for each channel.
        UINT32 SampleCount = bufferFrameCount - numFramesPadding;
        if (SampleCount > 0)
        {
          // Grab all the available space in the shared buffer and fill it.
          BYTE *data = NULL;
          hr = pRenderClient->GetBuffer(SampleCount, &data);
          assert(SUCCEEDED(hr));

          f32 *samples = (f32 *)data;
          output_test_sine_wave_f32((i32)pMixFormat->nSamplesPerSec, (i32)SampleCount, samples, 440, 0.5f);

          hr = pRenderClient->ReleaseBuffer(SampleCount, 0);
          assert(SUCCEEDED(hr));
          total_frames_written += SampleCount;
        }
      }
    }
}

int APIENTRY WinMain(HINSTANCE instance, HINSTANCE prev_inst, LPSTR cmd_line, int show_code)
{
    HANDLE stdout_handle = GetStdHandle(STD_OUTPUT_HANDLE);
    if (!stdout_handle || stdout_handle == INVALID_HANDLE_VALUE) {
        AttachConsole(ATTACH_PARENT_PROCESS);
    }

    DWORD thread_id = 0;
    HANDLE handle = CreateThread(0, 0, wasapi__run, 0, 0, &thread_id);
    SetThreadPriority(handle, THREAD_PRIORITY_TIME_CRITICAL);
    WaitForSingleObject(handle, INFINITE);
    return 0;
}

Note that I haven't implemented support for arbitrary mix formats. I know that WASAPI really wants you to register a notification callback (IMMNotificationClient, via IMMDeviceEnumerator::RegisterEndpointNotificationCallback) to listen for device changes, but I much prefer straight-line code, so I tried to check for the default device change manually instead.
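Here's roughly what I understand the callback version to look like (just an untested sketch to show the shape of it; the global flag and how it gets polled are my own choices, and it assumes the same headers as the listing above):

// Sketch: an IMMNotificationClient that sets a flag when the default render
// device changes. The audio thread would poll g_default_device_changed each
// wakeup instead of comparing endpoint IDs. The object lives for the whole
// program, so Release() doesn't need to delete anything.
static volatile LONG g_default_device_changed = 0;

struct DeviceNotificationClient : public IMMNotificationClient {
    LONG refcount = 1;

    // IUnknown
    STDMETHODIMP QueryInterface(REFIID riid, void **ppv) {
        if (riid == __uuidof(IUnknown) || riid == __uuidof(IMMNotificationClient)) {
            *ppv = this;
            AddRef();
            return S_OK;
        }
        *ppv = NULL;
        return E_NOINTERFACE;
    }
    STDMETHODIMP_(ULONG) AddRef()  { return InterlockedIncrement(&refcount); }
    STDMETHODIMP_(ULONG) Release() { return InterlockedDecrement(&refcount); }

    // IMMNotificationClient
    STDMETHODIMP OnDefaultDeviceChanged(EDataFlow flow, ERole role, LPCWSTR) {
        if (flow == eRender && role == eConsole) {
            InterlockedExchange(&g_default_device_changed, 1);
        }
        return S_OK;
    }
    STDMETHODIMP OnDeviceStateChanged(LPCWSTR, DWORD)               { return S_OK; }
    STDMETHODIMP OnDeviceAdded(LPCWSTR)                             { return S_OK; }
    STDMETHODIMP OnDeviceRemoved(LPCWSTR)                           { return S_OK; }
    STDMETHODIMP OnPropertyValueChanged(LPCWSTR, const PROPERTYKEY) { return S_OK; }
};

// Registered once, right after creating the enumerator:
//   static DeviceNotificationClient notification_client;
//   mmDeviceEnumerator->RegisterEndpointNotificationCallback(&notification_client);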

I'm also not totally sure about the SampleCount / frame timing. Should I do something more like what Casey did with DirectSound, or is this mostly handled by the GetBuffer / ReleaseBuffer calls?

Any feedback would be helpful!

You can pass the AUDCLNT_STREAMFLAGS_AUTOCONVERTPCM flag to Initialize() and it will automatically convert whatever format you want to use to the native mix format.
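For example, something like this in place of the Initialize() call in your listing (a sketch; the 16-bit / 48 kHz stereo format below is just an arbitrary choice to show the flag):

// Sketch: submit fixed-format PCM and let WASAPI convert to the mix format
// internally. The format values here are arbitrary, for illustration only.
WAVEFORMATEX format = {};
format.wFormatTag      = WAVE_FORMAT_PCM;
format.nChannels       = 2;
format.nSamplesPerSec  = 48000;
format.wBitsPerSample  = 16;
format.nBlockAlign     = format.nChannels * format.wBitsPerSample / 8;
format.nAvgBytesPerSec = format.nSamplesPerSec * format.nBlockAlign;

DWORD flags = AUDCLNT_STREAMFLAGS_EVENTCALLBACK
            | AUDCLNT_STREAMFLAGS_NOPERSIST
            | AUDCLNT_STREAMFLAGS_AUTOCONVERTPCM
            | AUDCLNT_STREAMFLAGS_SRC_DEFAULT_QUALITY; // better resampler quality

hr = pAudioClient->Initialize(AUDCLNT_SHAREMODE_SHARED, flags, BufferDuration, 0, &format, NULL);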

Doing WASAPI like this, with a separate thread that waits on an event signaling when you need to provide more buffer data, is a good way to do it.

My comment about bad usage was about trying to adapt it to the DirectSound way of preparing more data for the next frame - that is not a good way of using WASAPI.

Btw, we recently had a discussion about using WASAPI with multithreading on Discord: https://discord.com/channels/239737791225790464/981439725275611146 (invite link at the top of the page).



Thanks for the clarification, Martins! I didn't know about AUDCLNT_STREAMFLAGS_AUTOCONVERTPCM; that's definitely something I will look into. I'll also check out the thread you mentioned on Discord.