// sleepy_discord/voice_connection.cpp
//
// Source code

#include "voice_connection.h"
#ifdef SLEEPY_VOICE_ENABLED
#include <sodium.h>
#include "client.h"

namespace SleepyDiscord {
    /*
        Stores the client pointer and the voice context, builds the UDP client
        from the origin client, and starts the numeric members at zero. When opus
        is available the codec handles start out as null.
    */
    VoiceConnection::VoiceConnection(BaseDiscordClient* client, VoiceContext& _context)
        : origin(client)
        , context(_context)
        , UDP(*origin)
        , sSRC(0)
        , port(0)
        , nextTime(0)
#ifndef NONEXISTENT_OPUS
        , encoder(nullptr)
        , decoder(nullptr)
#endif
        , secretKey()
    {
    }

    void VoiceConnection::disconnect() {
        stopSpeaking();
        std::string update;
        /*The number 103 comes from the number of letters in this string + 1:
        {"op":4,"d":{"guild_id":"18446744073709551615","channel_id":null,"self_mute":false,"self_deaf":false}}
        */
        update.reserve(103);
        update +=
            "{"
                "\"op\":4,"
                "\"d\":{"
                    "\"guild_id\":\""; update += context.serverID; update += "\","
                    "\"channel_id\":null,"
                    "\"self_mute\":false,"
                    "\"self_deaf\":false"
                "}"
            "}";
        origin->send(update, origin->connection);

        State oldState = state;
        state = static_cast<State>(state & ~State::CONNECTED);

        if (oldState & State::CONNECTED)
            origin->disconnect(1000, "", connection);
        if (heart.isValid())
            heart.stop(); //Kill
        speechTimer.stop();
        listenTimer.stop();
        //deal with raw pointers
        //Sorry about this c code, we are dealing with c libraries
#ifndef NONEXISTENT_OPUS
        if (encoder != nullptr) {
            opus_encoder_destroy(encoder);
            encoder = nullptr;
        }
        if (decoder != nullptr) {
            opus_decoder_destroy(decoder);
            decoder = nullptr;
        }
#endif // !NONEXISTENT_OPUS
    }

    void VoiceConnection::initialize() {
        if (state == NOT_CONNECTED)
            return;

        std::string resume;
        /*The number 77 comes from the number of letters in this string + 1:
        {"op":7,"d":{"server_id":"18446744073709551615","session_id":"","token":""}}
        */
        resume.reserve(77 + context.sessionID.length() + context.token.length());
        resume +=
            "{"
                "\"op\":7," //RESUME
                "\"d\":{"
                    "\"server_id\":\"" ; resume += context.serverID ; resume += "\","
                    "\"session_id\":\""; resume += context.sessionID; resume += "\","
                    "\"token\":\""     ; resume += context.token    ; resume += "\""
                "}"
            "}";
        origin->send(resume, origin->connection);
    }

    void VoiceConnection::processMessage(const std::string &message) {
        //json::Values values = json::getValues(message.c_str(),
        //  { "op", "d" });
        rapidjson::Document values;
        values.Parse(message.c_str(), message.length());

        VoiceOPCode op = static_cast<VoiceOPCode>(json::toInt(values["op"]));
        json::Value& d = values["d"];
        switch (op) {
        case HELLO: {
            heartbeatInterval = d["heartbeat_interval"].GetInt();

            //Don't sent a identity during resumes
            if (state & OPEN)
                break;

            std::string identity;
            /*The number 116 comes from the number of letters in this string + 1:
                {"op": 0,"d": {"server_id": "18446744073709551615",
                "user_id": "18446744073709551615","session_id": "","token": ""}}
            */
            //remember to change the number below when editing identity
            identity.reserve(116 + context.sessionID.length() + context.token.length());
            identity +=
                "{"
                    "\"op\": 0," //VoiceOPCode::IDENTIFY
                    "\"d\": {"
                        "\"server_id\": \"" ; identity += context.serverID ; identity += "\","
                        "\"user_id\": \""   ; identity += origin->getID()  ; identity += "\","
                        "\"session_id\": \""; identity += context.sessionID; identity += "\","
                        "\"token\": \""     ; identity += context.token    ; identity += "\""
                    "}"
                "}";
            origin->send(identity, connection);
            }
            state = static_cast<State>(state | CONNECTED);
            break;
        case READY: {
            //json::Values values = json::getValues(d->c_str(),
            //{ "ssrc", "port" });
            sSRC = d["ssrc"].GetUint();
            port = static_cast<uint16_t>(d["port"].GetUint());
            const json::Value& ipValue = d["ip"];
            std::string ip(ipValue.GetString(), ipValue.GetStringLength());
            //start heartbeating
            heartbeat();
            //connect to UDP
            UDP.connect(ip, port);
            //IP Discovery
            unsigned char packet[70] = { 0 };
            packet[0] = (sSRC >> 24) & 0xff;
            packet[1] = (sSRC >> 16) & 0xff;
            packet[2] = (sSRC >>  8) & 0xff;
            packet[3] = (sSRC      ) & 0xff;
            UDP.send(packet, 70);
            UDP.receive([&](const std::vector<uint8_t>& iPDiscovery) {
                //find start of string. 0x60 is a bitmask that should filter out non-letters
                //the ip is in ascii starting with the 4th byte and is null terminated
                std::vector<uint8_t>::const_iterator iPStart = iPDiscovery.begin() + 4;
                const std::string iPAddress(iPStart, std::find(iPStart, iPDiscovery.end(), 0));
                //send Select Protocol Payload
                std::string protocol;
                /*The number 101 comes from the number of letters in this string + 1:
                    {"op": 1,"d": {"protocol": "udp","data": {
                    "address": "","port": 65535,
                    "mode": "xsalsa20_poly1305"}}}
                */
                protocol.reserve(101 + iPAddress.length());
                protocol +=
                    "{"
                        "\"op\": 1," //VoiceOPCode::SELECT_PROTOCOL
                        "\"d\": {"
                            "\"protocol\": \"udp\","
                            "\"data\": {"
                                "\"address\": \""; protocol += iPAddress           ; protocol += "\","
                                "\"port\": "     ; protocol += std::to_string(port); protocol +=   ","
                                "\"mode\": \"xsalsa20_poly1305\""
                            "}"
                        "}"
                    "}";
                origin->send(protocol, connection);
            });
            }
            state = static_cast<State>(state | State::OPEN);
            break;
        case SESSION_DESCRIPTION: {
            consecutiveReconnectsCount = 0;  //succusful connection
            const json::Value& secretKeyJSON = d["secret_key"];
            json::Array secretKeyJSONArray = secretKeyJSON.GetArray();
            const std::size_t secretKeyJSONArraySize = secretKeyJSONArray.Size();
            for (std::size_t i = 0; i < secretKey.max_size() && i < secretKeyJSONArraySize; ++i) {
                    secretKey[i] = secretKeyJSONArray[i].GetUint() & 0xFF;
            }
            }
            state = static_cast<State>(state | State::AUDIO_ENABLED);
            if (context.eventHandler != nullptr)
                context.eventHandler->onReady(*this);
            break;
        case SPEAKING:
            if (context.eventHandler != nullptr)
                context.eventHandler->onSpeaking(*this);
        case RESUMED:
            consecutiveReconnectsCount = 0;
            heartbeat();
            break;
        case HEARTBEAT_ACK:
            if (context.eventHandler != nullptr)
                context.eventHandler->onHeartbeatAck(*this);
            break;
        default:
            break;
        }
    }

    void VoiceConnection::processCloseCode(const int16_t code) {
        State oldState = state;
        state = static_cast<State>(state & ~State::CONNECTED);

        switch (code) {
        case 1000: //normal closure
        case 1001:
        case VOICE_SERVER_CRASHED:
            if (oldState & State::CONNECTED)
                disconnect();
            getDiscordClient().removeVoiceConnectionAndContext(*this);
            return;
        default: break;
        }

        if (heart.isValid())
            heart.stop(); //Kill

        if (reconnectTimer.isValid()) //overwrite reconnect timer
            reconnectTimer.stop();
        reconnectTimer = origin->schedule([this]() {
            origin->connect(getWebSocketURI(context.endpoint), this, connection);
        }, getRetryDelay());
        ++consecutiveReconnectsCount;
    }

    /*
        Sends a heartbeat (op 3) over the voice websocket, notifies the event
        handler and schedules the next heartbeat after heartbeatInterval.
        The nonce is the current epoch time in milliseconds masked with bitMask52.
        Does nothing when the connected bit is not set.
    */
    void VoiceConnection::heartbeat() {
        //don't continue if not connected
        if (!(state & CONNECTED))
            return;

        //timestamp int
        const uint64_t bitMask52 = 0x1FFFFFFFFFFFFF;
        //keep all 64 bits here: a narrower cast would throw away most of the
        //timestamp before the mask is applied
        const uint64_t currentTime = static_cast<uint64_t>(origin->getEpochTimeMillisecond());
        const std::string nonce = std::to_string(bitMask52 & currentTime);
        /*The number 17 comes from the number of letters in this string + 1:
        {"op": 3, "d": }
        */
        std::string heartbeat;
        heartbeat.reserve(17 + nonce.length());
        heartbeat +=
            "{"
                "\"op\": 3, "
                "\"d\": "; heartbeat += nonce; heartbeat +=
            '}';
        origin->send(heartbeat, connection);

        if (context.eventHandler != nullptr)
            context.eventHandler->onHeartbeat(*this);

        heart = origin->schedule([this]() {
            this->heartbeat();
        }, heartbeatInterval);
    }

    /*
        Moves the timer's target time forward by interval and schedules code to
        run when that time is reached. A target time that has already passed is
        scheduled with a delay of 0.
    */
    inline void VoiceConnection::scheduleNextTime(AudioTimer& timer, TimedTask code, const time_t interval) {
        timer.nextTime += interval;

        time_t waitTime = timer.nextTime - origin->getEpochTimeMillisecond();
        if (waitTime < 0)
            waitTime = 0;

        timer.timer = origin->schedule(code, waitTime);
    }

    /*
        Begins sending audio from audioSource. Requires every bit of State::ABLE
        to be set. Initializes sodium, creates the opus encoder when the source
        is not already opus encoded and no encoder is available, then announces
        speaking and sends the first chunk of audio.
    */
    void VoiceConnection::startSpeaking() {
        const bool isAble = (state & State::ABLE) == State::ABLE;
        if (!isAble)
            return;

        //sodium has to be initialized before anything else from it is used
        if (sodium_init() < 0)
            return;

        if (!audioSource->isOpusEncoded()) {
#if defined(NONEXISTENT_OPUS)
            //audio that isn't opus encoded can't be sent without opus
            return;
#else
            const bool needsEncoder = !(state & CAN_ENCODE) || encoder == nullptr;
            if (needsEncoder) {
                int opusStatus = 0;
                encoder = opus_encoder_create(
                    /*Sampling rate(Hz)*/AudioTransmissionDetails::bitrate(),
                    /*Channels*/         AudioTransmissionDetails::channels(),
                    /*Mode*/             OPUS_APPLICATION_VOIP,
                    &opusStatus);
                if (opusStatus != 0)
                    return;
                state = static_cast<State>(state | State::CAN_ENCODE);
            }
#endif
        }

        //tell the server we are speaking, then start the send loop from now
        sendSpeaking(true);
        state = static_cast<State>(state | State::SENDING_AUDIO);
        speechTimer.nextTime = origin->getEpochTimeMillisecond();
        speak();
    }

    void VoiceConnection::sendSpeaking(bool isNowSpeaking) {
        std::string ssrc = std::to_string(sSRC);
        /*The number 49 comes from 1 plus the length of this string
            {"op":5,"d":{"speaking":false,"delay":0,"ssrc":}}
        */
        std::string speaking;
        BasicAudioSourceForContainers::SpeakingFlag speakingFlag =
            isNowSpeaking ? audioSource->speakingFlag :
            static_cast< BasicAudioSourceForContainers::SpeakingFlag>(0);
        speaking.reserve(49 + ssrc.length());
        speaking +=
            "{"
                "\"op\":5,"
                "\"d\":{"
                    "\"speaking\":"; speaking += json::integer(speakingFlag); speaking += ","
                    "\"delay\":0,"
                    "\"ssrc\":"; speaking += ssrc; speaking +=
                "}"
            "}";
        origin->send(speaking, connection);
    }

    /*
        Sends one chunk of audio from audioSource and schedules the next call.
        When the SENDING_AUDIO bit is cleared while sending, the server is told
        that speaking has stopped, the event handler (if any) is notified, and no
        further call is scheduled.
    */
    void VoiceConnection::speak() {
        //check that we can still send audio data
        if ((state & State::ABLE) != State::ABLE)
            return;

        AudioTransmissionDetails details(context, samplesSentLastTime);

        std::size_t length = 0;

        //send the audio data
        if (audioSource->type == AUDIO_CONTAINER) {
            auto audioVectorSource = &static_cast<BasicAudioSourceForContainers&>(*audioSource);
            audioVectorSource->speak(*this, details, length);
        } else {
            AudioSample* audioBuffer = nullptr;
            audioSource->read(details, audioBuffer, length);
            speak(audioBuffer, length);
        }

        if ((state & SENDING_AUDIO) == 0) {
            sendSpeaking(false);
            //the handler is optional, same check as every other handler call
            if (context.eventHandler != nullptr)
                context.eventHandler->onEndSpeaking(*this);
            return;
        }

        //schedule next send: length / (bitrate * channels) seconds, in milliseconds
        const time_t interval = static_cast<time_t>(
            (static_cast<float>(length) / static_cast<float>(
                AudioTransmissionDetails::bitrate() * AudioTransmissionDetails::channels()
            )) * 1000.0f
        );

        scheduleNextTime(speechTimer,
            [this]() {
                this->speak();
            }, interval
        );
    }

    /*
        Sends length samples starting at audioData. Audio that is not already
        opus encoded is encoded first. A length of 0 stops speaking.
        This is only called from speak(), which has already checked that audio
        can be sent.
    */
    void VoiceConnection::speak(AudioSample*& audioData, const std::size_t & length)  {
        samplesSentLastTime = 0;

        //stop sending data when there's no data
        if (length == 0) {
            stopSpeaking();
            return;
        }
        if ((state & SENDING_AUDIO) == 0)
            return;

        //the >>1 cuts it in half since you are using 2 channels
        const std::size_t frameSize = length >> 1;

        if (audioSource->isOpusEncoded()) {
            //encoded data should be in uint8
            sendAudioData(reinterpret_cast<uint8_t*&>(audioData), length, frameSize);
            return;
        }

#if !defined(NONEXISTENT_OPUS)
        //encode data
        constexpr opus_int32 encodedAudioMaxLength =
            static_cast<opus_int32>(AudioTransmissionDetails::proposedLength());
        unsigned char encodedAudioData[encodedAudioMaxLength];
        const opus_int32 encodedAudioLength = opus_encode(
            encoder, audioData, static_cast<int>(frameSize),
            encodedAudioData, encodedAudioMaxLength);
        //opus_encode reports failure with a negative code; passed on as a
        //size_t it would turn into an enormous packet length, so drop the frame
        if (encodedAudioLength < 0)
            return;
        //send it
        uint8_t * encodedAudioDataPointer = encodedAudioData;
        sendAudioData(encodedAudioDataPointer,
            static_cast<std::size_t>(encodedAudioLength), frameSize);
#endif
    }

    /*
        Encrypts length bytes of encoded audio and sends them over UDP.
        The packet is a 12 byte header (0x80, 0x78, sequence, timestamp and ssrc,
        each in big endian) followed by the output of crypto_secretbox_easy.
        The nonce is the header padded with zeros up to nonceSize.
        Afterwards samplesSentLastTime and timestamp are updated from frameSize.
    */
    void VoiceConnection::sendAudioData(
        uint8_t*& encodedAudioData,
        const std::size_t & length,
        const std::size_t & frameSize
    ) {
#ifdef NONEXISTENT_SODIUM
    #error Can not use voice without libsodium, libsodium not detected.
#else
        ++sequence;

        constexpr int headerSize = 12;
        const uint8_t rtpHeader[headerSize] = {
            0x80,
            0x78,
            static_cast<uint8_t>((sequence  >>  8) & 0xff),
            static_cast<uint8_t>( sequence         & 0xff),
            static_cast<uint8_t>((timestamp >> 24) & 0xff),
            static_cast<uint8_t>((timestamp >> 16) & 0xff),
            static_cast<uint8_t>((timestamp >>  8) & 0xff),
            static_cast<uint8_t>( timestamp        & 0xff),
            static_cast<uint8_t>((sSRC      >> 24) & 0xff),
            static_cast<uint8_t>((sSRC      >> 16) & 0xff),
            static_cast<uint8_t>((sSRC      >>  8) & 0xff),
            static_cast<uint8_t>( sSRC             & 0xff),
        };

        //nonce: the header first, every remaining byte is zero
        uint8_t nonce[nonceSize] = { 0 };
        std::memcpy(nonce, rtpHeader, sizeof rtpHeader);

        //packet: header, then the encrypted audio with its authentication tag
        std::vector<uint8_t> packet(sizeof rtpHeader + length + crypto_secretbox_MACBYTES);
        std::memcpy(packet.data(), rtpHeader, sizeof rtpHeader);
        crypto_secretbox_easy(packet.data() + sizeof rtpHeader,
            encodedAudioData, length, nonce, secretKey.data());

        UDP.send(packet.data(), packet.size());

        samplesSentLastTime = frameSize << 1;
        timestamp += static_cast<uint32_t>(frameSize);
#endif
    }

    //To do test this
    void VoiceConnection::startListening() {
        if (!(state & CAN_DECODE) || decoder == nullptr) {
            int opusError = 0;
            decoder = opus_decoder_create(
                /*Sampling rate(Hz)*/AudioTransmissionDetails::bitrate(),
                /*Channels*/         AudioTransmissionDetails::channels(),
                &opusError);
            if (opusError) {//error check
                return;
            }
        }
        listen();
    }

    /*
        Asks the UDP client for incoming data, which is passed on to
        processIncomingAudio, and schedules the next call to listen after
        AudioTransmissionDetails::proposedLengthOfTime().
    */
    void VoiceConnection::listen() {
        const auto handlePacket = [this](const std::vector<uint8_t>& packet) {
            processIncomingAudio(packet);
        };
        UDP.receive(handlePacket);

        const auto listenAgain = [this]() {
            listen();
        };
        scheduleNextTime(listenTimer, listenAgain,
            AudioTransmissionDetails::proposedLengthOfTime());
    }

    void VoiceConnection::processIncomingAudio(const std::vector<uint8_t>& data)
    {
#if !defined(NONEXISTENT_SODIUM) || !defined(NONEXISTENT_OPUS)
        //get nonce
        uint8_t nonce[nonceSize];
        std::memcpy(nonce, data.data(), sizeof nonce);
        //decrypt
        std::vector<uint8_t> decryptedData;
        const std::size_t decryptedDataSize = data.size() - sizeof nonce;
        decryptedData.reserve(decryptedDataSize);
        bool isForged = crypto_secretbox_open_easy(
            decryptedData.data(),
            data.data() + sizeof nonce,
            decryptedDataSize, nonce, secretKey.data()
        ) != 0;
        if (isForged)
            return;
        //decode
        constexpr opus_int32 frameSize = 
            static_cast<opus_int32>(AudioTransmissionDetails::proposedLength());
        BaseAudioOutput::Container decodedAudioData;
        opus_int32 decodedAudioLength = opus_decode(
            decoder, decryptedData.data(), static_cast<int>(decryptedData.size()),
            decodedAudioData.data(), frameSize, 1);
        if(decodedAudioLength < OPUS_OK || !hasAudioOutput())
            return;
        AudioTransmissionDetails details(context, 0);
        audioOutput->write(decodedAudioData, details);
#endif
    }
}
#else
void SleepyDiscord::VoiceConnection::initialize() {}
void SleepyDiscord::VoiceConnection::processMessage(const std::string &/*message*/) {}
void SleepyDiscord::VoiceConnection::processCloseCode(const int16_t /*code*/) {}
#endif
// Updated on 13 April 2022 at 18:39:59 UTC
// Source code
//
// Source code