#include "analyze_rms.h" | #include "analyze_rms.h" | ||||
#include <arm_math.h> | |||||
#include "utility/dspinst.h" | |||||
void AudioAnalyzeRMS::update(void) | void AudioAnalyzeRMS::update(void) | ||||
{ | { | ||||
audio_block_t *block; | |||||
int16_t rmsResult; | |||||
block = receiveReadOnly(); | |||||
if (!block) { | |||||
return; | |||||
} | |||||
// not reinventing the wheel: | |||||
// use DSP packed 32i instructions as found in arm_math.h, with 64b accumulator | |||||
arm_rms_q15(block->data, AUDIO_BLOCK_SAMPLES, &rmsResult); // seems to use ~2% CPU | |||||
lastRMS = rmsResult; // prevent threading issues | |||||
// for optimization, one could re-implement arm_rms_q15 to do the sqrt on read(). | |||||
// This way, the rms in dB could also be implemented faster: | |||||
// Instead of 20*log10(sqrt(MSerror)), one could do write 10*log10(MSerror) | |||||
new_output = true; | |||||
audio_block_t *block = receiveReadOnly(); | |||||
if (!block) return; | |||||
#if 1 | |||||
uint32_t *p = (uint32_t *)(block->data); | |||||
uint32_t *end = p + AUDIO_BLOCK_SAMPLES/2; | |||||
int64_t sum = accum; | |||||
do { | |||||
uint32_t n1 = *p++; | |||||
uint32_t n2 = *p++; | |||||
uint32_t n3 = *p++; | |||||
uint32_t n4 = *p++; | |||||
sum = multiply_accumulate_16tx16t_add_16bx16b(sum, n1, n1); | |||||
sum = multiply_accumulate_16tx16t_add_16bx16b(sum, n2, n2); | |||||
sum = multiply_accumulate_16tx16t_add_16bx16b(sum, n3, n3); | |||||
sum = multiply_accumulate_16tx16t_add_16bx16b(sum, n4, n4); | |||||
} while (p < end); | |||||
accum = sum; | |||||
count++; | |||||
#else | |||||
int16_t *p = block->data; | |||||
int16_t *end = p + AUDIO_BLOCK_SAMPLES; | |||||
int64_t sum = accum; | |||||
do { | |||||
int32_t n = *p++; | |||||
sum += n * n; | |||||
} while (p < end); | |||||
accum = sum; | |||||
count++; | |||||
#endif | |||||
release(block); | release(block); | ||||
} | } | ||||
float AudioAnalyzeRMS::read(void) | |||||
{ | |||||
__disable_irq(); | |||||
int64_t sum = accum; | |||||
accum = 0; | |||||
uint32_t num = count; | |||||
count = 0; | |||||
__enable_irq(); | |||||
float meansq = sum / (num * AUDIO_BLOCK_SAMPLES); | |||||
// TODO: shift down to 32 bits and use sqrt_uint32 | |||||
// but is that really any more efficient? | |||||
return sqrtf(meansq) / 32767.0; | |||||
} | |||||
{ | { | ||||
private: | private: | ||||
audio_block_t *inputQueueArray[1]; | audio_block_t *inputQueueArray[1]; | ||||
volatile bool new_output; | |||||
int16_t lastRMS; | |||||
int64_t accum; | |||||
uint32_t count; | |||||
public: | public: | ||||
AudioAnalyzeRMS(void) : AudioStream(1, inputQueueArray) { | AudioAnalyzeRMS(void) : AudioStream(1, inputQueueArray) { | ||||
lastRMS = 0; | |||||
accum = 0; | |||||
count = 0; | |||||
} | } | ||||
bool available(void) { | bool available(void) { | ||||
__disable_irq(); | |||||
bool flag = new_output; // we don't reset new_output here, because if you don't read it, | |||||
//it'll still be available on the next call of available() | |||||
// (different from AnalyzePeak behavior, which resets it in available()) | |||||
__enable_irq(); | |||||
return flag; | |||||
} | |||||
float read(void) { | |||||
__disable_irq(); | |||||
int rms = lastRMS; | |||||
new_output = false; // we can always set the new_output to false, even if it was false already | |||||
__enable_irq(); | |||||
return rms / 32767.0; | |||||
return count > 0; | |||||
} | } | ||||
float read(void); | |||||
virtual void update(void); | virtual void update(void); | ||||
}; | }; | ||||
return out; | return out; | ||||
} | } | ||||
// computes sum += ((a[15:0] * b[15:0]) + (a[31:16] * b[31:16]))
// Each 16-bit half is treated as a signed value and the two products are
// added to the signed 64-bit accumulator `sum`, which is returned.
static inline int64_t multiply_accumulate_16tx16t_add_16bx16b(int64_t sum, uint32_t a, uint32_t b)
{
#if defined(__ARM_FEATURE_DSP) || defined(__ARM_ARCH_7EM__)
	// SMLALD does both multiplies and the 64-bit accumulate in one instruction;
	// %Q0 / %R0 select the low / high register of the 64-bit operand.
	asm volatile("smlald %Q0, %R0, %1, %2" : "+r" (sum) : "r" (a), "r" (b));
	return sum;
#else
	// Portable equivalent for targets without the DSP extension.
	const int32_t a_lo = static_cast<int16_t>(a & 0xFFFFu);
	const int32_t a_hi = static_cast<int16_t>(a >> 16);
	const int32_t b_lo = static_cast<int16_t>(b & 0xFFFFu);
	const int32_t b_hi = static_cast<int16_t>(b >> 16);
	// Each product fits in 32 bits, but their sum can reach 2^31, so the
	// addition is carried out in 64 bits.
	return sum + static_cast<int64_t>(a_lo * b_lo) + static_cast<int64_t>(a_hi * b_hi);
#endif
}
// computes sum += ((a[15:0] * b[31:16]) + (a[31:16] * b[15:0]))
// Each 16-bit half is treated as a signed value; the halves of `b` are
// crossed over before multiplying, and both products are added to the signed
// 64-bit accumulator `sum`, which is returned.
static inline int64_t multiply_accumulate_16tx16b_add_16bx16t(int64_t sum, uint32_t a, uint32_t b)
{
#if defined(__ARM_FEATURE_DSP) || defined(__ARM_ARCH_7EM__)
	// SMLALDX is SMLALD with the halfwords of the second operand exchanged;
	// %Q0 / %R0 select the low / high register of the 64-bit operand.
	asm volatile("smlaldx %Q0, %R0, %1, %2" : "+r" (sum) : "r" (a), "r" (b));
	return sum;
#else
	// Portable equivalent for targets without the DSP extension.
	const int32_t a_lo = static_cast<int16_t>(a & 0xFFFFu);
	const int32_t a_hi = static_cast<int16_t>(a >> 16);
	const int32_t b_lo = static_cast<int16_t>(b & 0xFFFFu);
	const int32_t b_hi = static_cast<int16_t>(b >> 16);
	// Each product fits in 32 bits, but their sum can reach 2^31, so the
	// addition is carried out in 64 bits.
	return sum + static_cast<int64_t>(a_lo * b_hi) + static_cast<int64_t>(a_hi * b_lo);
#endif
}
// computes ((a[15:0] * b[15:0]) | // computes ((a[15:0] * b[15:0]) | ||||
static inline int32_t multiply_16bx16b(uint32_t a, uint32_t b) __attribute__((always_inline, unused)); | static inline int32_t multiply_16bx16b(uint32_t a, uint32_t b) __attribute__((always_inline, unused)); | ||||
static inline int32_t multiply_16bx16b(uint32_t a, uint32_t b) | static inline int32_t multiply_16bx16b(uint32_t a, uint32_t b) |