cras/src/dsp/dsp_util.c - platform/external/adhd - Git at Google

 /* Copyright 2013 The Chromium OS Authors. All rights reserved.
  * Use of this source code is governed by a BSD-style license that can be
  * found in the LICENSE file.
  */

 #include <limits.h>
 #include <syslog.h>

 #include "dsp_util.h"

 #ifndef max
 #define max(a, b)                                                              \
 	({                                                                     \
 		__typeof__(a) _a = (a);                                        \
 		__typeof__(b) _b = (b);                                        \
 		_a > _b ? _a : _b;                                             \
 	})
 #endif

 #ifndef min
 #define min(a, b)                                                              \
 	({                                                                     \
 		__typeof__(a) _a = (a);                                        \
 		__typeof__(b) _b = (b);                                        \
 		_a < _b ? _a : _b;                                             \
 	})
 #endif

 #undef deinterleave_stereo
 #undef interleave_stereo

 /* Converts shorts in range of -32768 to 32767 to floats in range of
  * -1.0f to 1.0f.
  * scvtf instruction accepts fixed point ints, so sxtl is used to lengthen
  * shorts to int with sign extension.
  */
 #ifdef __aarch64__
 static void deinterleave_stereo(int16_t *input, float *output1, float *output2,
 				int frames)
 {
 	int chunk = frames >> 3;
 	frames &= 7;
 	/* Process 8 frames (16 samples) each loop. */
 	/* L0 R0 L1 R1 L2 R2 L3 R3... -> L0 L1 L2 L3... R0 R1 R2 R3... */
 	if (chunk) {
 		// clang-format off
 		__asm__ __volatile__(
 			"1:                                         \n"
 			"ld2  {v2.8h, v3.8h}, [%[input]], #32       \n"
 			"subs %w[chunk], %w[chunk], #1              \n"
 			"sxtl   v0.4s, v2.4h                        \n"
 			"sxtl2  v1.4s, v2.8h                        \n"
 			"sxtl   v2.4s, v3.4h                        \n"
 			"sxtl2  v3.4s, v3.8h                        \n"
 			"scvtf  v0.4s, v0.4s, #15                   \n"
 			"scvtf  v1.4s, v1.4s, #15                   \n"
 			"scvtf  v2.4s, v2.4s, #15                   \n"
 			"scvtf  v3.4s, v3.4s, #15                   \n"
 			"st1    {v0.4s, v1.4s}, [%[output1]], #32   \n"
 			"st1    {v2.4s, v3.4s}, [%[output2]], #32   \n"
 			"b.ne   1b                                  \n"
 			: /* output */
 			  [chunk]"+r"(chunk),
 			  [input]"+r"(input),
 			  [output1]"+r"(output1),
 			  [output2]"+r"(output2)
 			: /* input */
 			: /* clobber */
 			  "v0", "v1", "v2", "v3", "memory", "cc");
 		// clang-format on
 	}

 	/* The remaining samples. */
 	while (frames--) {
 		*output1++ = *input++ / 32768.0f;
 		*output2++ = *input++ / 32768.0f;
 	}
 }
 #define deinterleave_stereo deinterleave_stereo

 /* Converts floats in range of -1.0f to 1.0f to shorts in range of
  * -32768 to 32767 with rounding to nearest, with ties (0.5) rounding away
  * from zero.
  * Rounding is achieved by using fcvtas instruction. (a = away)
  * The float scaled to a range of -32768 to 32767 by adding 15 to the exponent.
  * Add to exponent is equivalent to multiply for exponent range of 0 to 239,
  * which is 2.59 * 10^33.  A signed saturating add (sqadd) limits exponents
  * from 240 to 255 to clamp to 255.
  * For very large values, beyond +/- 2 billion, fcvtas will clamp the result
  * to the min or max value that fits an int.
  * For other values, sqxtn clamps the output to -32768 to 32767 range.
  */
 static void interleave_stereo(float *input1, float *input2, int16_t *output,
 			      int frames)
 {
 	/* Process 4 frames (8 samples) each loop. */
 	/* L0 L1 L2 L3, R0 R1 R2 R3 -> L0 R0 L1 R1, L2 R2 L3 R3 */
 	int chunk = frames >> 2;
 	frames &= 3;

 	if (chunk) {
 		// clang-format off
 		__asm__ __volatile__(
 			"dup    v2.4s, %w[scale]                    \n"
 			"1:                                         \n"
 			"ld1    {v0.4s}, [%[input1]], #16           \n"
 			"ld1    {v1.4s}, [%[input2]], #16           \n"
 			"subs   %w[chunk], %w[chunk], #1            \n"
 			"sqadd  v0.4s, v0.4s, v2.4s                 \n"
 			"sqadd  v1.4s, v1.4s, v2.4s                 \n"
 			"fcvtas v0.4s, v0.4s                        \n"
 			"fcvtas v1.4s, v1.4s                        \n"
 			"sqxtn  v0.4h, v0.4s                        \n"
 			"sqxtn  v1.4h, v1.4s                        \n"
 			"st2    {v0.4h, v1.4h}, [%[output]], #16    \n"
 			"b.ne   1b                                  \n"
 			: /* output */
 			  [chunk]"+r"(chunk),
 			  [input1]"+r"(input1),
 			  [input2]"+r"(input2),
 			  [output]"+r"(output)
 			: /* input */
 			  [scale]"r"(15 << 23)
 			: /* clobber */
 			  "v0", "v1", "v2", "memory", "cc");
 		// clang-format on
 	}

 	/* The remaining samples */
 	while (frames--) {
 		float f;
 		f = *input1++ * 32768.0f;
 		f += (f >= 0) ? 0.5f : -0.5f;
 		*output++ = max(-32768, min(32767, (int)(f)));
 		f = *input2++ * 32768.0f;
 		f += (f >= 0) ? 0.5f : -0.5f;
 		*output++ = max(-32768, min(32767, (int)(f)));
 	}
 }
 #define interleave_stereo interleave_stereo
 #endif

 #ifdef __ARM_NEON__
 #include <arm_neon.h>

 static void deinterleave_stereo(int16_t *input, float *output1, float *output2,
 				int frames)
 {
 	/* Process 8 frames (16 samples) each loop. */
 	/* L0 R0 L1 R1 L2 R2 L3 R3... -> L0 L1 L2 L3... R0 R1 R2 R3... */
 	int chunk = frames >> 3;
 	frames &= 7;
 	if (chunk) {
 		// clang-format off
 		__asm__ __volatile__(
 			"1:					    \n"
 			"vld2.16 {d0-d3}, [%[input]]!		    \n"
 			"subs %[chunk], #1			    \n"
 			"vmovl.s16 q3, d3			    \n"
 			"vmovl.s16 q2, d2			    \n"
 			"vmovl.s16 q1, d1			    \n"
 			"vmovl.s16 q0, d0			    \n"
 			"vcvt.f32.s32 q3, q3, #15		    \n"
 			"vcvt.f32.s32 q2, q2, #15		    \n"
 			"vcvt.f32.s32 q1, q1, #15		    \n"
 			"vcvt.f32.s32 q0, q0, #15		    \n"
 			"vst1.32 {d4-d7}, [%[output2]]!		    \n"
 			"vst1.32 {d0-d3}, [%[output1]]!		    \n"
 			"bne 1b					    \n"
 			: /* output */
 			  [chunk]"+r"(chunk),
 			  [input]"+r"(input),
 			  [output1]"+r"(output1),
 			  [output2]"+r"(output2)
 			: /* input */
 			: /* clobber */
 			  "q0", "q1", "q2", "q3", "memory", "cc");
 		// clang-format on
 	}

 	/* The remaining samples. */
 	while (frames--) {
 		*output1++ = *input++ / 32768.0f;
 		*output2++ = *input++ / 32768.0f;
 	}
 }
 #define deinterleave_stereo deinterleave_stereo

 /* Converts floats in range of -1.0f to 1.0f to shorts in range of
  * -32768 to 32767 with rounding to nearest, with ties (0.5) rounding away
  * from zero.
  * Rounding is achieved by adding 0.5 or -0.5 adjusted for fixed point
  * precision, and then converting float to fixed point using vcvt instruction
  * which truncated toward zero.
  * For very large values, beyond +/- 2 billion, vcvt will clamp the result
  * to the min or max value that fits an int.
  * For other values, vqmovn clamps the output to -32768 to 32767 range.
  */
 static void interleave_stereo(float *input1, float *input2, int16_t *output,
 			      int frames)
 {
 	/* Process 4 frames (8 samples) each loop. */
 	/* L0 L1 L2 L3, R0 R1 R2 R3 -> L0 R0 L1 R1, L2 R2 L3 R3 */
 	float32x4_t pos = vdupq_n_f32(0.5f / 32768.0f);
 	float32x4_t neg = vdupq_n_f32(-0.5f / 32768.0f);
 	int chunk = frames >> 2;
 	frames &= 3;

 	if (chunk) {
 		// clang-format off
 		__asm__ __volatile__(
 			"veor q0, q0, q0			    \n"
 			"1:					    \n"
 			"vld1.32 {d2-d3}, [%[input1]]!		    \n"
 			"vld1.32 {d4-d5}, [%[input2]]!		    \n"
 			"subs %[chunk], #1			    \n"
 			/* We try to round to the nearest number by adding 0.5
 			 * to positive input, and adding -0.5 to the negative
 			 * input, then truncate.
 			 */
 			"vcgt.f32 q3, q1, q0			    \n"
 			"vcgt.f32 q4, q2, q0			    \n"
 			"vbsl q3, %q[pos], %q[neg]		    \n"
 			"vbsl q4, %q[pos], %q[neg]		    \n"
 			"vadd.f32 q1, q1, q3			    \n"
 			"vadd.f32 q2, q2, q4			    \n"
 			"vcvt.s32.f32 q1, q1, #15		    \n"
 			"vcvt.s32.f32 q2, q2, #15		    \n"
 			"vqmovn.s32 d2, q1			    \n"
 			"vqmovn.s32 d3, q2			    \n"
 			"vst2.16 {d2-d3}, [%[output]]!		    \n"
 			"bne 1b					    \n"
 			: /* output */
 			  [chunk]"+r"(chunk),
 			  [input1]"+r"(input1),
 			  [input2]"+r"(input2),
 			  [output]"+r"(output)
 			: /* input */
 			  [pos]"w"(pos),
 			  [neg]"w"(neg)
 			: /* clobber */
 			  "q0", "q1", "q2", "q3", "q4", "memory", "cc");
 		// clang-format on
 	}

 	/* The remaining samples */
 	while (frames--) {
 		float f;
 		f = *input1++ * 32768.0f;
 		f += (f >= 0) ? 0.5f : -0.5f;
 		*output++ = max(-32768, min(32767, (int)(f)));
 		f = *input2++ * 32768.0f;
 		f += (f >= 0) ? 0.5f : -0.5f;
 		*output++ = max(-32768, min(32767, (int)(f)));
 	}
 }
 #define interleave_stereo interleave_stereo
 #endif

 #ifdef __SSE3__
 #include <emmintrin.h>

 /* Converts shorts in range of -32768 to 32767 to floats in range of
  * -1.0f to 1.0f.
  * pslld and psrad shifts are used to isolate the low and high word, but
  * each in a different range:
  * The low word is shifted to the high bits in range 0x80000000 .. 0x7fff0000.
  * The high word is shifted to the low bits in range 0x00008000 .. 0x00007fff.
  * cvtdq2ps converts ints to floats as is.
  * mulps is used to normalize the range of the low and high words, adjusting
  * for high and low words being in different range.
  */
 static void deinterleave_stereo(int16_t *input, float *output1, float *output2,
 				int frames)
 {
 	/* Process 8 frames (16 samples) each loop. */
 	/* L0 R0 L1 R1 L2 R2 L3 R3... -> L0 L1 L2 L3... R0 R1 R2 R3... */
 	int chunk = frames >> 3;
 	frames &= 7;
 	if (chunk) {
 		// clang-format off
 		__asm__ __volatile__(
 			"1:                                         \n"
 			"lddqu (%[input]), %%xmm0                   \n"
 			"lddqu 16(%[input]), %%xmm1                 \n"
 			"add $32, %[input]                          \n"
 			"movdqa %%xmm0, %%xmm2                      \n"
 			"movdqa %%xmm1, %%xmm3                      \n"
 			"pslld $16, %%xmm0                          \n"
 			"pslld $16, %%xmm1                          \n"
 			"psrad $16, %%xmm2                          \n"
 			"psrad $16, %%xmm3                          \n"
 			"cvtdq2ps %%xmm0, %%xmm0                    \n"
 			"cvtdq2ps %%xmm1, %%xmm1                    \n"
 			"cvtdq2ps %%xmm2, %%xmm2                    \n"
 			"cvtdq2ps %%xmm3, %%xmm3                    \n"
 			"mulps %[scale_2_n31], %%xmm0               \n"
 			"mulps %[scale_2_n31], %%xmm1               \n"
 			"mulps %[scale_2_n15], %%xmm2               \n"
 			"mulps %[scale_2_n15], %%xmm3               \n"
 			"movdqu %%xmm0, (%[output1])                \n"
 			"movdqu %%xmm1, 16(%[output1])              \n"
 			"movdqu %%xmm2, (%[output2])                \n"
 			"movdqu %%xmm3, 16(%[output2])              \n"
 			"add $32, %[output1]                        \n"
 			"add $32, %[output2]                        \n"
 			"sub $1, %[chunk]                           \n"
 			"jnz 1b                                     \n"
 			: /* output */
 			  [chunk]"+r"(chunk),
 			  [input]"+r"(input),
 			  [output1]"+r"(output1),
 			  [output2]"+r"(output2)
 			: /* input */
 			  [scale_2_n31]"x"(_mm_set1_ps(1.0f/(1<<15)/(1<<16))),
 			  [scale_2_n15]"x"(_mm_set1_ps(1.0f/(1<<15)))
 			: /* clobber */
 			  "xmm0", "xmm1", "xmm2", "xmm3", "memory", "cc");
 		// clang-format on
 	}

 	/* The remaining samples. */
 	while (frames--) {
 		*output1++ = *input++ / 32768.0f;
 		*output2++ = *input++ / 32768.0f;
 	}
 }
 #define deinterleave_stereo deinterleave_stereo

 /* Converts floats in range of -1.0f to 1.0f to shorts in range of
  * -32768 to 32767 with rounding to nearest, with ties (0.5) rounding to
  * even.
  * For very large values, beyond +/- 2 billion, cvtps2dq will produce
  * 0x80000000 and packssdw will clamp -32768.
  */
 static void interleave_stereo(float *input1, float *input2, int16_t *output,
 			      int frames)
 {
 	/* Process 4 frames (8 samples) each loop. */
 	/* L0 L1 L2 L3, R0 R1 R2 R3 -> L0 R0 L1 R1, L2 R2 L3 R3 */
 	int chunk = frames >> 2;
 	frames &= 3;

 	if (chunk) {
 		// clang-format off
 		__asm__ __volatile__(
 			"1:                                         \n"
 			"lddqu (%[input1]), %%xmm0                  \n"
 			"lddqu (%[input2]), %%xmm2                  \n"
 			"add $16, %[input1]                         \n"
 			"add $16, %[input2]                         \n"
 			"movaps %%xmm0, %%xmm1                      \n"
 			"unpcklps %%xmm2, %%xmm0                    \n"
 			"unpckhps %%xmm2, %%xmm1                    \n"
 			"paddsw %[scale_2_15], %%xmm0               \n"
 			"paddsw %[scale_2_15], %%xmm1               \n"
 			"cvtps2dq %%xmm0, %%xmm0                    \n"
 			"cvtps2dq %%xmm1, %%xmm1                    \n"
 			"packssdw %%xmm1, %%xmm0                    \n"
 			"movdqu %%xmm0, (%[output])                 \n"
 			"add $16, %[output]                         \n"
 			"sub $1, %[chunk]                           \n"
 			"jnz 1b                                     \n"
 			: /* output */
 			  [chunk]"+r"(chunk),
 			  [input1]"+r"(input1),
 			  [input2]"+r"(input2),
 			  [output]"+r"(output)
 			: /* input */
 			  [scale_2_15]"x"(_mm_set1_epi32(15 << 23)),
 			  [clamp_large]"x"(_mm_set1_ps(32767.0f))
 			: /* clobber */
 			  "xmm0", "xmm1", "xmm2", "memory", "cc");
 		// clang-format on
 	}

 	/* The remaining samples */
 	while (frames--) {
 		float f;
 		f = *input1++ * 32768.0f;
 		f += (f >= 0) ? 0.5f : -0.5f;
 		*output++ = max(-32768, min(32767, (int)(f)));
 		f = *input2++ * 32768.0f;
 		f += (f >= 0) ? 0.5f : -0.5f;
 		*output++ = max(-32768, min(32767, (int)(f)));
 	}
 }
 #define interleave_stereo interleave_stereo
 #endif

 static void dsp_util_deinterleave_s16le(int16_t *input, float *const *output,
 					int channels, int frames)
 {
 	float *output_ptr[channels];
 	int i, j;

 #ifdef deinterleave_stereo
 	if (channels == 2) {
 		deinterleave_stereo(input, output[0], output[1], frames);
 		return;
 	}
 #endif

 	for (i = 0; i < channels; i++)
 		output_ptr[i] = output[i];

 	for (i = 0; i < frames; i++)
 		for (j = 0; j < channels; j++)
 			*(output_ptr[j]++) = *input++ / 32768.0f;
 }

 static void dsp_util_deinterleave_s24le(int32_t *input, float *const *output,
 					int channels, int frames)
 {
 	float *output_ptr[channels];
 	int i, j;

 	for (i = 0; i < channels; i++)
 		output_ptr[i] = output[i];

 	for (i = 0; i < frames; i++)
 		for (j = 0; j < channels; j++, input++)
 			*(output_ptr[j]++) = (*input << 8) / 2147483648.0f;
 }

 static void dsp_util_deinterleave_s243le(uint8_t *input, float *const *output,
 					 int channels, int frames)
 {
 	float *output_ptr[channels];
 	int32_t sample;
 	int i, j;

 	for (i = 0; i < channels; i++)
 		output_ptr[i] = output[i];

 	for (i = 0; i < frames; i++)
 		for (j = 0; j < channels; j++, input += 3) {
 			sample = 0;
 			memcpy((uint8_t *)&sample + 1, input, 3);
 			*(output_ptr[j]++) = sample / 2147483648.0f;
 		}
 }

 static void dsp_util_deinterleave_s32le(int32_t *input, float *const *output,
 					int channels, int frames)
 {
 	float *output_ptr[channels];
 	int i, j;

 	for (i = 0; i < channels; i++)
 		output_ptr[i] = output[i];

 	for (i = 0; i < frames; i++)
 		for (j = 0; j < channels; j++, input++)
 			*(output_ptr[j]++) = *input / 2147483648.0f;
 }

 int dsp_util_deinterleave(uint8_t *input, float *const *output, int channels,
 			  snd_pcm_format_t format, int frames)
 {
 	switch (format) {
 	case SND_PCM_FORMAT_S16_LE:
 		dsp_util_deinterleave_s16le((int16_t *)input, output, channels,
 					    frames);
 		break;
 	case SND_PCM_FORMAT_S24_LE:
 		dsp_util_deinterleave_s24le((int32_t *)input, output, channels,
 					    frames);
 		break;
 	case SND_PCM_FORMAT_S24_3LE:
 		dsp_util_deinterleave_s243le(input, output, channels, frames);
 		break;
 	case SND_PCM_FORMAT_S32_LE:
 		dsp_util_deinterleave_s32le((int32_t *)input, output, channels,
 					    frames);
 		break;
 	default:
 		syslog(LOG_ERR, "Invalid format to deinterleave");
 		return -EINVAL;
 	}
 	return 0;
 }

 static void dsp_util_interleave_s16le(float *const *input, int16_t *output,
 				      int channels, int frames)
 {
 	float *input_ptr[channels];
 	int i, j;

 #ifdef interleave_stereo
 	if (channels == 2) {
 		interleave_stereo(input[0], input[1], output, frames);
 		return;
 	}
 #endif

 	for (i = 0; i < channels; i++)
 		input_ptr[i] = input[i];

 	for (i = 0; i < frames; i++)
 		for (j = 0; j < channels; j++) {
 			float f = *(input_ptr[j]++) * 32768.0f;
 			f += (f >= 0) ? 0.5f : -0.5f;
 			*output++ = max(-32768, min(32767, (int)(f)));
 		}
 }

 static void dsp_util_interleave_s24le(float *const *input, int32_t *output,
 				      int channels, int frames)
 {
 	float *input_ptr[channels];
 	int i, j;

 	for (i = 0; i < channels; i++)
 		input_ptr[i] = input[i];

 	for (i = 0; i < frames; i++)
 		for (j = 0; j < channels; j++, output++) {
 			float f = *(input_ptr[j]++) * 2147483648.0f;
 			f += (f >= 0) ? 0.5f : -0.5f;
 			*output = max((float)INT_MIN, min((float)INT_MAX, f));
 			*output = (*output >> 8) & 0x00ffffff;
 		}
 }

 static void dsp_util_interleave_s243le(float *const *input, uint8_t *output,
 				       int channels, int frames)
 {
 	float *input_ptr[channels];
 	int i, j;
 	int32_t tmp;

 	for (i = 0; i < channels; i++)
 		input_ptr[i] = input[i];

 	for (i = 0; i < frames; i++)
 		for (j = 0; j < channels; j++, output += 3) {
 			float f = *(input_ptr[j]++) * 2147483648.0f;
 			f += (f >= 0) ? 0.5f : -0.5f;
 			tmp = max((float)INT_MIN, min((float)INT_MAX, f));
 			tmp >>= 8;
 			memcpy(output, &tmp, 3);
 		}
 }

 static void dsp_util_interleave_s32le(float *const *input, int32_t *output,
 				      int channels, int frames)
 {
 	float *input_ptr[channels];
 	int i, j;

 	for (i = 0; i < channels; i++)
 		input_ptr[i] = input[i];

 	for (i = 0; i < frames; i++)
 		for (j = 0; j < channels; j++, output++) {
 			float f = *(input_ptr[j]++) * 2147483648.0f;
 			f += (f >= 0) ? 0.5f : -0.5f;
 			*output = max((float)INT_MIN, min((float)INT_MAX, f));
 		}
 }

 int dsp_util_interleave(float *const *input, uint8_t *output, int channels,
 			snd_pcm_format_t format, int frames)
 {
 	switch (format) {
 	case SND_PCM_FORMAT_S16_LE:
 		dsp_util_interleave_s16le(input, (int16_t *)output, channels,
 					  frames);
 		break;
 	case SND_PCM_FORMAT_S24_LE:
 		dsp_util_interleave_s24le(input, (int32_t *)output, channels,
 					  frames);
 		break;
 	case SND_PCM_FORMAT_S24_3LE:
 		dsp_util_interleave_s243le(input, output, channels, frames);
 		break;
 	case SND_PCM_FORMAT_S32_LE:
 		dsp_util_interleave_s32le(input, (int32_t *)output, channels,
 					  frames);
 		break;
 	default:
 		syslog(LOG_ERR, "Invalid format to interleave");
 		return -EINVAL;
 	}
 	return 0;
 }

 void dsp_enable_flush_denormal_to_zero()
 {
 #if defined(__i386__) || defined(__x86_64__)
 	unsigned int mxcsr;
 	mxcsr = __builtin_ia32_stmxcsr();
 	__builtin_ia32_ldmxcsr(mxcsr | 0x8040);
 #elif defined(__aarch64__)
 	uint64_t cw;
 	__asm__ __volatile__("mrs    %0, fpcr			    \n"
 			     "orr    %0, %0, #0x1000000		    \n"
 			     "msr    fpcr, %0			    \n"
 			     "isb					    \n"
 			     : "=r"(cw)::"memory");
 #elif defined(__arm__)
 	uint32_t cw;
 	__asm__ __volatile__("vmrs   %0, fpscr			    \n"
 			     "orr    %0, %0, #0x1000000		    \n"
 			     "vmsr   fpscr, %0			    \n"
 			     : "=r"(cw)::"memory");
 #else
 #warning "Don't know how to disable denorms. Performace may suffer."
 #endif
 }
	/* Copyright 2013 The Chromium OS Authors. All rights reserved.
	* Use of this source code is governed by a BSD-style license that can be
	* found in the LICENSE file.
	*/

	#include <limits.h>
	#include <syslog.h>

	#include "dsp_util.h"

	#ifndef max
	#define max(a, b) \
	({ \
	__typeof__(a) _a = (a); \
	__typeof__(b) _b = (b); \
	_a > _b ? _a : _b; \
	})
	#endif

	#ifndef min
	#define min(a, b) \
	({ \
	__typeof__(a) _a = (a); \
	__typeof__(b) _b = (b); \
	_a < _b ? _a : _b; \
	})
	#endif

	#undef deinterleave_stereo
	#undef interleave_stereo

	/* Converts shorts in range of -32768 to 32767 to floats in range of
	* -1.0f to 1.0f.
	* scvtf instruction accepts fixed point ints, so sxtl is used to lengthen
	* shorts to int with sign extension.
	*/
	#ifdef __aarch64__
	static void deinterleave_stereo(int16_t input, float output1, float *output2,
	int frames)
	{
	int chunk = frames >> 3;
	frames &= 7;
	/* Process 8 frames (16 samples) each loop. */
	/* L0 R0 L1 R1 L2 R2 L3 R3... -> L0 L1 L2 L3... R0 R1 R2 R3... */
	if (chunk) {
	// clang-format off
	__asm__ __volatile__(
	"1: \n"
	"ld2 {v2.8h, v3.8h}, [%[input]], #32 \n"
	"subs %w[chunk], %w[chunk], #1 \n"
	"sxtl v0.4s, v2.4h \n"
	"sxtl2 v1.4s, v2.8h \n"
	"sxtl v2.4s, v3.4h \n"
	"sxtl2 v3.4s, v3.8h \n"
	"scvtf v0.4s, v0.4s, #15 \n"
	"scvtf v1.4s, v1.4s, #15 \n"
	"scvtf v2.4s, v2.4s, #15 \n"
	"scvtf v3.4s, v3.4s, #15 \n"
	"st1 {v0.4s, v1.4s}, [%[output1]], #32 \n"
	"st1 {v2.4s, v3.4s}, [%[output2]], #32 \n"
	"b.ne 1b \n"
	: /* output */
	[chunk]"+r"(chunk),
	[input]"+r"(input),
	[output1]"+r"(output1),
	[output2]"+r"(output2)
	: /* input */
	: /* clobber */
	"v0", "v1", "v2", "v3", "memory", "cc");
	// clang-format on
	}

	/* The remaining samples. */
	while (frames--) {
	output1++ = input++ / 32768.0f;
	output2++ = input++ / 32768.0f;
	}
	}
	#define deinterleave_stereo deinterleave_stereo

	/* Converts floats in range of -1.0f to 1.0f to shorts in range of
	* -32768 to 32767 with rounding to nearest, with ties (0.5) rounding away
	* from zero.
	* Rounding is achieved by using fcvtas instruction. (a = away)
	* The float scaled to a range of -32768 to 32767 by adding 15 to the exponent.
	* Add to exponent is equivalent to multiply for exponent range of 0 to 239,
	* which is 2.59 * 10^33. A signed saturating add (sqadd) limits exponents
	* from 240 to 255 to clamp to 255.
	* For very large values, beyond +/- 2 billion, fcvtas will clamp the result
	* to the min or max value that fits an int.
	* For other values, sqxtn clamps the output to -32768 to 32767 range.
	*/
	static void interleave_stereo(float input1, float input2, int16_t *output,
	int frames)
	{
	/* Process 4 frames (8 samples) each loop. */
	/* L0 L1 L2 L3, R0 R1 R2 R3 -> L0 R0 L1 R1, L2 R2 L3 R3 */
	int chunk = frames >> 2;
	frames &= 3;

	if (chunk) {
	// clang-format off
	__asm__ __volatile__(
	"dup v2.4s, %w[scale] \n"
	"1: \n"
	"ld1 {v0.4s}, [%[input1]], #16 \n"
	"ld1 {v1.4s}, [%[input2]], #16 \n"
	"subs %w[chunk], %w[chunk], #1 \n"
	"sqadd v0.4s, v0.4s, v2.4s \n"
	"sqadd v1.4s, v1.4s, v2.4s \n"
	"fcvtas v0.4s, v0.4s \n"
	"fcvtas v1.4s, v1.4s \n"
	"sqxtn v0.4h, v0.4s \n"
	"sqxtn v1.4h, v1.4s \n"
	"st2 {v0.4h, v1.4h}, [%[output]], #16 \n"
	"b.ne 1b \n"
	: /* output */
	[chunk]"+r"(chunk),
	[input1]"+r"(input1),
	[input2]"+r"(input2),
	[output]"+r"(output)
	: /* input */
	[scale]"r"(15 << 23)
	: /* clobber */
	"v0", "v1", "v2", "memory", "cc");
	// clang-format on
	}

	/* The remaining samples */
	while (frames--) {
	float f;
	f = input1++ 32768.0f;
	f += (f >= 0) ? 0.5f : -0.5f;
	*output++ = max(-32768, min(32767, (int)(f)));
	f = input2++ 32768.0f;
	f += (f >= 0) ? 0.5f : -0.5f;
	*output++ = max(-32768, min(32767, (int)(f)));
	}
	}
	#define interleave_stereo interleave_stereo
	#endif

	#ifdef __ARM_NEON__
	#include <arm_neon.h>

	static void deinterleave_stereo(int16_t input, float output1, float *output2,
	int frames)
	{
	/* Process 8 frames (16 samples) each loop. */
	/* L0 R0 L1 R1 L2 R2 L3 R3... -> L0 L1 L2 L3... R0 R1 R2 R3... */
	int chunk = frames >> 3;
	frames &= 7;
	if (chunk) {
	// clang-format off
	__asm__ __volatile__(
	"1: \n"
	"vld2.16 {d0-d3}, [%[input]]! \n"
	"subs %[chunk], #1 \n"
	"vmovl.s16 q3, d3 \n"
	"vmovl.s16 q2, d2 \n"
	"vmovl.s16 q1, d1 \n"
	"vmovl.s16 q0, d0 \n"
	"vcvt.f32.s32 q3, q3, #15 \n"
	"vcvt.f32.s32 q2, q2, #15 \n"
	"vcvt.f32.s32 q1, q1, #15 \n"
	"vcvt.f32.s32 q0, q0, #15 \n"
	"vst1.32 {d4-d7}, [%[output2]]! \n"
	"vst1.32 {d0-d3}, [%[output1]]! \n"
	"bne 1b \n"
	: /* output */
	[chunk]"+r"(chunk),
	[input]"+r"(input),
	[output1]"+r"(output1),
	[output2]"+r"(output2)
	: /* input */
	: /* clobber */
	"q0", "q1", "q2", "q3", "memory", "cc");
	// clang-format on
	}

	/* The remaining samples. */
	while (frames--) {
	output1++ = input++ / 32768.0f;
	output2++ = input++ / 32768.0f;
	}
	}
	#define deinterleave_stereo deinterleave_stereo

	/* Converts floats in range of -1.0f to 1.0f to shorts in range of
	* -32768 to 32767 with rounding to nearest, with ties (0.5) rounding away
	* from zero.
	* Rounding is achieved by adding 0.5 or -0.5 adjusted for fixed point
	* precision, and then converting float to fixed point using vcvt instruction
	* which truncated toward zero.
	* For very large values, beyond +/- 2 billion, vcvt will clamp the result
	* to the min or max value that fits an int.
	* For other values, vqmovn clamps the output to -32768 to 32767 range.
	*/
	static void interleave_stereo(float input1, float input2, int16_t *output,
	int frames)
	{
	/* Process 4 frames (8 samples) each loop. */
	/* L0 L1 L2 L3, R0 R1 R2 R3 -> L0 R0 L1 R1, L2 R2 L3 R3 */
	float32x4_t pos = vdupq_n_f32(0.5f / 32768.0f);
	float32x4_t neg = vdupq_n_f32(-0.5f / 32768.0f);
	int chunk = frames >> 2;
	frames &= 3;

	if (chunk) {
	// clang-format off
	__asm__ __volatile__(
	"veor q0, q0, q0 \n"
	"1: \n"
	"vld1.32 {d2-d3}, [%[input1]]! \n"
	"vld1.32 {d4-d5}, [%[input2]]! \n"
	"subs %[chunk], #1 \n"
	/* We try to round to the nearest number by adding 0.5
	* to positive input, and adding -0.5 to the negative
	* input, then truncate.
	*/
	"vcgt.f32 q3, q1, q0 \n"
	"vcgt.f32 q4, q2, q0 \n"
	"vbsl q3, %q[pos], %q[neg] \n"
	"vbsl q4, %q[pos], %q[neg] \n"
	"vadd.f32 q1, q1, q3 \n"
	"vadd.f32 q2, q2, q4 \n"
	"vcvt.s32.f32 q1, q1, #15 \n"
	"vcvt.s32.f32 q2, q2, #15 \n"
	"vqmovn.s32 d2, q1 \n"
	"vqmovn.s32 d3, q2 \n"
	"vst2.16 {d2-d3}, [%[output]]! \n"
	"bne 1b \n"
	: /* output */
	[chunk]"+r"(chunk),
	[input1]"+r"(input1),
	[input2]"+r"(input2),
	[output]"+r"(output)
	: /* input */
	[pos]"w"(pos),
	[neg]"w"(neg)
	: /* clobber */
	"q0", "q1", "q2", "q3", "q4", "memory", "cc");
	// clang-format on
	}

	/* The remaining samples */
	while (frames--) {
	float f;
	f = input1++ 32768.0f;
	f += (f >= 0) ? 0.5f : -0.5f;
	*output++ = max(-32768, min(32767, (int)(f)));
	f = input2++ 32768.0f;
	f += (f >= 0) ? 0.5f : -0.5f;
	*output++ = max(-32768, min(32767, (int)(f)));
	}
	}
	#define interleave_stereo interleave_stereo
	#endif

	#ifdef __SSE3__
	#include <emmintrin.h>

	/* Converts shorts in range of -32768 to 32767 to floats in range of
	* -1.0f to 1.0f.
	* pslld and psrad shifts are used to isolate the low and high word, but
	* each in a different range:
	* The low word is shifted to the high bits in range 0x80000000 .. 0x7fff0000.
	* The high word is shifted to the low bits in range 0x00008000 .. 0x00007fff.
	* cvtdq2ps converts ints to floats as is.
	* mulps is used to normalize the range of the low and high words, adjusting
	* for high and low words being in different range.
	*/
	static void deinterleave_stereo(int16_t input, float output1, float *output2,
	int frames)
	{
	/* Process 8 frames (16 samples) each loop. */
	/* L0 R0 L1 R1 L2 R2 L3 R3... -> L0 L1 L2 L3... R0 R1 R2 R3... */
	int chunk = frames >> 3;
	frames &= 7;
	if (chunk) {
	// clang-format off
	__asm__ __volatile__(
	"1: \n"
	"lddqu (%[input]), %%xmm0 \n"
	"lddqu 16(%[input]), %%xmm1 \n"
	"add $32, %[input] \n"
	"movdqa %%xmm0, %%xmm2 \n"
	"movdqa %%xmm1, %%xmm3 \n"
	"pslld $16, %%xmm0 \n"
	"pslld $16, %%xmm1 \n"
	"psrad $16, %%xmm2 \n"
	"psrad $16, %%xmm3 \n"
	"cvtdq2ps %%xmm0, %%xmm0 \n"
	"cvtdq2ps %%xmm1, %%xmm1 \n"
	"cvtdq2ps %%xmm2, %%xmm2 \n"
	"cvtdq2ps %%xmm3, %%xmm3 \n"
	"mulps %[scale_2_n31], %%xmm0 \n"
	"mulps %[scale_2_n31], %%xmm1 \n"
	"mulps %[scale_2_n15], %%xmm2 \n"
	"mulps %[scale_2_n15], %%xmm3 \n"
	"movdqu %%xmm0, (%[output1]) \n"
	"movdqu %%xmm1, 16(%[output1]) \n"
	"movdqu %%xmm2, (%[output2]) \n"
	"movdqu %%xmm3, 16(%[output2]) \n"
	"add $32, %[output1] \n"
	"add $32, %[output2] \n"
	"sub $1, %[chunk] \n"
	"jnz 1b \n"
	: /* output */
	[chunk]"+r"(chunk),
	[input]"+r"(input),
	[output1]"+r"(output1),
	[output2]"+r"(output2)
	: /* input */
	[scale_2_n31]"x"(_mm_set1_ps(1.0f/(1<<15)/(1<<16))),
	[scale_2_n15]"x"(_mm_set1_ps(1.0f/(1<<15)))
	: /* clobber */
	"xmm0", "xmm1", "xmm2", "xmm3", "memory", "cc");
	// clang-format on
	}

	/* The remaining samples. */
	while (frames--) {
	output1++ = input++ / 32768.0f;
	output2++ = input++ / 32768.0f;
	}
	}
	#define deinterleave_stereo deinterleave_stereo

	/* Converts floats in range of -1.0f to 1.0f to shorts in range of
	* -32768 to 32767 with rounding to nearest, with ties (0.5) rounding to
	* even.
	* For very large values, beyond +/- 2 billion, cvtps2dq will produce
	* 0x80000000 and packssdw will clamp -32768.
	*/
	static void interleave_stereo(float input1, float input2, int16_t *output,
	int frames)
	{
	/* Process 4 frames (8 samples) each loop. */
	/* L0 L1 L2 L3, R0 R1 R2 R3 -> L0 R0 L1 R1, L2 R2 L3 R3 */
	int chunk = frames >> 2;
	frames &= 3;

	if (chunk) {
	// clang-format off
	__asm__ __volatile__(
	"1: \n"
	"lddqu (%[input1]), %%xmm0 \n"
	"lddqu (%[input2]), %%xmm2 \n"
	"add $16, %[input1] \n"
	"add $16, %[input2] \n"
	"movaps %%xmm0, %%xmm1 \n"
	"unpcklps %%xmm2, %%xmm0 \n"
	"unpckhps %%xmm2, %%xmm1 \n"
	"paddsw %[scale_2_15], %%xmm0 \n"
	"paddsw %[scale_2_15], %%xmm1 \n"
	"cvtps2dq %%xmm0, %%xmm0 \n"
	"cvtps2dq %%xmm1, %%xmm1 \n"
	"packssdw %%xmm1, %%xmm0 \n"
	"movdqu %%xmm0, (%[output]) \n"
	"add $16, %[output] \n"
	"sub $1, %[chunk] \n"
	"jnz 1b \n"
	: /* output */
	[chunk]"+r"(chunk),
	[input1]"+r"(input1),
	[input2]"+r"(input2),
	[output]"+r"(output)
	: /* input */
	[scale_2_15]"x"(_mm_set1_epi32(15 << 23)),
	[clamp_large]"x"(_mm_set1_ps(32767.0f))
	: /* clobber */
	"xmm0", "xmm1", "xmm2", "memory", "cc");
	// clang-format on
	}

	/* The remaining samples */
	while (frames--) {
	float f;
	f = input1++ 32768.0f;
	f += (f >= 0) ? 0.5f : -0.5f;
	*output++ = max(-32768, min(32767, (int)(f)));
	f = input2++ 32768.0f;
	f += (f >= 0) ? 0.5f : -0.5f;
	*output++ = max(-32768, min(32767, (int)(f)));
	}
	}
	#define interleave_stereo interleave_stereo
	#endif

	static void dsp_util_deinterleave_s16le(int16_t input, float const *output,
	int channels, int frames)
	{
	float *output_ptr[channels];
	int i, j;

	#ifdef deinterleave_stereo
	if (channels == 2) {
	deinterleave_stereo(input, output[0], output[1], frames);
	return;
	}
	#endif

	for (i = 0; i < channels; i++)
	output_ptr[i] = output[i];

	for (i = 0; i < frames; i++)
	for (j = 0; j < channels; j++)
	(output_ptr[j]++) = input++ / 32768.0f;
	}

	static void dsp_util_deinterleave_s24le(int32_t input, float const *output,
	int channels, int frames)
	{
	float *output_ptr[channels];
	int i, j;

	for (i = 0; i < channels; i++)
	output_ptr[i] = output[i];

	for (i = 0; i < frames; i++)
	for (j = 0; j < channels; j++, input++)
	(output_ptr[j]++) = (input << 8) / 2147483648.0f;
	}

	static void dsp_util_deinterleave_s243le(uint8_t input, float const *output,
	int channels, int frames)
	{
	float *output_ptr[channels];
	int32_t sample;
	int i, j;

	for (i = 0; i < channels; i++)
	output_ptr[i] = output[i];

	for (i = 0; i < frames; i++)
	for (j = 0; j < channels; j++, input += 3) {
	sample = 0;
	memcpy((uint8_t *)&sample + 1, input, 3);
	*(output_ptr[j]++) = sample / 2147483648.0f;
	}
	}

	static void dsp_util_deinterleave_s32le(int32_t input, float const *output,
	int channels, int frames)
	{
	float *output_ptr[channels];
	int i, j;

	for (i = 0; i < channels; i++)
	output_ptr[i] = output[i];

	for (i = 0; i < frames; i++)
	for (j = 0; j < channels; j++, input++)
	(output_ptr[j]++) = input / 2147483648.0f;
	}

	int dsp_util_deinterleave(uint8_t input, float const *output, int channels,
	snd_pcm_format_t format, int frames)
	{
	switch (format) {
	case SND_PCM_FORMAT_S16_LE:
	dsp_util_deinterleave_s16le((int16_t *)input, output, channels,
	frames);
	break;
	case SND_PCM_FORMAT_S24_LE:
	dsp_util_deinterleave_s24le((int32_t *)input, output, channels,
	frames);
	break;
	case SND_PCM_FORMAT_S24_3LE:
	dsp_util_deinterleave_s243le(input, output, channels, frames);
	break;
	case SND_PCM_FORMAT_S32_LE:
	dsp_util_deinterleave_s32le((int32_t *)input, output, channels,
	frames);
	break;
	default:
	syslog(LOG_ERR, "Invalid format to deinterleave");
	return -EINVAL;
	}
	return 0;
	}

	static void dsp_util_interleave_s16le(float const input, int16_t *output,
	int channels, int frames)
	{
	float *input_ptr[channels];
	int i, j;

	#ifdef interleave_stereo
	if (channels == 2) {
	interleave_stereo(input[0], input[1], output, frames);
	return;
	}
	#endif

	for (i = 0; i < channels; i++)
	input_ptr[i] = input[i];

	for (i = 0; i < frames; i++)
	for (j = 0; j < channels; j++) {
	float f = (input_ptr[j]++) 32768.0f;
	f += (f >= 0) ? 0.5f : -0.5f;
	*output++ = max(-32768, min(32767, (int)(f)));
	}
	}

	static void dsp_util_interleave_s24le(float const input, int32_t *output,
	int channels, int frames)
	{
	float *input_ptr[channels];
	int i, j;

	for (i = 0; i < channels; i++)
	input_ptr[i] = input[i];

	for (i = 0; i < frames; i++)
	for (j = 0; j < channels; j++, output++) {
	float f = (input_ptr[j]++) 2147483648.0f;
	f += (f >= 0) ? 0.5f : -0.5f;
	*output = max((float)INT_MIN, min((float)INT_MAX, f));
	output = (output >> 8) & 0x00ffffff;
	}
	}

	static void dsp_util_interleave_s243le(float const input, uint8_t *output,
	int channels, int frames)
	{
	float *input_ptr[channels];
	int i, j;
	int32_t tmp;

	for (i = 0; i < channels; i++)
	input_ptr[i] = input[i];

	for (i = 0; i < frames; i++)
	for (j = 0; j < channels; j++, output += 3) {
	float f = (input_ptr[j]++) 2147483648.0f;
	f += (f >= 0) ? 0.5f : -0.5f;
	tmp = max((float)INT_MIN, min((float)INT_MAX, f));
	tmp >>= 8;
	memcpy(output, &tmp, 3);
	}
	}

	static void dsp_util_interleave_s32le(float const input, int32_t *output,
	int channels, int frames)
	{
	float *input_ptr[channels];
	int i, j;

	for (i = 0; i < channels; i++)
	input_ptr[i] = input[i];

	for (i = 0; i < frames; i++)
	for (j = 0; j < channels; j++, output++) {
	float f = (input_ptr[j]++) 2147483648.0f;
	f += (f >= 0) ? 0.5f : -0.5f;
	*output = max((float)INT_MIN, min((float)INT_MAX, f));
	}
	}

	int dsp_util_interleave(float const input, uint8_t *output, int channels,
	snd_pcm_format_t format, int frames)
	{
	switch (format) {
	case SND_PCM_FORMAT_S16_LE:
	dsp_util_interleave_s16le(input, (int16_t *)output, channels,
	frames);
	break;
	case SND_PCM_FORMAT_S24_LE:
	dsp_util_interleave_s24le(input, (int32_t *)output, channels,
	frames);
	break;
	case SND_PCM_FORMAT_S24_3LE:
	dsp_util_interleave_s243le(input, output, channels, frames);
	break;
	case SND_PCM_FORMAT_S32_LE:
	dsp_util_interleave_s32le(input, (int32_t *)output, channels,
	frames);
	break;
	default:
	syslog(LOG_ERR, "Invalid format to interleave");
	return -EINVAL;
	}
	return 0;
	}

	void dsp_enable_flush_denormal_to_zero()
	{
	#if defined(__i386__) \|\| defined(__x86_64__)
	unsigned int mxcsr;
	mxcsr = __builtin_ia32_stmxcsr();
	__builtin_ia32_ldmxcsr(mxcsr \| 0x8040);
	#elif defined(__aarch64__)
	uint64_t cw;
	__asm__ __volatile__("mrs %0, fpcr \n"
	"orr %0, %0, #0x1000000 \n"
	"msr fpcr, %0 \n"
	"isb \n"
	: "=r"(cw)::"memory");
	#elif defined(__arm__)
	uint32_t cw;
	__asm__ __volatile__("vmrs %0, fpscr \n"
	"orr %0, %0, #0x1000000 \n"
	"vmsr fpscr, %0 \n"
	: "=r"(cw)::"memory");
	#else
	#warning "Don't know how to disable denorms. Performace may suffer."
	#endif
	}