1/* aec.cpp
2 *
3 * Copyright (C) DFS Deutsche Flugsicherung (2004, 2005).
4 * All Rights Reserved.
5 *
6 * Acoustic Echo Cancellation NLMS-pw algorithm
7 *
8 * Version 0.3 filter created with www.dsptutor.freeuk.com
9 * Version 0.3.1 Allow change of stability parameter delta
10 * Version 0.4 Leaky Normalized LMS - pre whitening algorithm
11 */
12
13#ifndef _GNU_SOURCE
14#define _GNU_SOURCE
15#endif
16
17#ifdef HAVE_CONFIG_H
18#include <config.h>
19#endif
20
21#include <math.h>
22#include <string.h>
23#include <stdint.h>
24
25#include <pulse/xmalloc.h>
26
27#include "adrian-aec.h"
28
29#ifndef DISABLE_ORC
30#include "adrian-aec-orc-gen.h"
31#endif
32
33#ifdef __SSE__
34#include <xmmintrin.h>
35#endif
36
37/* Vector Dot Product */
38static REAL dotp(REAL a[], REAL b[])
39{
40  REAL sum0 = 0.0f, sum1 = 0.0f;
41  int j;
42
43  for (j = 0; j < NLMS_LEN; j += 2) {
44    // optimize: partial loop unrolling
45    sum0 += a[j] * b[j];
46    sum1 += a[j + 1] * b[j + 1];
47  }
48  return sum0 + sum1;
49}
50
51static REAL dotp_sse(REAL a[], REAL b[])
52{
53#ifdef __SSE__
54  /* This is taken from speex's inner product implementation */
55  int j;
56  REAL sum;
57  __m128 acc = _mm_setzero_ps();
58
59  for (j=0;j<NLMS_LEN;j+=8)
60  {
61    acc = _mm_add_ps(acc, _mm_mul_ps(_mm_load_ps(a+j), _mm_loadu_ps(b+j)));
62    acc = _mm_add_ps(acc, _mm_mul_ps(_mm_load_ps(a+j+4), _mm_loadu_ps(b+j+4)));
63  }
64  acc = _mm_add_ps(acc, _mm_movehl_ps(acc, acc));
65  acc = _mm_add_ss(acc, _mm_shuffle_ps(acc, acc, 0x55));
66  _mm_store_ss(&sum, acc);
67
68  return sum;
69#else
70  return dotp(a, b);
71#endif
72}
73
74
75AEC* AEC_init(int RATE, int have_vector)
76{
77  AEC *a = pa_xnew0(AEC, 1);
78  a->j = NLMS_EXT;
79  AEC_setambient(a, NoiseFloor);
80  a->dfast = a->dslow = M75dB_PCM;
81  a->xfast = a->xslow = M80dB_PCM;
82  a->gain = 1.0f;
83  a->Fx = IIR1_init(2000.0f/RATE);
84  a->Fe = IIR1_init(2000.0f/RATE);
85  a->cutoff = FIR_HP_300Hz_init();
86  a->acMic = IIR_HP_init();
87  a->acSpk = IIR_HP_init();
88
89  a->aes_y2 = M0dB;
90
91  a->fdwdisplay = -1;
92
93  if (have_vector) {
94      /* Get a 16-byte aligned location */
95      a->w = (REAL *) (((uintptr_t) a->w_arr) - (((uintptr_t) a->w_arr) % 16) + 16);
96      a->dotp = dotp_sse;
97  } else {
98      /* We don't care about alignment, just use the array as-is */
99      a->w = a->w_arr;
100      a->dotp = dotp;
101  }
102
103  return a;
104}
105
106void AEC_done(AEC *a) {
107    pa_assert(a);
108
109    pa_xfree(a->Fx);
110    pa_xfree(a->Fe);
111    pa_xfree(a->acMic);
112    pa_xfree(a->acSpk);
113    pa_xfree(a->cutoff);
114    pa_xfree(a);
115}
116
117// Adrian soft decision DTD
118// (Dual Average Near-End to Far-End signal Ratio DTD)
119// This algorithm uses exponential smoothing with different
120// ageing parameters to get fast and slow near-end and far-end
121// signal averages. The ratio of NFRs term
122// (dfast / xfast) / (dslow / xslow) is used to compute the stepsize
123// A ratio value of 2.5 is mapped to stepsize 0, a ratio of 0 is
124// mapped to 1.0 with a limited linear function.
125static float AEC_dtd(AEC *a, REAL d, REAL x)
126{
127  float ratio, stepsize;
128
129  // fast near-end and far-end average
130  a->dfast += ALPHAFAST * (fabsf(d) - a->dfast);
131  a->xfast += ALPHAFAST * (fabsf(x) - a->xfast);
132
133  // slow near-end and far-end average
134  a->dslow += ALPHASLOW * (fabsf(d) - a->dslow);
135  a->xslow += ALPHASLOW * (fabsf(x) - a->xslow);
136
137  if (a->xfast < M70dB_PCM) {
138    return 0.0f;   // no Spk signal
139  }
140
141  if (a->dfast < M70dB_PCM) {
142    return 0.0f;   // no Mic signal
143  }
144
145  // ratio of NFRs
146  ratio = (a->dfast * a->xslow) / (a->dslow * a->xfast);
147
148  // Linear interpolation with clamping at the limits
149  if (ratio < STEPX1)
150    stepsize = STEPY1;
151  else if (ratio > STEPX2)
152    stepsize = STEPY2;
153  else
154    stepsize = STEPY1 + (STEPY2 - STEPY1) * (ratio - STEPX1) / (STEPX2 - STEPX1);
155
156  return stepsize;
157}
158
159
160static void AEC_leaky(AEC *a)
161// The xfast signal is used to charge the hangover timer to Thold.
162// When hangover expires (no Spk signal for some time) the vector w
163// is erased. This is my implementation of Leaky NLMS.
164{
165  if (a->xfast >= M70dB_PCM) {
166    // vector w is valid for hangover Thold time
167    a->hangover = Thold;
168  } else {
169    if (a->hangover > 1) {
170      --(a->hangover);
171    } else if (1 == a->hangover) {
172      --(a->hangover);
173      // My Leaky NLMS is to erase vector w when hangover expires
174      memset(a->w_arr, 0, sizeof(a->w_arr));
175    }
176  }
177}
178
179
180#if 0
181void AEC::openwdisplay() {
182  // open TCP connection to program wdisplay.tcl
183  fdwdisplay = socket_async("127.0.0.1", 50999);
184};
185#endif
186
187
188static REAL AEC_nlms_pw(AEC *a, REAL d, REAL x_, float stepsize)
189{
190  REAL e;
191  REAL ef;
192  a->x[a->j] = x_;
193  a->xf[a->j] = IIR1_highpass(a->Fx, x_);     // pre-whitening of x
194
195  // calculate error value
196  // (mic signal - estimated mic signal from spk signal)
197  e = d;
198  if (a->hangover > 0) {
199    e -= a->dotp(a->w, a->x + a->j);
200  }
201  ef = IIR1_highpass(a->Fe, e);     // pre-whitening of e
202
203  // optimize: iterative dotp(xf, xf)
204  a->dotp_xf_xf += (a->xf[a->j] * a->xf[a->j] - a->xf[a->j + NLMS_LEN - 1] * a->xf[a->j + NLMS_LEN - 1]);
205
206  if (stepsize > 0.0f) {
207    // calculate variable step size
208    REAL mikro_ef = stepsize * ef / a->dotp_xf_xf;
209
210#ifdef DISABLE_ORC
211    // update tap weights (filter learning)
212    int i;
213    for (i = 0; i < NLMS_LEN; i += 2) {
214      // optimize: partial loop unrolling
215      a->w[i] += mikro_ef * a->xf[i + a->j];
216      a->w[i + 1] += mikro_ef * a->xf[i + a->j + 1];
217    }
218#else
219    update_tap_weights(a->w, &a->xf[a->j], mikro_ef, NLMS_LEN);
220#endif
221  }
222
223  if (--(a->j) < 0) {
224    // optimize: decrease number of memory copies
225    a->j = NLMS_EXT;
226    memmove(a->x + a->j + 1, a->x, (NLMS_LEN - 1) * sizeof(REAL));
227    memmove(a->xf + a->j + 1, a->xf, (NLMS_LEN - 1) * sizeof(REAL));
228  }
229
230  // Saturation
231  if (e > MAXPCM) {
232    return MAXPCM;
233  } else if (e < -MAXPCM) {
234    return -MAXPCM;
235  } else {
236    return e;
237  }
238}
239
240
241int AEC_doAEC(AEC *a, int d_, int x_)
242{
243  REAL d = (REAL) d_;
244  REAL x = (REAL) x_;
245
246  // Mic Highpass Filter - to remove DC
247  d = IIR_HP_highpass(a->acMic, d);
248
249  // Mic Highpass Filter - cut-off below 300Hz
250  d = FIR_HP_300Hz_highpass(a->cutoff, d);
251
252  // Amplify, for e.g. Soundcards with -6dB max. volume
253  d *= a->gain;
254
255  // Spk Highpass Filter - to remove DC
256  x = IIR_HP_highpass(a->acSpk, x);
257
258  // Double Talk Detector
259  a->stepsize = AEC_dtd(a, d, x);
260
261  // Leaky (ageing of vector w)
262  AEC_leaky(a);
263
264  // Acoustic Echo Cancellation
265  d = AEC_nlms_pw(a, d, x, a->stepsize);
266
267#if 0
268  if (fdwdisplay >= 0) {
269    if (++dumpcnt >= (WIDEB*RATE/10)) {
270      // wdisplay creates 10 dumps per seconds = large CPU load!
271      dumpcnt = 0;
272      write(fdwdisplay, ws, DUMP_LEN*sizeof(float));
273      // we don't check return value. This is not production quality!!!
274      memset(ws, 0, sizeof(ws));
275    } else {
276      int i;
277      for (i = 0; i < DUMP_LEN; i += 2) {
278        // optimize: partial loop unrolling
279        ws[i] += w[i];
280        ws[i + 1] += w[i + 1];
281      }
282    }
283  }
284#endif
285
286  return (int) d;
287}
288