# https://tmkk.undo.jp/lame/lame-3.100-sse-20171014.diff --- libmp3lame/fft.c.orig 2017-09-07 04:33:36.000000000 +0900 +++ libmp3lame/fft.c 2017-10-14 12:03:44.000000000 +0900 @@ -331,7 +331,7 @@ init_fft(lame_internal_flags * const gfc } #else #ifdef HAVE_XMMINTRIN_H -#ifdef MIN_ARCH_SSE +#if defined(MIN_ARCH_SSE) || defined(__x86_64__) gfc->fft_fht = fht_SSE2; #endif #endif --- libmp3lame/gain_analysis.c.orig 2017-10-11 04:08:39.000000000 +0900 +++ libmp3lame/gain_analysis.c 2017-10-14 12:06:19.000000000 +0900 @@ -95,6 +95,9 @@ #include <stdio.h> #include <stdlib.h> #include <string.h> +#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) +#include <xmmintrin.h> +#endif #include "lame.h" #include "machine.h" @@ -109,6 +112,67 @@ /*lint -save -e736 loss of precision */ +#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) +static const Float_t ABYule[9][2 * YULE_ORDER + 1 + 3] __attribute__ ((aligned (16))) = { + {0.03857599435200, -3.84664617118067, -0.02160367184185, 7.81501653005538, -0.00123395316851, + -11.34170355132042, -0.00009291677959, 13.05504219327545, -0.01655260341619, + -12.28759895145294, 0.02161526843274, 9.48293806319790, -0.02074045215285, -5.87257861775999, + 0.00594298065125, 2.75465861874613, 0.00306428023191, -0.86984376593551, 0.00012025322027, + 0.13919314567432, 0.00288463683916, 0.0, 0.0, 0.0}, + {0.05418656406430, -3.47845948550071, -0.02911007808948, 6.36317777566148, -0.00848709379851, + -8.54751527471874, -0.00851165645469, 9.47693607801280, -0.00834990904936, -8.81498681370155, + 0.02245293253339, 6.85401540936998, -0.02596338512915, -4.39470996079559, 0.01624864962975, + 2.19611684890774, -0.00240879051584, -0.75104302451432, 0.00674613682247, 0.13149317958808, + -0.00187763777362, 0.0, 0.0, 0.0}, + {0.15457299681924, -2.37898834973084, -0.09331049056315, 2.84868151156327, -0.06247880153653, + -2.64577170229825, 0.02163541888798, 2.23697657451713, -0.05588393329856, -1.67148153367602, + 0.04781476674921, 1.00595954808547, 0.00222312597743, 
-0.45953458054983, 0.03174092540049, + 0.16378164858596, -0.01390589421898, -0.05032077717131, 0.00651420667831, 0.02347897407020, + -0.00881362733839, 0.0, 0.0, 0.0}, + {0.30296907319327, -1.61273165137247, -0.22613988682123, 1.07977492259970, -0.08587323730772, + -0.25656257754070, 0.03282930172664, -0.16276719120440, -0.00915702933434, -0.22638893773906, + -0.02364141202522, 0.39120800788284, -0.00584456039913, -0.22138138954925, 0.06276101321749, + 0.04500235387352, -0.00000828086748, 0.02005851806501, 0.00205861885564, 0.00302439095741, + -0.02950134983287, 0.0, 0.0, 0.0}, + {0.33642304856132, -1.49858979367799, -0.25572241425570, 0.87350271418188, -0.11828570177555, + 0.12205022308084, 0.11921148675203, -0.80774944671438, -0.07834489609479, 0.47854794562326, + -0.00469977914380, -0.12453458140019, -0.00589500224440, -0.04067510197014, 0.05724228140351, + 0.08333755284107, 0.00832043980773, -0.04237348025746, -0.01635381384540, 0.02977207319925, + -0.01760176568150, 0.0, 0.0, 0.0}, + {0.44915256608450, -0.62820619233671, -0.14351757464547, 0.29661783706366, -0.22784394429749, + -0.37256372942400, -0.01419140100551, 0.00213767857124, 0.04078262797139, -0.42029820170918, + -0.12398163381748, 0.22199650564824, 0.04097565135648, 0.00613424350682, 0.10478503600251, + 0.06747620744683, -0.01863887810927, 0.05784820375801, -0.03193428438915, 0.03222754072173, + 0.00541907748707, 0.0, 0.0, 0.0}, + {0.56619470757641, -1.04800335126349, -0.75464456939302, 0.29156311971249, 0.16242137742230, + -0.26806001042947, 0.16744243493672, 0.00819999645858, -0.18901604199609, 0.45054734505008, + 0.30931782841830, -0.33032403314006, -0.27562961986224, 0.06739368333110, 0.00647310677246, + -0.04784254229033, 0.08647503780351, 0.01639907836189, -0.03788984554840, 0.01807364323573, + -0.00588215443421, 0.0, 0.0, 0.0}, + {0.58100494960553, -0.51035327095184, -0.53174909058578, -0.31863563325245, -0.14289799034253, + -0.20256413484477, 0.17520704835522, 0.14728154134330, 
0.02377945217615, 0.38952639978999, + 0.15558449135573, -0.23313271880868, -0.25344790059353, -0.05246019024463, 0.01628462406333, + -0.02505961724053, 0.06920467763959, 0.02442357316099, -0.03721611395801, 0.01818801111503, + -0.00749618797172, 0.0, 0.0, 0.0}, + {0.53648789255105, -0.25049871956020, -0.42163034350696, -0.43193942311114, -0.00275953611929, + -0.03424681017675, 0.04267842219415, -0.04678328784242, -0.10214864179676, 0.26408300200955, + 0.14590772289388, 0.15113130533216, -0.02459864859345, -0.17556493366449, -0.11202315195388, + -0.18823009262115, -0.04060034127000, 0.05477720428674, 0.04788665548180, 0.04704409688120, + -0.02217936801134, 0.0, 0.0, 0.0} +}; + +static const Float_t ABButter[9][2 * BUTTER_ORDER + 1 + 3] __attribute__ ((aligned (16))) = { + {0.98621192462708, -1.97223372919527, -1.97242384925416, 0.97261396931306, 0.98621192462708, 0.0, 0.0, 0.0}, + {0.98500175787242, -1.96977855582618, -1.97000351574484, 0.97022847566350, 0.98500175787242, 0.0, 0.0, 0.0}, + {0.97938932735214, -1.95835380975398, -1.95877865470428, 0.95920349965459, 0.97938932735214, 0.0, 0.0, 0.0}, + {0.97531843204928, -1.95002759149878, -1.95063686409857, 0.95124613669835, 0.97531843204928, 0.0, 0.0, 0.0}, + {0.97316523498161, -1.94561023566527, -1.94633046996323, 0.94705070426118, 0.97316523498161, 0.0, 0.0, 0.0}, + {0.96454515552826, -1.92783286977036, -1.92909031105652, 0.93034775234268, 0.96454515552826, 0.0, 0.0, 0.0}, + {0.96009142950541, -1.91858953033784, -1.92018285901082, 0.92177618768381, 0.96009142950541, 0.0, 0.0, 0.0}, + {0.95856916599601, -1.91542108074780, -1.91713833199203, 0.91885558323625, 0.95856916599601, 0.0, 0.0, 0.0}, + {0.94597685600279, -1.88903307939452, -1.89195371200558, 0.89487434461664, 0.94597685600279, 0.0, 0.0, 0.0} +}; +#else static const Float_t ABYule[9][multiple_of(4, 2 * YULE_ORDER + 1)] = { /* 20 18 16 14 12 10 8 6 4 2 0 19 17 15 13 11 9 7 5 3 1 */ { 0.00288463683916, 0.00012025322027, 0.00306428023191, 0.00594298065125, 
-0.02074045215285, 0.02161526843274, -0.01655260341619, -0.00009291677959, -0.00123395316851, -0.02160367184185, 0.03857599435200, 0.13919314567432, -0.86984376593551, 2.75465861874613, -5.87257861775999, 9.48293806319790,-12.28759895145294, 13.05504219327545,-11.34170355132042, 7.81501653005538, -3.84664617118067}, @@ -133,7 +197,8 @@ static const Float_t ABButter[9][multipl {0.96009142950541, 0.92177618768381, -1.92018285901082, -1.91858953033784, 0.96009142950541}, {0.95856916599601, 0.91885558323625, -1.91713833199203, -1.91542108074780, 0.95856916599601}, {0.94597685600279, 0.89487434461664, -1.89195371200558, -1.88903307939452, 0.94597685600279} -}; + }; +#endif /*lint -restore */ @@ -143,6 +208,128 @@ static const Float_t ABButter[9][multipl /* When calling this procedure, make sure that ip[-order] and op[-order] point to real data! */ +#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) +static void +filterYule(const Float_t * input, Float_t * output, size_t nSamples, const Float_t * const kernel) +{ + __m128 v1, v2, v3, v4, v5, v6, v7, v8; + __asm__ __volatile__ ( + "movups -12(%8), %0 \n\t" + "movups -28(%8), %1 \n\t" + "movlps -36(%8), %2 \n\t" + "movups -16(%9), %3 \n\t" + "movups -32(%9), %4 \n\t" + "movlps -40(%9), %5 \n\t" + "movaps %0, %6 \n\t" + "movaps %1, %7 \n\t" + "unpckhps %3, %0 \n\t" + "unpckhps %4, %1 \n\t" + "shufps $0x4e, %0, %0 \n\t" + "shufps $0x4e, %1, %1 \n\t" + "unpcklps %3, %6 \n\t" + "unpcklps %4, %7 \n\t" + "shufps $0x4e, %6, %6 \n\t" + "shufps $0x4e, %7, %7 \n\t" + "unpcklps %5, %2 \n\t" + "shufps $0x4e, %2, %2 \n\t" + "movss -40(%8), %3 \n\t" + "jmp 2f \n\t" + "1: \n\t" + "movhlps %2, %3 \n\t" + "movaps %7, %5 \n\t" + "shufps $0x4e, %2, %5 \n\t" + "movaps %5, %2 \n\t" + "movaps %1, %5 \n\t" + "shufps $0x4e, %7, %5 \n\t" + "movaps %5, %7 \n\t" + "movaps %6, %5 \n\t" + "shufps $0x4e, %1, %5 \n\t" + "movaps %5, %1 \n\t" + "movaps %0, %5 \n\t" + "shufps $0x4e, %6, %5 \n\t" + "movaps %5, %6 \n\t" + "movss (%8), %5 
\n\t" + "shufps $0x00, %5, %4 \n\t" + "shufps $0x42, %0, %4 \n\t" + "movaps %4, %0 \n\t" + "2: \n\t" + "movaps %0, %4 \n\t" + "movaps %6, %5 \n\t" + "mulps (%11), %4 \n\t" + "mulps 16(%11), %5 \n\t" + "addps %5, %4 \n\t" + "movaps %1, %5 \n\t" + "mulps 32(%11), %5 \n\t" + "addps %5, %4 \n\t" + "movaps %7, %5 \n\t" + "mulps 48(%11), %5 \n\t" + "addps %5, %4 \n\t" + "movaps %2, %5 \n\t" + "mulps 64(%11), %5 \n\t" + "addps %5, %4 \n\t" + "mulps 80(%11), %3 \n\t" + "addps %3, %4 \n\t" + "movhlps %4, %5 \n\t" + "addps %5, %4 \n\t" +#if defined(__SSE3__) + "hsubps %4, %4 \n\t" +#else + "movaps %4, %5 \n\t" + "shufps $0x01, %5, %5 \n\t" + "subps %5, %4 \n\t" +#endif + "movss %4, (%9) \n\t" + "add $4, %8 \n\t" + "add $4, %9 \n\t" + "dec %10 \n\t" + "jnz 1b \n\t" + : "=x" (v1), "=x" (v2), "=x" (v3), "=x" (v4), "=x" (v5), "=x" (v6), "=x" (v7), "=x" (v8), + "+r" (input), "+r" (output), "+r" (nSamples) + : "r" (kernel) : "memory" + ); +} + +static void +filterButter(const Float_t * input, Float_t * output, size_t nSamples, const Float_t * const kernel) +{ + __m128 v1, v2, v3, v4, v5; + __asm__ __volatile__ ( + "movlps -4(%5), %0 \n\t" + "movlps -8(%6), %2 \n\t" + "unpcklps %2, %0 \n\t" + "shufps $0x4e, %0, %0 \n\t" + "movss -8(%5), %4 \n\t" + "movaps %0, %1 \n\t" + "jmp 2f \n\t" + "1: \n\t" + "movhlps %0, %4 \n\t" + "movss (%5), %2 \n\t" + "shufps $0x00, %2, %1 \n\t" + "shufps $0x42, %0, %1 \n\t" + "movaps %1, %0 \n\t" + "2: \n\t" + "mulps (%8), %1 \n\t" + "mulps 16(%8), %4 \n\t" + "addps %4, %1 \n\t" + "movhlps %1, %2 \n\t" + "addps %2, %1 \n\t" +#if defined(__SSE3__) + "hsubps %1, %1 \n\t" +#else + "movaps %1, %2 \n\t" + "shufps $0x01, %2, %2 \n\t" + "subps %2, %1 \n\t" +#endif + "movss %1, (%6) \n\t" + "add $4, %5 \n\t" + "add $4, %6 \n\t" + "dec %7 \n\t" + "jnz 1b \n\t" + : "=x" (v1), "=x" (v2), "=x" (v3), "=x" (v4), "=x" (v5), "+r" (input), "+r" (output), "+r" (nSamples) + : "r" (kernel) : "memory" + ); +} +#else static void filterYule(const Float_t * input, Float_t * output, size_t nSamples, 
const Float_t * const kernel) { @@ -188,6 +375,7 @@ filterButter(const Float_t * input, Floa ++input; } } +#endif --- libmp3lame/l3side.h.orig 2012-02-07 22:36:35.000000000 +0900 +++ libmp3lame/l3side.h 2017-10-14 12:03:44.000000000 +0900 @@ -46,7 +46,7 @@ typedef struct { typedef struct { FLOAT xr[576]; - int l3_enc[576]; + int l3_enc[576] __attribute__ ((aligned (16))); int scalefac[SFBMAX]; FLOAT xrpow_max; @@ -84,7 +84,7 @@ typedef struct { } gr_info; typedef struct { - gr_info tt[2][2]; + gr_info tt[2][2] __attribute__ ((aligned (16))); int main_data_begin; int private_bits; int resvDrain_pre; --- libmp3lame/lame.c.orig 2017-10-11 04:08:39.000000000 +0900 +++ libmp3lame/lame.c 2017-10-14 12:03:44.000000000 +0900 @@ -2364,7 +2364,7 @@ lame_init_internal_flags(lame_internal_f gfc->ov_rpg.noclipGainChange = 0; gfc->ov_rpg.noclipScale = -1.0; - gfc->ATH = lame_calloc(ATH_t, 1); + gfc->ATH = calloc_aligned16(1, sizeof(ATH_t)); if (NULL == gfc->ATH) return -2; /* maybe error codes should be enumerated in lame.h ?? 
*/ @@ -2455,7 +2455,7 @@ lame_init_old(lame_global_flags * gfp) gfp->report.errorf = &lame_report_def; gfp->report.msgf = &lame_report_def; - gfp->internal_flags = lame_calloc(lame_internal_flags, 1); + gfp->internal_flags = calloc_aligned16(1, sizeof(lame_internal_flags)); if (lame_init_internal_flags(gfp->internal_flags) < 0) { freegfc(gfp->internal_flags); --- libmp3lame/newmdct.c.orig 2011-05-08 01:05:17.000000000 +0900 +++ libmp3lame/newmdct.c 2017-10-14 12:03:44.000000000 +0900 @@ -36,10 +36,13 @@ #include "util.h" #include "newmdct.h" +#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) +#include <xmmintrin.h> +#endif #ifndef USE_GOGO_SUBBAND -static const FLOAT enwindow[] = { +static const FLOAT enwindow[] __attribute__ ((aligned (16))) = { -4.77e-07 * 0.740951125354959 / 2.384e-06, 1.03951e-04 * 0.740951125354959 / 2.384e-06, 9.53674e-04 * 0.740951125354959 / 2.384e-06, 2.841473e-03 * 0.740951125354959 / 2.384e-06, 3.5758972e-02 * 0.740951125354959 / 2.384e-06, 3.401756e-03 * 0.740951125354959 / 2.384e-06, 9.83715e-04 * 0.740951125354959 / 2.384e-06, 9.9182e-05 * 0.740951125354959 / 2.384e-06, /* 15 */ @@ -435,6 +438,241 @@ window_subband(const sample_t * x1, FLOA const sample_t *x2 = &x1[238 - 14 - 286]; +#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) + __m128 v1, v2, v3, v4, v5, v6, v7, v8; + i=4; + __asm__ __volatile__ ( + "1: \n\t" + "movaps -40(%12), %0 \n\t" + "movups 32(%12), %1 \n\t" + "movaps 104(%12), %2 \n\t" + "movups 176(%12), %3 \n\t" + "movaps %0, %4 \n\t" + "movaps %2, %5 \n\t" + "unpcklps %1, %0 \n\t" + "unpcklps %3, %2 \n\t" + "unpckhps %1, %4 \n\t" + "unpckhps %3, %5 \n\t" + "movaps %0, %1 \n\t" + "movlhps %2, %0 \n\t" + "movhlps %1, %2 \n\t" + "movaps %4, %1 \n\t" + "movlhps %5, %1 \n\t" + "movhlps %4, %5 \n\t" + "movaps %5, %3 \n\t" + "movups 884(%9), %6 \n\t" + "movups -896(%10), %7 \n\t" + "shufps $0x1b, %6, %6 \n\t" + "mulps %0, %6 \n\t" + "mulps %0, %7 \n\t" + "movups 628(%9), %4 \n\t" + "movups -640(%10), %5 
\n\t" + "shufps $0x1b, %4, %4 \n\t" + "mulps %2, %4 \n\t" + "mulps %2, %5 \n\t" + "addps %4, %6 \n\t" + "addps %5, %7 \n\t" + "movups 372(%9), %4 \n\t" + "movups -384(%10), %5 \n\t" + "shufps $0x1b, %4, %4 \n\t" + "mulps %1, %4 \n\t" + "mulps %1, %5 \n\t" + "addps %4, %6 \n\t" + "addps %5, %7 \n\t" + "movups 116(%9), %4 \n\t" + "movups -128(%10), %5 \n\t" + "shufps $0x1b, %4, %4 \n\t" + "mulps %3, %4 \n\t" + "mulps %3, %5 \n\t" + "addps %4, %6 \n\t" + "addps %5, %7 \n\t" + + "movaps -24(%12), %0 \n\t" + "movups 48(%12), %1 \n\t" + "movaps 120(%12), %2 \n\t" + "movups 192(%12), %3 \n\t" + "movaps %0, %4 \n\t" + "movaps %2, %5 \n\t" + "unpcklps %1, %0 \n\t" + "unpcklps %3, %2 \n\t" + "unpckhps %1, %4 \n\t" + "unpckhps %3, %5 \n\t" + "movaps %0, %1 \n\t" + "movlhps %2, %0 \n\t" + "movhlps %1, %2 \n\t" + "movaps %4, %1 \n\t" + "movlhps %5, %1 \n\t" + "movhlps %4, %5 \n\t" + "movaps %5, %3 \n\t" + "movups -140(%9), %4 \n\t" + "movups 128(%10), %5 \n\t" + "shufps $0x1b, %4, %4 \n\t" + "mulps %0, %4 \n\t" + "mulps %0, %5 \n\t" + "addps %4, %6 \n\t" + "addps %5, %7 \n\t" + "movups -396(%9), %4 \n\t" + "movups 384(%10), %5 \n\t" + "shufps $0x1b, %4, %4 \n\t" + "mulps %2, %4 \n\t" + "mulps %2, %5 \n\t" + "addps %4, %6 \n\t" + "addps %5, %7 \n\t" + "movups -652(%9), %4 \n\t" + "movups 640(%10), %5 \n\t" + "shufps $0x1b, %4, %4 \n\t" + "mulps %1, %4 \n\t" + "mulps %1, %5 \n\t" + "addps %4, %6 \n\t" + "addps %5, %7 \n\t" + "movups -908(%9), %4 \n\t" + "movups 896(%10), %5 \n\t" + "shufps $0x1b, %4, %4 \n\t" + "mulps %3, %4 \n\t" + "mulps %3, %5 \n\t" + "addps %4, %6 \n\t" + "addps %5, %7 \n\t" + + "movaps -8(%12), %0 \n\t" + "movups 64(%12), %1 \n\t" + "movaps 136(%12), %2 \n\t" + "movups 208(%12), %3 \n\t" + "movaps %0, %4 \n\t" + "movaps %2, %5 \n\t" + "unpcklps %1, %0 \n\t" + "unpcklps %3, %2 \n\t" + "unpckhps %1, %4 \n\t" + "unpckhps %3, %5 \n\t" + "movaps %0, %1 \n\t" + "movlhps %2, %0 \n\t" + "movhlps %1, %2 \n\t" + "movaps %4, %1 \n\t" + "movlhps %5, %1 \n\t" + "movhlps 
%4, %5 \n\t" + "movaps %5, %3 \n\t" + "movups -1036(%9), %4 \n\t" + "movups 1024(%10), %5 \n\t" + "shufps $0x1b, %4, %4 \n\t" + "mulps %0, %4 \n\t" + "mulps %0, %5 \n\t" + "addps %4, %7 \n\t" + "subps %5, %6 \n\t" + "movups -780(%9), %4 \n\t" + "movups 768(%10), %5 \n\t" + "shufps $0x1b, %4, %4 \n\t" + "mulps %2, %4 \n\t" + "mulps %2, %5 \n\t" + "addps %4, %7 \n\t" + "subps %5, %6 \n\t" + "movups -524(%9), %4 \n\t" + "movups 512(%10), %5 \n\t" + "shufps $0x1b, %4, %4 \n\t" + "mulps %1, %4 \n\t" + "mulps %1, %5 \n\t" + "addps %4, %7 \n\t" + "subps %5, %6 \n\t" + "movups -268(%9), %4 \n\t" + "movups 256(%10), %5 \n\t" + "shufps $0x1b, %4, %4 \n\t" + "mulps %3, %4 \n\t" + "mulps %3, %5 \n\t" + "addps %4, %7 \n\t" + "subps %5, %6 \n\t" + + "movaps 8(%12), %0 \n\t" + "movups 80(%12), %1 \n\t" + "movaps 152(%12), %2 \n\t" + "movups 224(%12), %3 \n\t" + "movaps %0, %4 \n\t" + "movaps %2, %5 \n\t" + "unpcklps %1, %0 \n\t" + "unpcklps %3, %2 \n\t" + "unpckhps %1, %4 \n\t" + "unpckhps %3, %5 \n\t" + "movaps %0, %1 \n\t" + "movlhps %2, %0 \n\t" + "movhlps %1, %2 \n\t" + "movaps %4, %1 \n\t" + "movlhps %5, %1 \n\t" + "movhlps %4, %5 \n\t" + "movaps %5, %3 \n\t" + "movups -12(%9), %4 \n\t" + "movups (%10), %5 \n\t" + "shufps $0x1b, %4, %4 \n\t" + "mulps %0, %4 \n\t" + "mulps %0, %5 \n\t" + "addps %4, %7 \n\t" + "subps %5, %6 \n\t" + "movups 244(%9), %4 \n\t" + "movups -256(%10), %5 \n\t" + "shufps $0x1b, %4, %4 \n\t" + "mulps %2, %4 \n\t" + "mulps %2, %5 \n\t" + "addps %4, %7 \n\t" + "subps %5, %6 \n\t" + "movups 500(%9), %4 \n\t" + "movups -512(%10), %5 \n\t" + "shufps $0x1b, %4, %4 \n\t" + "mulps %1, %4 \n\t" + "mulps %1, %5 \n\t" + "addps %4, %7 \n\t" + "subps %5, %6 \n\t" + "movups 756(%9), %4 \n\t" + "movups -768(%10), %5 \n\t" + "shufps $0x1b, %4, %4 \n\t" + "mulps %3, %4 \n\t" + "mulps %3, %5 \n\t" + "addps %4, %7 \n\t" + "subps %5, %6 \n\t" + + "movlps 24(%12), %0 \n\t" + "movlps 96(%12), %1 \n\t" + "movlps 168(%12), %2 \n\t" + "movlps 240(%12), %3 \n\t" + "unpcklps %1, 
%0 \n\t" + "unpcklps %3, %2 \n\t" + "movaps %0, %1 \n\t" + "movlhps %2, %0 \n\t" + "movhlps %1, %2 \n\t" + "mulps %0, %7 \n\t" + "movaps %6, %4 \n\t" + "subps %7, %4 \n\t" + "addps %7, %6 \n\t" + "mulps %2, %4 \n\t" + "movaps %6, %5 \n\t" + "unpcklps %4, %6 \n\t" + "unpckhps %4, %5 \n\t" + "movups %6, (%11) \n\t" + "movups %5, 16(%11) \n\t" + +#if defined(__x86_64__) + "subq $16, %9 \n\t" + "addq $16, %10 \n\t" + "addq $288, %12 \n\t" + "addq $32, %11 \n\t" + "decl %8 \n\t" + "jnz 1b \n\t" + "addq $4, %9 \n\t" + "subq $4, %10 \n\t" + "subq $128, %11 \n\t" +#else + "subl $16, %9 \n\t" + "addl $16, %10 \n\t" + "addl $288, %12 \n\t" + "addl $32, %11 \n\t" + "decl %8 \n\t" + "jnz 1b \n\t" + "addl $4, %9 \n\t" + "subl $4, %10 \n\t" + "subl $128, %11 \n\t" +#endif + : "=x" (v1), "=x" (v2), "=x" (v3), "=x" (v4), "=x" (v5), "=x" (v6), "=x" (v7), "=x" (v8), + "+r" (i), "+r" (x1), "+r" (x2), "+r" (a), "+r" (wp) + : + : "memory" + ); + wp = enwindow + 280; +#else for (i = -15; i < 0; i++) { FLOAT w, s, t; @@ -501,6 +739,7 @@ window_subband(const sample_t * x1, FLOA x1--; x2++; } +#endif { FLOAT s, t, u, v; t = x1[-16] * wp[-10]; --- libmp3lame/psymodel.c.orig 2017-09-07 04:38:23.000000000 +0900 +++ libmp3lame/psymodel.c 2017-10-14 12:03:44.000000000 +0900 @@ -155,6 +155,9 @@ blocktype_d[2] block type to use #include "fft.h" #include "lame-analysis.h" +#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) +#include <xmmintrin.h> +#endif #define NSFIRLEN 21 @@ -218,10 +221,58 @@ psycho_loudness_approx(FLOAT const *ener int i; FLOAT loudness_power; +#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) + __m128 v1, v2, v3, v4, v5, v6, v7, v8; + i = 32; + __asm__ __volatile__ ( + "xorps %0, %0 \n\t" + "xorps %1, %1 \n\t" + "xorps %2, %2 \n\t" + "xorps %3, %3 \n\t" + "1: \n\t" + "movaps (%9), %4 \n\t" + "movaps 16(%9), %5 \n\t" + "movaps 32(%9), %6 \n\t" + "movaps 48(%9), %7 \n\t" + "mulps (%10), %4 \n\t" + "mulps 16(%10), %5 \n\t" + "mulps 32(%10), %6 \n\t" + "mulps 
48(%10), %7 \n\t" + "addps %4, %0 \n\t" + "addps %5, %1 \n\t" + "addps %6, %2 \n\t" + "addps %7, %3 \n\t" +#if defined(__x86_64__) + "addq $64, %9 \n\t" + "addq $64, %10 \n\t" +#else + "addl $64, %9 \n\t" + "addl $64, %10 \n\t" +#endif + "decl %8 \n\t" + "jnz 1b \n\t" + "addps %1, %0 \n\t" + "addps %3, %2 \n\t" + "addps %2, %0 \n\t" + "movhlps %0, %1 \n\t" + "addps %1, %0 \n\t" +#if defined(__SSE3__) + "haddps %0, %0 \n\t" +#else + "movaps %0, %1 \n\t" + "shufps $0x01, %1, %1 \n\t" + "addps %1, %0 \n\t" +#endif + : "=x" (v1), "=x" (v2), "=x" (v3), "=x" (v4), "=x" (v5), "=x" (v6), "=x" (v7), "=x" (v8), + "+r" (i), "+r" (eql_w), "+r" (energy) + : : "memory" ); + _mm_store_ss(&loudness_power, v1); +#else loudness_power = 0.0; /* apply weights to power in freq. bands */ for (i = 0; i < BLKSIZE / 2; ++i) loudness_power += energy[i] * eql_w[i]; +#endif loudness_power *= VO_SCALE; return loudness_power; @@ -666,6 +717,9 @@ static void vbrpsy_compute_fft_l(lame_internal_flags * gfc, const sample_t * const buffer[2], int chn, int gr_out, FLOAT fftenergy[HBLKSIZE], FLOAT(*wsamp_l)[BLKSIZE]) { +#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) + __m128 v1, v2, v3, v4, v5, v6, v7; +#endif SessionConfig_t const *const cfg = &gfc->cfg; PsyStateVar_t *psv = &gfc->sv_psy; plotting_data *plt = cfg->analysis ? 
gfc->pinfo : 0; @@ -676,6 +730,47 @@ vbrpsy_compute_fft_l(lame_internal_flags } else if (chn == 2) { FLOAT const sqrt2_half = SQRT2 * 0.5f; +#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) + FLOAT *wsamp_lp1 = *wsamp_l; + FLOAT *wsamp_lp2 = *wsamp_l+1024; + j = 128; + __asm__ __volatile__ ( + "movss (%10), %6 \n\t" + "shufps $0x00, %6, %6 \n\t" + "1: \n\t" + "movaps (%8), %0 \n\t" + "movaps 16(%8), %1 \n\t" + "movaps (%9), %2 \n\t" + "movaps 16(%9), %3 \n\t" + "movaps %0, %4 \n\t" + "movaps %1, %5 \n\t" + "addps %2, %0 \n\t" + "addps %3, %1 \n\t" + "subps %2, %4 \n\t" + "subps %3, %5 \n\t" + "mulps %6, %0 \n\t" + "mulps %6, %1 \n\t" + "mulps %6, %4 \n\t" + "mulps %6, %5 \n\t" + "movaps %0, (%8) \n\t" + "movaps %1, 16(%8) \n\t" + "movaps %4, 0(%9) \n\t" + "movaps %5, 16(%9) \n\t" +#if defined(__x86_64__) + "addq $32, %8 \n\t" + "addq $32, %9 \n\t" +#else + "addl $32, %8 \n\t" + "addl $32, %9 \n\t" +#endif + "decl %7 \n\t" + "jnz 1b \n\t" + : "=x" (v1), "=x" (v2), "=x" (v3), "=x" (v4), "=x" (v5), "=x" (v6), "=x" (v7), + "+r" (j), "+r" (wsamp_lp1), "+r" (wsamp_lp2) + : "r" (&sqrt2_half) + : "memory" + ); +#else /* FFT data for mid and side channel is derived from L & R */ for (j = BLKSIZE - 1; j >= 0; --j) { FLOAT const l = wsamp_l[0][j]; @@ -683,6 +778,7 @@ vbrpsy_compute_fft_l(lame_internal_flags wsamp_l[0][j] = (l + r) * sqrt2_half; wsamp_l[1][j] = (l - r) * sqrt2_half; } +#endif } /********************************************************************* @@ -691,6 +787,73 @@ vbrpsy_compute_fft_l(lame_internal_flags fftenergy[0] = wsamp_l[0][0]; fftenergy[0] *= fftenergy[0]; +#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) + FLOAT *wsamp_lp1 = *wsamp_l+1; + FLOAT *wsamp_lp2 = *wsamp_l+1020; + FLOAT *fftenergyp = fftenergy+1; + j = 64; + __asm__ __volatile__ ( + "pcmpeqd %4, %4 \n\t" + "psrld $26, %4 \n\t" + "pslld $24, %4 \n\t" + "xorps %5, %5 \n\t" + "1: \n\t" + "movups (%7), %0 \n\t" + "movups 16(%7), %1 \n\t" + "movaps (%8), 
%2 \n\t" + "movaps -16(%8), %3 \n\t" + "shufps $0x1b, %2, %2 \n\t" + "shufps $0x1b, %3, %3 \n\t" + "mulps %0, %0 \n\t" + "mulps %2, %2 \n\t" + "mulps %1, %1 \n\t" + "mulps %3, %3 \n\t" + "addps %2, %0 \n\t" + "addps %3, %1 \n\t" + "mulps %4, %0 \n\t" + "mulps %4, %1 \n\t" + "movups %0, (%9) \n\t" + "movups %1, 16(%9) \n\t" + "addps %1, %0 \n\t" + "addps %0, %5 \n\t" +#if defined(__x86_64__) + "addq $32, %7 \n\t" + "subq $32, %8 \n\t" + "addq $32, %9 \n\t" + "decl %6 \n\t" + "jnz 1b \n\t" + "subq $2048, %9 \n\t" +#else + "addl $32, %7 \n\t" + "subl $32, %8 \n\t" + "addl $32, %9 \n\t" + "decl %6 \n\t" + "jnz 1b \n\t" + "subl $2048, %9 \n\t" +#endif + "movups (%9), %0 \n\t" + "movups 16(%9), %1 \n\t" + "xorps %2, %2 \n\t" + "movlps 32(%9), %2 \n\t" + "addps %1, %0 \n\t" + "addps %2, %0 \n\t" + "subps %0, %5 \n\t" + "movhlps %5, %0 \n\t" + "addps %5, %0 \n\t" +#if defined(__SSE3__) + "haddps %0, %0 \n\t" +#else + "movaps %0, %1 \n\t" + "shufps $0x01, %1, %1 \n\t" + "addps %1, %0 \n\t" +#endif + "movss %0, (%10) \n\t" + : "=x" (v1), "=x" (v2), "=x" (v3), "=x" (v4), "=x" (v5), "=x" (v6), + "+r" (j), "+r" (wsamp_lp1), "+r" (wsamp_lp2), "+r" (fftenergyp) + : "r" (&psv->tot_ener[chn]) + : "memory" + ); +#else for (j = BLKSIZE / 2 - 1; j >= 0; --j) { FLOAT const re = (*wsamp_l)[BLKSIZE / 2 - j]; FLOAT const im = (*wsamp_l)[BLKSIZE / 2 + j]; @@ -704,6 +867,7 @@ vbrpsy_compute_fft_l(lame_internal_flags psv->tot_ener[chn] = totalenergy; } +#endif if (plt) { for (j = 0; j < HBLKSIZE; j++) { @@ -772,7 +936,7 @@ vbrpsy_attack_detection(lame_internal_fl FLOAT energy[4], FLOAT sub_short_factor[4][3], int ns_attacks[4][4], int uselongblock[2]) { - FLOAT ns_hpfsmpl[2][576]; + FLOAT ns_hpfsmpl[2][576] __attribute__ ((aligned (16))); SessionConfig_t const *const cfg = &gfc->cfg; PsyStateVar_t *const psv = &gfc->sv_psy; plotting_data *plt = cfg->analysis ? 
gfc->pinfo : 0; @@ -785,14 +949,170 @@ vbrpsy_attack_detection(lame_internal_fl /* Don't copy the input buffer into a temporary buffer */ /* unroll the loop 2 times */ for (chn = 0; chn < n_chn_out; chn++) { - static const FLOAT fircoef[] = { + static const FLOAT fircoef[] __attribute__ ((aligned (16))) = { -8.65163e-18 * 2, -0.00851586 * 2, -6.74764e-18 * 2, 0.0209036 * 2, -3.36639e-17 * 2, -0.0438162 * 2, -1.54175e-17 * 2, 0.0931738 * 2, - -5.52212e-17 * 2, -0.313819 * 2 + -5.52212e-17 * 2, -0.313819 * 2, 0, 0 }; /* apply high pass filter of fs/4 */ const sample_t *const firbuf = &buffer[chn][576 - 350 - NSFIRLEN + 192]; - assert(dimension_of(fircoef) == ((NSFIRLEN - 1) / 2)); + //assert(dimension_of(fircoef) == ((NSFIRLEN - 1) / 2)); +#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) + __m128 v1, v2, v3, v4, v5, v6, v7; + float *firbufp = (float *)firbuf; + float *ns_hpfsmplp = &ns_hpfsmpl[chn][0]; + i = 144; + __asm__ __volatile__ ( + "1: \n\t" + "movups 40(%8), %0 \n\t" + "xorps %1, %1 \n\t" + "movaps %0, %2 \n\t" + "unpcklps %1, %0 \n\t" + "unpckhps %1, %2 \n\t" + "movaps %2, %1 \n\t" + + "movaps (%10), %2 \n\t" + "movups (%8), %3 \n\t" + "movups 72(%8), %4 \n\t" + "shufps $0x1b, %4, %4 \n\t" + "addps %4, %3 \n\t" + "mulps %3, %2 \n\t" + "movaps 16(%10), %5 \n\t" + "movups 16(%8), %3 \n\t" + "movups 56(%8), %4 \n\t" + "shufps $0x1b, %4, %4 \n\t" + "addps %4, %3 \n\t" + "mulps %3, %5 \n\t" + "addps %5, %2 \n\t" + "movaps 32(%10), %5 \n\t" + "movups 32(%8), %3 \n\t" + "movups 40(%8), %4 \n\t" + "shufps $0x1b, %4, %4 \n\t" + "addps %4, %3 \n\t" + "mulps %3, %5 \n\t" + "addps %5, %2 \n\t" + "movhlps %2, %6 \n\t" + "addps %2, %6 \n\t" +#if defined(__x86_64__) + "addq $4, %8 \n\t" +#else + "addl $4, %8 \n\t" +#endif + + "movaps (%10), %2 \n\t" + "movups (%8), %3 \n\t" + "movups 72(%8), %4 \n\t" + "shufps $0x1b, %4, %4 \n\t" + "addps %4, %3 \n\t" + "mulps %3, %2 \n\t" + "movaps 16(%10), %5 \n\t" + "movups 16(%8), %3 \n\t" + "movups 56(%8), %4 \n\t" 
+ "shufps $0x1b, %4, %4 \n\t" + "addps %4, %3 \n\t" + "mulps %3, %5 \n\t" + "addps %5, %2 \n\t" + "movaps 32(%10), %5 \n\t" + "movups 32(%8), %3 \n\t" + "movups 40(%8), %4 \n\t" + "shufps $0x1b, %4, %4 \n\t" + "addps %4, %3 \n\t" + "mulps %3, %5 \n\t" + "addps %5, %2 \n\t" + "movhlps %2, %3 \n\t" + "addps %3, %2 \n\t" + "movlhps %2, %6 \n\t" + "addps %6, %0 \n\t" +#if defined(__x86_64__) + "addq $4, %8 \n\t" +#else + "addl $4, %8 \n\t" +#endif + + "movaps (%10), %2 \n\t" + "movups (%8), %3 \n\t" + "movups 72(%8), %4 \n\t" + "shufps $0x1b, %4, %4 \n\t" + "addps %4, %3 \n\t" + "mulps %3, %2 \n\t" + "movaps 16(%10), %5 \n\t" + "movups 16(%8), %3 \n\t" + "movups 56(%8), %4 \n\t" + "shufps $0x1b, %4, %4 \n\t" + "addps %4, %3 \n\t" + "mulps %3, %5 \n\t" + "addps %5, %2 \n\t" + "movaps 32(%10), %5 \n\t" + "movups 32(%8), %3 \n\t" + "movups 40(%8), %4 \n\t" + "shufps $0x1b, %4, %4 \n\t" + "addps %4, %3 \n\t" + "mulps %3, %5 \n\t" + "addps %5, %2 \n\t" + "movhlps %2, %6 \n\t" + "addps %2, %6 \n\t" +#if defined(__x86_64__) + "addq $4, %8 \n\t" +#else + "addl $4, %8 \n\t" +#endif + + "movaps (%10), %2 \n\t" + "movups (%8), %3 \n\t" + "movups 72(%8), %4 \n\t" + "shufps $0x1b, %4, %4 \n\t" + "addps %4, %3 \n\t" + "mulps %3, %2 \n\t" + "movaps 16(%10), %5 \n\t" + "movups 16(%8), %3 \n\t" + "movups 56(%8), %4 \n\t" + "shufps $0x1b, %4, %4 \n\t" + "addps %4, %3 \n\t" + "mulps %3, %5 \n\t" + "addps %5, %2 \n\t" + "movaps 32(%10), %5 \n\t" + "movups 32(%8), %3 \n\t" + "movups 40(%8), %4 \n\t" + "shufps $0x1b, %4, %4 \n\t" + "addps %4, %3 \n\t" + "mulps %3, %5 \n\t" + "addps %5, %2 \n\t" + "movhlps %2, %3 \n\t" + "addps %3, %2 \n\t" + "movlhps %2, %6 \n\t" + "addps %6, %1 \n\t" +#if defined(__x86_64__) + "addq $4, %8 \n\t" +#else + "addl $4, %8 \n\t" +#endif + +#if defined(__SSE3__) + "haddps %1, %0 \n\t" +#else + "movaps %0, %2 \n\t" + "movaps %1, %3 \n\t" + "shufps $0x31, %2, %2 \n\t" + "shufps $0x31, %3, %3 \n\t" + "addps %2, %0 \n\t" + "addps %3, %1 \n\t" + "shufps $0x88, %1, %0 
\n\t" +#endif + "movaps %0, (%9) \n\t" + +#if defined(__x86_64__) + "addq $16, %9 \n\t" +#else + "addl $16, %9 \n\t" +#endif + "decl %7 \n\t" + "jnz 1b \n\t" + : "=x" (v1), "=x" (v2), "=x" (v3), "=x" (v4), "=x" (v5), "=x" (v6), "=x" (v7), + "+r" (i), "+r" (firbufp), "+r" (ns_hpfsmplp) + : "r" (fircoef) + : "memory" + ); +#else for (i = 0; i < 576; i++) { FLOAT sum1, sum2; sum1 = firbuf[i + 10]; @@ -803,6 +1123,7 @@ vbrpsy_attack_detection(lame_internal_fl } ns_hpfsmpl[chn][i] = sum1 + sum2; } +#endif masking_ratio[gr_out][chn].en = psv->en[chn]; masking_ratio[gr_out][chn].thm = psv->thm[chn]; if (n_chn_psy > 2) { @@ -1423,10 +1744,10 @@ L3psycho_anal_vbr(lame_internal_flags * /* fft and energy calculation */ FLOAT(*wsamp_l)[BLKSIZE]; FLOAT(*wsamp_s)[3][BLKSIZE_s]; - FLOAT fftenergy[HBLKSIZE]; - FLOAT fftenergy_s[3][HBLKSIZE_s]; - FLOAT wsamp_L[2][BLKSIZE]; - FLOAT wsamp_S[2][3][BLKSIZE_s]; + FLOAT fftenergy[HBLKSIZE] __attribute__ ((aligned (16))); + FLOAT fftenergy_s[3][HBLKSIZE_s] __attribute__ ((aligned (16))); + FLOAT wsamp_L[2][BLKSIZE] __attribute__ ((aligned (16))); + FLOAT wsamp_S[2][3][BLKSIZE_s] __attribute__ ((aligned (16))); FLOAT eb[4][CBANDS], thr[4][CBANDS]; FLOAT sub_short_factor[4][3]; --- libmp3lame/quantize.c.orig 2017-08-15 22:40:45.000000000 +0900 +++ libmp3lame/quantize.c 2017-10-14 12:03:44.000000000 +0900 @@ -99,7 +99,7 @@ init_xrpow_core_init(lame_internal_flags gfc->init_xrpow_core = init_xrpow_core_sse; #endif #ifndef HAVE_NASM -#ifdef MIN_ARCH_SSE +#if defined(MIN_ARCH_SSE) || defined(__x86_64__) gfc->init_xrpow_core = init_xrpow_core_sse; #endif #endif @@ -1495,7 +1495,7 @@ VBR_old_iteration_loop(lame_internal_fla EncResult_t *const eov = &gfc->ov_enc; FLOAT l3_xmin[2][2][SFBMAX]; - FLOAT xrpow[576]; + FLOAT xrpow[576] __attribute__ ((aligned (16))); int bands[2][2]; int frameBits[15]; int used_bits; @@ -1904,7 +1904,7 @@ ABR_iteration_loop(lame_internal_flags * SessionConfig_t const *const cfg = &gfc->cfg; EncResult_t *const eov = 
&gfc->ov_enc; FLOAT l3_xmin[SFBMAX]; - FLOAT xrpow[576]; + FLOAT xrpow[576] __attribute__ ((aligned (16))); int targ_bits[2][2]; int mean_bits, max_frame_bits; int ch, gr, ath_over; @@ -1991,7 +1991,7 @@ CBR_iteration_loop(lame_internal_flags * { SessionConfig_t const *const cfg = &gfc->cfg; FLOAT l3_xmin[SFBMAX]; - FLOAT xrpow[576]; + FLOAT xrpow[576] __attribute__ ((aligned (16))); int targ_bits[2]; int mean_bits, max_bits; int gr, ch; --- libmp3lame/quantize_pvt.c.orig 2017-09-07 04:33:36.000000000 +0900 +++ libmp3lame/quantize_pvt.c 2017-10-14 12:03:44.000000000 +0900 @@ -27,6 +27,7 @@ # include <config.h> #endif +#undef TAKEHIRO_IEEE754_HACK #include "lame.h" #include "machine.h" @@ -37,6 +38,9 @@ #include "lame-analysis.h" #include <float.h> +#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) +#include <xmmintrin.h> +#endif #define NSATHSCALE 100 /* Assuming dynamic range=96dB, this value should be 92 */ @@ -767,6 +771,70 @@ calc_noise_core_c(const gr_info * const } } else if (j > cod_info->big_values) { +#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) + __m128 v1, v2, v3, v4, v5; + int remaining = l & 1; + l = l >> 1; + const int *ixp = ix+j; + const FLOAT *xrp = cod_info->xr+j; + j += 4*l; + v5 = _mm_set_ss(step); + __asm__ __volatile__ ( + "xorps %3, %3 \n\t" + "testl %5, %5 \n\t" + "jz 2f \n\t" + "shufps $0x00, %4, %4 \n\t" + "pcmpeqd %1, %1 \n\t" + "psrld $1, %1 \n\t" + + "1: \n\t" + "pxor %0, %0 \n\t" + "movups (%6), %2 \n\t" + "pcmpeqd %0, %2 \n\t" + "pandn %4, %2 \n\t" + + "movups (%7), %0 \n\t" + "andps %1, %0 \n\t" + "subps %2, %0 \n\t" + "mulps %0, %0 \n\t" + "addps %0, %3 \n\t" + +#if defined(__x86_64__) + "addq $16, %6 \n\t" + "addq $16, %7 \n\t" +#else + "addl $16, %6 \n\t" + "addl $16, %7 \n\t" +#endif + "decl %5 \n\t" + "jnz 1b \n\t" + "movhlps %3, %0 \n\t" + "addps %0, %3 \n\t" +#if defined(__SSE3__) + "haddps %3, %3 \n\t" +#else + "movaps %3, %0 \n\t" + "shufps $0x01, %0, %0 \n\t" + "addps %0, %3 \n\t" +#endif + "2: \n\t" + : "=x" (v1), 
"=x" (v2), "=x" (v3), "=x" (v4), "+x" (v5), + "+r" (l), "+r" (ixp), "+r" (xrp) + ); + _mm_store_ss(&noise, v4); + if (remaining) { + FLOAT ix01[2]; + ix01[0] = 0; + ix01[1] = step; + FLOAT temp; + temp = fabs(cod_info->xr[j]) - ix01[ix[j]]; + j++; + noise += temp * temp; + temp = fabs(cod_info->xr[j]) - ix01[ix[j]]; + j++; + noise += temp * temp; + } +#else FLOAT ix01[2]; ix01[0] = 0; ix01[1] = step; @@ -779,8 +847,95 @@ calc_noise_core_c(const gr_info * const j++; noise += temp * temp; } +#endif } else { +#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) + __m128 v1, v2, v3, v4, v5, v6; + int remaining = l & 1; + l = l >> 1; +#if defined(_WIN64) + long long tmp; +#else + long tmp; +#endif + const int *ixp = ix+j; + const FLOAT *xrp = cod_info->xr+j; + j += 4*l; + v5 = _mm_set_ss(step); + __asm__ __volatile__ ( + "xorps %5, %5 \n\t" + "testl %6, %6 \n\t" + "jz 2f \n\t" + "shufps $0x00, %4, %4 \n\t" + "1: \n\t" + +#if defined(__x86_64__) + "movslq (%7), %9 \n\t" + "movss (%10,%9,4), %2 \n\t" + "movslq 4(%7), %9 \n\t" + "movss (%10,%9,4), %1 \n\t" + "movslq 8(%7), %9 \n\t" + "movss (%10,%9,4), %0 \n\t" + "movslq 12(%7), %9 \n\t" + "movss (%10,%9,4), %3 \n\t" +#else + "movl (%7), %9 \n\t" + "movss (%10,%9,4), %2 \n\t" + "movl 4(%7), %9 \n\t" + "movss (%10,%9,4), %1 \n\t" + "movl 8(%7), %9 \n\t" + "movss (%10,%9,4), %0 \n\t" + "movl 12(%7), %9 \n\t" + "movss (%10,%9,4), %3 \n\t" +#endif + "movlhps %1, %2 \n\t" + "movlhps %3, %0 \n\t" + "shufps $0x88, %0, %2 \n\t" + "mulps %4, %2 \n\t" + + "movups (%8), %0 \n\t" + "xorps %1, %1 \n\t" + "subps %0, %1 \n\t" + "maxps %1, %0 \n\t" + "subps %2, %0 \n\t" + "mulps %0, %0 \n\t" + "addps %0, %5 \n\t" + +#if defined(__x86_64__) + "addq $16, %7 \n\t" + "addq $16, %8 \n\t" +#else + "addl $16, %7 \n\t" + "addl $16, %8 \n\t" +#endif + "decl %6 \n\t" + "jnz 1b \n\t" + "movhlps %5, %0 \n\t" + "addps %0, %5 \n\t" +#if defined(__SSE3__) + "haddps %5, %5 \n\t" +#else + "movaps %5, %0 \n\t" + "shufps $0x01, %0, %0 \n\t" + 
"addps %0, %5 \n\t" +#endif + "2: \n\t" + : "=x" (v1), "=x" (v2), "=x" (v3), "=x" (v4), "+x" (v5), "=x" (v6), + "+r" (l), "+r" (ixp), "+r" (xrp), "=&r" (tmp) + : "r" (pow43) + ); + _mm_store_ss(&noise, v6); + if (remaining) { + FLOAT temp; + temp = fabs(cod_info->xr[j]) - pow43[ix[j]] * step; + j++; + noise += temp * temp; + temp = fabs(cod_info->xr[j]) - pow43[ix[j]] * step; + j++; + noise += temp * temp; + } +#else while (l--) { FLOAT temp; temp = fabs(cod_info->xr[j]) - pow43[ix[j]] * step; @@ -790,6 +945,7 @@ calc_noise_core_c(const gr_info * const j++; noise += temp * temp; } +#endif } *startline = j; --- libmp3lame/takehiro.c.orig 2017-09-07 04:33:36.000000000 +0900 +++ libmp3lame/takehiro.c 2017-10-14 12:03:44.000000000 +0900 @@ -26,6 +26,7 @@ # include #endif +#undef TAKEHIRO_IEEE754_HACK #include "lame.h" #include "machine.h" @@ -34,6 +35,9 @@ #include "quantize_pvt.h" #include "tables.h" +#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) +#include +#endif static const struct { const int region0_count; @@ -229,6 +233,57 @@ quantize_lines_xrpow(unsigned int l, FLO l = l >> 1; remaining = l % 2; l = l >> 1; +#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) + __m128 v1, v2, v3, v4, v5, v6; + v6 = _mm_set_ss(istep); +#if defined(_WIN64) + long long tmp; +#else + long tmp; +#endif + __asm__ __volatile__ ( + "testl %6, %6 \n\t" + "jz 2f \n\t" + "shufps $0x00, %5, %5 \n\t" + "1: \n\t" + "movups (%7), %0 \n\t" + "mulps %5, %0 \n\t" + + "cvttss2si %0, %9 \n\t" + "movaps %0, %1 \n\t" + "shufps $0xe5, %1, %1 \n\t" + "movss (%10,%9,4), %2 \n\t" + "cvttss2si %1, %9 \n\t" + "movhlps %1, %1 \n\t" + "movss (%10,%9,4), %3 \n\t" + "cvttss2si %1, %9 \n\t" + "shufps $0x55, %1, %1 \n\t" + "movss (%10,%9,4), %4 \n\t" + "cvttss2si %1, %9 \n\t" + "movlhps %3, %2 \n\t" + "movss (%10,%9,4), %3 \n\t" + "movlhps %3, %4 \n\t" + "shufps $0x88, %4, %2 \n\t" + "addps %2, %0 \n\t" + "cvttps2dq %0, %0 \n\t" + "movups %0, (%8) \n\t" + +#if 
defined(__x86_64__) + "addq $16, %7 \n\t" + "addq $16, %8 \n\t" +#else + "addl $16, %7 \n\t" + "addl $16, %8 \n\t" +#endif + "decl %6 \n\t" + "jnz 1b \n\t" + "2: \n\t" + : "=x" (v1), "=x" (v2), "=x" (v3), "=x" (v4), "=x" (v5), "+x" (v6), + "+r" (l), "+r" (xr), "+r" (ix), "=&r" (tmp) + : "r" (adj43) + : "memory" + ); +#else while (l--) { FLOAT x0, x1, x2, x3; int rx0, rx1, rx2, rx3; @@ -250,6 +305,7 @@ quantize_lines_xrpow(unsigned int l, FLO XRPOW_FTOI(x2, *ix++); XRPOW_FTOI(x3, *ix++); }; +#endif if (remaining) { FLOAT x0, x1; int rx0, rx1; @@ -423,6 +479,80 @@ quantize_xrpow(const FLOAT * xp, int *pi static int ix_max(const int *ix, const int *end) { +#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) + __m128 v1, v2, v3; + int max; + __asm__ __volatile__ ( + "pxor %2, %2 \n\t" +#if defined(__x86_64__) + "subq $8, %4 \n\t" + "cmpq %4, %3 \n\t" +#else + "subl $8, %4 \n\t" + "cmpl %4, %3 \n\t" +#endif + "je 2f \n\t" + "1: \n\t" + "movups (%3), %0 \n\t" +#if defined(__SSE4_1__) + "pmaxud %0, %2 \n\t" +#else + "movdqa %2, %1 \n\t" + "pcmpgtd %0, %2 \n\t" + "pand %2, %1 \n\t" + "pandn %0, %2 \n\t" + "por %1, %2 \n\t" +#endif +#if defined(__x86_64__) + "addq $16, %3 \n\t" + "cmpq %4, %3 \n\t" +#else + "addl $16, %3 \n\t" + "cmpl %4, %3 \n\t" +#endif + "jl 1b \n\t" + "jne 3f \n\t" + "2: \n\t" + "movq (%3), %0 \n\t" +#if defined(__SSE4_1__) + "pmaxud %0, %2 \n\t" +#else + "movdqa %2, %1 \n\t" + "pcmpgtd %0, %2 \n\t" + "pand %2, %1 \n\t" + "pandn %0, %2 \n\t" + "por %1, %2 \n\t" +#endif + "3: \n\t" +#if defined(__SSE4_1__) + "movdqa %2, %0 \n\t" + "psrldq $8, %2 \n\t" + "pmaxud %0, %2 \n\t" + "movdqa %2, %0 \n\t" + "psrldq $4, %2 \n\t" + "pmaxud %2, %0 \n\t" +#else + "movdqa %2, %0 \n\t" + "movdqa %2, %1 \n\t" + "psrldq $8, %2 \n\t" + "pcmpgtd %2, %0 \n\t" + "pand %0, %1 \n\t" + "pandn %2, %0 \n\t" + "por %1, %0 \n\t" + "movdqa %0, %2 \n\t" + "movdqa %0, %1 \n\t" + "psrldq $4, %2 \n\t" + "pcmpgtd %2, %0 \n\t" + "pand %0, %1 \n\t" + "pandn %2, %0 \n\t" +
"por %1, %0 \n\t" +#endif + "movd %0, %5 \n\t" + : "=x" (v1), "=x" (v2), "=x" (v3), + "+r" (ix), "+r" (end), "=r" (max) : /* no inputs */ : "memory" /* the asm loads ix[] through a register pointer */ + ); + return max; +#else int max1 = 0, max2 = 0; do { @@ -437,6 +567,7 @@ ix_max(const int *ix, const int *end) if (max1 < max2) max1 = max2; return max1; +#endif } @@ -447,12 +578,74 @@ ix_max(const int *ix, const int *end) static int -count_bit_ESC(const int *ix, const int *const end, int t1, const int t2, unsigned int *const s) +count_bit_ESC(const int *ix, const int *end, int t1, const int t2, unsigned int *const s) { /* ESC-table is used */ unsigned int const linbits = ht[t1].xlen * 65536u + ht[t2].xlen; unsigned int sum = 0, sum2; +#if defined(__GNUC__) && (defined(__x86_64__)) + unsigned int tmp; + static short mult[8] __attribute__ ((aligned (16))) = {16, 1, 16, 1, 16, 1, 16, 1}; + __asm__ __volatile__ ( + "movaps (%6), %%xmm4 \n\t" + "pcmpeqd %%xmm2, %%xmm2 \n\t" + "movdqa %%xmm2, %%xmm3 \n\t" + "psrlw $13, %%xmm2 \n\t" + "psllw $4, %%xmm3 \n\t" + "psllw $1, %%xmm2 \n\t" + "pxor %%xmm5, %%xmm5 \n\t" + "subq $8, %3 \n\t" + "cmpq %3, %0 \n\t" + "je 2f \n\t" + + "1: \n\t" + "movups (%0), %%xmm0 \n\t" + "packssdw %%xmm0, %%xmm0 \n\t" + "movdqa %%xmm0, %%xmm1 \n\t" + "paddusw %%xmm3, %%xmm1 \n\t" + "pcmpgtw %%xmm2, %%xmm0 \n\t" + "psubw %%xmm0, %%xmm5 \n\t" + "pmaddwd %%xmm4, %%xmm1 \n\t" + "movd %%xmm1, %2 \n\t" + "psrlq $32, %%xmm1 \n\t" + "cltq \n\t" + "addl 1088(%5,%%rax,4), %1 \n\t" + "movd %%xmm1, %2 \n\t" + "cltq \n\t" + "addl 1088(%5,%%rax,4), %1 \n\t" + "addq $16, %0 \n\t" + "cmpq %3, %0 \n\t" + "jl 1b \n\t" + "movdqa %%xmm5, %%xmm0 \n\t" + "psrlq $32, %%xmm0 \n\t" + "paddw %%xmm0, %%xmm5 \n\t" + "jne 3f \n\t" + + "2: \n\t" + "movq (%0), %%xmm0 \n\t" + "packssdw %%xmm0, %%xmm0 \n\t" + "movdqa %%xmm0, %%xmm1 \n\t" + "paddusw %%xmm3, %%xmm1 \n\t" + "pcmpgtw %%xmm2, %%xmm0 \n\t" + "psubw %%xmm0, %%xmm5 \n\t" + "pmaddwd %%xmm4, %%xmm1 \n\t" + "movd %%xmm1, %2 \n\t" + "cltq \n\t" + "addl 1088(%5,%%rax,4), %1 \n\t" + + "3: \n\t" + "movdqa
%%xmm5, %%xmm0 \n\t" + "psrld $16, %%xmm0 \n\t" + "paddw %%xmm5, %%xmm0 \n\t" + "pextrw $0, %%xmm0, %2 \n\t" + "imull %4, %2 \n\t" + "addl %2, %1 \n\t" + : "+r" (ix), "+r" (sum), "=&a" (tmp), "+r" (end) + : "r" (linbits), "r" (largetbl), "r" (mult) + : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "memory" /* the asm loads ix[], largetbl[] and mult[] through register pointers */ + ); +#else do { unsigned int x = *ix++; unsigned int y = *ix++; @@ -469,6 +662,7 @@ count_bit_ESC(const int *ix, const int * x += y; sum += largetbl[x]; } while (ix < end); +#endif sum2 = sum & 0xffffu; sum >>= 16u; @@ -790,10 +984,178 @@ count_bits(lame_internal_flags const *co j += width; } else { +#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) + __m128 v1, v2, v3, v4, v5; +#if defined(__x86_64__) +#if defined(_WIN64) + long long k = j; +#else + long k = j; +#endif + j += width; +#if defined(_WIN64) + long long l = j; +#else + long l = j; +#endif + v1 = _mm_set_ss(roundfac); + __asm__ __volatile__ ( + "shufps $0x00, %0, %0 \n\t" + + "testq $0x3, %5 \n\t" + "jz 7f \n\t" + "cmpq %6, %5 \n\t" + "je 6f \n\t" + "8: \n\t" + "movss (%7,%5,4), %1 \n\t" + "cmpnltss %0, %1 \n\t" + "movss (%8,%5,4), %2 \n\t" + "andps %2, %1 \n\t" + "movss %1, (%8,%5,4) \n\t" + "incq %5 \n\t" + "testq $0x3, %5 \n\t" + "jz 7f \n\t" + "cmpq %6, %5 \n\t" + "jne 8b \n\t" + "7: \n\t" + + "subq $8, %6 \n\t" + "cmpq %6, %5 \n\t" + "jg 2f \n\t" + "1: \n\t" + "movaps (%7,%5,4), %1 \n\t" + "movaps 16(%7,%5,4), %2 \n\t" + "cmpnltps %0, %1 \n\t" + "cmpnltps %0, %2 \n\t" + "movaps (%8,%5,4), %3 \n\t" + "movaps 16(%8,%5,4), %4 \n\t" + "andps %3, %1 \n\t" + "andps %4, %2 \n\t" + "movaps %1, (%8,%5,4) \n\t" + "movaps %2, 16(%8,%5,4) \n\t" + "addq $8, %5 \n\t" + "cmpq %6, %5 \n\t" + "jle 1b \n\t" + "2: \n\t" + "addq $8, %6 \n\t" + "cmpq %6, %5 \n\t" + "je 6f \n\t" + + "subq $4, %6 \n\t" + "cmpq %6, %5 \n\t" + "jg 4f \n\t" + "3: \n\t" + "movaps (%7,%5,4), %1 \n\t" + "cmpnltps %0, %1 \n\t" + "movaps (%8,%5,4), %2 \n\t" + "andps %2, %1 \n\t" + "movaps %1, (%8,%5,4) \n\t" + "addq $4, %5 \n\t" +
"cmpq %6, %5 \n\t" + "jle 3b \n\t" + "4: \n\t" + "addq $4, %6 \n\t" + "cmpq %6, %5 \n\t" + "je 6f \n\t" + + "5: \n\t" + "movss (%7,%5,4), %1 \n\t" + "cmpnltss %0, %1 \n\t" + "movss (%8,%5,4), %2 \n\t" + "andps %2, %1 \n\t" + "movss %1, (%8,%5,4) \n\t" + "incq %5 \n\t" + "cmpq %6, %5 \n\t" + "jne 5b \n\t" + "6: \n\t" + : "+x" (v1), "=x" (v2), "=x" (v3), "=x" (v4), "=x" (v5), + "+r" (k), "+r" (l) + : "r" (xr), "r" (ix) : "memory" /* the asm stores into ix[] and loads xr[] through register pointers */ + ); +#else + int k = j; + j += width; + v1 = _mm_set_ss(roundfac); + __asm__ __volatile__ ( + "shufps $0x00, %0, %0 \n\t" + + "testl $0x3, %5 \n\t" + "jz 7f \n\t" + "cmpl %6, %5 \n\t" + "je 6f \n\t" + "8: \n\t" + "movss (%7,%5,4), %1 \n\t" + "cmpnltss %0, %1 \n\t" + "movss (%8,%5,4), %2 \n\t" + "andps %2, %1 \n\t" + "movss %1, (%8,%5,4) \n\t" + "incl %5 \n\t" + "testl $0x3, %5 \n\t" + "jz 7f \n\t" + "cmpl %6, %5 \n\t" + "jne 8b \n\t" + "7: \n\t" + + "subl $8, %6 \n\t" + "cmpl %6, %5 \n\t" + "jg 2f \n\t" + "1: \n\t" + "movaps (%7,%5,4), %1 \n\t" + "movaps 16(%7,%5,4), %2 \n\t" + "cmpnltps %0, %1 \n\t" + "cmpnltps %0, %2 \n\t" + "movaps (%8,%5,4), %3 \n\t" + "movaps 16(%8,%5,4), %4 \n\t" + "andps %3, %1 \n\t" + "andps %4, %2 \n\t" + "movaps %1, (%8,%5,4) \n\t" + "movaps %2, 16(%8,%5,4) \n\t" + "addl $8, %5 \n\t" + "cmpl %6, %5 \n\t" + "jle 1b \n\t" + "2: \n\t" + "addl $8, %6 \n\t" + "cmpl %6, %5 \n\t" + "je 6f \n\t" + + "subl $4, %6 \n\t" + "cmpl %6, %5 \n\t" + "jg 4f \n\t" + "3: \n\t" + "movaps (%7,%5,4), %1 \n\t" + "cmpnltps %0, %1 \n\t" + "movaps (%8,%5,4), %2 \n\t" + "andps %2, %1 \n\t" + "movaps %1, (%8,%5,4) \n\t" + "addl $4, %5 \n\t" + "cmpl %6, %5 \n\t" + "jle 3b \n\t" + "4: \n\t" + "addl $4, %6 \n\t" + "cmpl %6, %5 \n\t" + "je 6f \n\t" + + "5: \n\t" + "movss (%7,%5,4), %1 \n\t" + "cmpnltss %0, %1 \n\t" + "movss (%8,%5,4), %2 \n\t" + "andps %2, %1 \n\t" + "movss %1, (%8,%5,4) \n\t" + "incl %5 \n\t" + "cmpl %6, %5 \n\t" + "jne 5b \n\t" + "6: \n\t" + : "+x" (v1), "=x" (v2), "=x" (v3), "=x" (v4), "=x" (v5), + "+r" (k), "+r" (j) + : "r" (xr), "r"
(ix) : "memory" /* the asm stores into ix[] and loads xr[] through register pointers */ + ); +#endif +#else int k; for (k = j, j += width; k < j; ++k) { ix[k] = (xr[k] >= roundfac) ? ix[k] : 0; } +#endif } } } --- libmp3lame/util.c.orig 2017-09-07 04:33:36.000000000 +0900 +++ libmp3lame/util.c 2017-10-14 12:03:44.000000000 +0900 @@ -140,7 +140,7 @@ freegfc(lame_internal_flags * const gfc) gfc->VBR_seek_table.size = 0; } if (gfc->ATH) { - free(gfc->ATH); + free_aligned16(gfc->ATH); } if (gfc->sv_rpg.rgdata) { free(gfc->sv_rpg.rgdata); @@ -162,7 +162,7 @@ freegfc(lame_internal_flags * const gfc) free_global_data(gfc); - free(gfc); + free_aligned16(gfc); } void --- libmp3lame/util.h.orig 2017-09-07 04:33:36.000000000 +0900 +++ libmp3lame/util.h 2017-10-14 12:03:44.000000000 +0900 @@ -116,6 +116,36 @@ extern "C" { typedef struct plotting_data plotting_data; #endif +#if defined(__APPLE__) +#define malloc_aligned16(size) malloc(size) +#define calloc_aligned16(n, size) calloc(n, size) +#define free_aligned16(ptr) free(ptr) +#elif defined(__MINGW32__) || defined(__MINGW64__) || defined(_MSC_VER) +#define malloc_aligned16(size) _aligned_malloc(size, 16) +static inline void * calloc_aligned16(size_t n, size_t size) +{ + void *ptr = _aligned_malloc(n * size, 16); + if(ptr) memset(ptr, 0, n * size); + return ptr; +} +#define free_aligned16(ptr) _aligned_free(ptr) +#else +static inline void * malloc_aligned16(size_t size) +{ + void *ptr; + int ret = posix_memalign(&ptr, 16, size); + return ret == 0 ? ptr : NULL; +} +static inline void * calloc_aligned16(size_t n, size_t size) +{ + void *ptr; + int ret = posix_memalign(&ptr, 16, n * size); + if(!ret) memset(ptr, 0, n * size); + return ret == 0 ?
ptr : NULL; +} +#define free_aligned16(ptr) free(ptr) +#endif + /*********************************************************************** * * Global Type Definitions @@ -178,7 +208,7 @@ extern "C" { FLOAT psfb12[PSFB12]; /* ATH for partitionned sfb12 in short blocks */ FLOAT cb_l[CBANDS]; /* ATH for long block convolution bands */ FLOAT cb_s[CBANDS]; /* ATH for short block convolution bands */ - FLOAT eql_w[BLKSIZE / 2]; /* equal loudness weights (based on ATH) */ + FLOAT eql_w[BLKSIZE / 2] __attribute__ ((aligned (16))); /* equal loudness weights (based on ATH) */ } ATH_t; /** @@ -492,7 +522,7 @@ extern "C" { /* variables used by lame.c */ Bit_stream_struc bs; - III_side_info_t l3_side; + III_side_info_t l3_side __attribute__ ((aligned (16))); scalefac_struct scalefac_band; --- libmp3lame/vbrquantize.c.orig 2012-02-07 22:36:35.000000000 +0900 +++ libmp3lame/vbrquantize.c 2017-10-14 12:03:44.000000000 +0900 @@ -26,6 +26,7 @@ # include #endif +#undef TAKEHIRO_IEEE754_HACK #include "lame.h" #include "machine.h" @@ -34,7 +35,9 @@ #include "vbrquantize.h" #include "quantize_pvt.h" - +#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) +#include +#endif struct algo_s; @@ -226,6 +229,81 @@ calc_sfb_noise_x34(const FLOAT * xr, con unsigned int i = bw >> 2u; unsigned int const remaining = (bw & 0x03u); +#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) + __m128 v1, v2, v3, v4, v5, v6, v7, v8; +#if defined(_WIN64) + long long tmp; +#else + long tmp; +#endif + v6 = _mm_set_ss(sfpow34); + v7 = _mm_set_ss(sfpow); + __asm__ __volatile__ ( + "xorps %7, %7 \n\t" + "testl %8, %8 \n\t" + "jz 2f \n\t" + "shufps $0x00, %5, %5 \n\t" + "shufps $0x00, %6, %6 \n\t" + "pcmpeqd %4, %4 \n\t" + "psrld $1, %4 \n\t" + + "1: \n\t" + "movups (%10), %1 \n\t" + "mulps %5, %1 \n\t" + "cvttss2si %1, %11 \n\t" + "shufps $0xe5, %1, %1 \n\t" + "movlps (%12,%11,4), %2 \n\t" + "cvttss2si %1, %11 \n\t" + "movhlps %1, %1 \n\t" + "movhps (%12,%11,4), %2 \n\t" + "cvttss2si
%1, %11 \n\t" + "shufps $0x55, %1, %1 \n\t" + "movlps (%12,%11,4), %3 \n\t" + "cvttss2si %1, %11 \n\t" + "movups (%9), %1 \n\t" + "movhps (%12,%11,4), %3 \n\t" + "mulps %6, %2 \n\t" + "mulps %6, %3 \n\t" + "movaps %2, %0 \n\t" + "shufps $0x88, %3, %0 \n\t" + "shufps $0xdd, %3, %2 \n\t" + + "andps %4, %1 \n\t" + "subps %1, %2 \n\t" + "subps %0, %1 \n\t" + "movaps %1, %0 \n\t" + "cmpltps %2, %1 \n\t" + "andps %1, %0 \n\t" + "andnps %2, %1 \n\t" + "orps %1, %0 \n\t" + "mulps %0, %0 \n\t" + "addps %0, %7 \n\t" + +#if defined(__x86_64__) + "addq $16, %9 \n\t" + "addq $16, %10 \n\t" +#else + "addl $16, %9 \n\t" + "addl $16, %10 \n\t" +#endif + "decl %8 \n\t" + "jnz 1b \n\t" + "movhlps %7, %0 \n\t" + "addps %0, %7 \n\t" +#if defined(__SSE3__) + "haddps %7, %7 \n\t" +#else + "movaps %7, %0 \n\t" + "shufps $0x01, %0, %0 \n\t" + "addps %0, %7 \n\t" +#endif + "2: \n\t" + : "=x" (v1), "=x" (v2), "=x" (v3), "=x" (v4), "=x" (v5), "+x" (v6), "+x" (v7), "=x" (v8), + "+r" (i), "+r" (xr), "+r" (xr34), "=&r" (tmp) + : "r" (pow43) : "memory" /* the asm loads xr[], xr34[] and pow43[] through register pointers */ + ); + _mm_store_ss(&xfsf, v8); +#else while (i-- > 0) { x[0] = sfpow34 * xr34[0]; x[1] = sfpow34 * xr34[1]; @@ -243,6 +321,7 @@ calc_sfb_noise_x34(const FLOAT * xr, con xr += 4; xr34 += 4; } +#endif if (remaining) { x[0] = x[1] = x[2] = x[3] = 0; switch( remaining ) {