# https://github.com/classilla/lamevmx # See also: https://tmkk.undo.jp/lame/lame-3.100-altivec-20171014.diff diff --git README.md README.md new file mode 100644 index 0000000..a82835a --- /dev/null +++ README.md @@ -0,0 +1,14 @@ +# [LAMEVMX: LAME Ain't an MP3 Encoder with VMX](http://www.floodgap.com/software/lamevmx/) + +A PowerPC-optimized build of LAME 3.100 with [tmkk's patches for AltiVec](http://tmkk.undo.jp/lame/index_e.html), enhanced with additional G5 optimizations and build-system fixes. Intended for lovely Power Macs and not icky Intel Macs, which are better served by the mainline build. Maintained by Cameron Kaiser (classilla@floodgap.com). + +How to build (GNU `make` from MacPorts strongly recommended): + +* Have a 10.4 system with Xcode 2.5. (It may or may not work on 10.5 with Xcode 3. It probably doesn't work on 10.6. It will *not* work on 10.7+.) +* Clone it. +* `./configure` +* `make` or `gmake` + +You will have a three-headed multi-architecture binary in `frontend/lame` with versions for G3, G4 and G5 processors. The same binary runs on all systems. Do `gmake test` for a quick test of functionality. + +On my Quad G5 (2.5GHz), LAMEVMX achieves approximately 25x playback speed at peak. diff --git configure configure index 52dbf02..1e34a9b 100755 --- configure +++ configure @@ -7616,6 +7616,7 @@ IFS=$as_save_IFS fi fi +ac_cv_prog_ac_ct_NMEDIT="true" # doesn't work right on 10.4 ac_ct_NMEDIT=$ac_cv_prog_ac_ct_NMEDIT if test -n "$ac_ct_NMEDIT"; then { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_NMEDIT" >&5 @@ -8003,6 +8004,7 @@ if ac_fn_c_try_link "$LINENO"; then : else lt_cv_ld_exported_symbols_list=no fi +lt_cv_ld_exported_symbols_list=no # doesn't work right on 10.4 ld rm -f core conftest.err conftest.$ac_objext \ conftest$ac_exeext conftest.$ac_ext LDFLAGS=$save_LDFLAGS @@ -17438,7 +17440,6 @@ if test "x$HAVE_GCC" = "xyes" -o "x$HAVE_CLANG" = "xyes"; then OPTIMIZATION_NORM="-fschedule-insns2" fi - # generic CPU specific options case ${host_cpu} in sparc) @@ -17604,9 +17605,7 @@ else $as_echo "no" >&6; } fi - - - +OPTIMIZATION="-arch ppc750 -arch ppc7400 -arch ppc970 -O3 -fomit-frame-pointer -ffast-math -funroll-loops -isysroot @SYSROOT@" { $as_echo "$as_me:${as_lineno-$LINENO}: checking for debug options" >&5 $as_echo_n "checking for debug options... " >&6; } diff --git frontend/Makefile.in frontend/Makefile.in index 4f15e55..261d7dd 100644 --- frontend/Makefile.in +++ frontend/Makefile.in @@ -464,17 +464,18 @@ clean-binPROGRAMS: echo " rm -f" $$list; \ rm -f $$list +# The Universal build does not work against the ar-static libs. 
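# Why the hand-written link lines below: $(lame_LDADD)/$(LIBS) resolve to the
# libtool/ar archives that the Universal (three -arch) link does not work
# against, so the per-arch objects from libmp3lame and mpglib are passed to
# the linker directly and the needed libraries (ncurses, iconv, libm) are
# named explicitly.  Illustrative post-build check, not part of the patch:
#   lipo -detailed_info frontend/lame   # should show ppc750/ppc7400/ppc970 slices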
lame$(EXEEXT): $(lame_OBJECTS) $(lame_DEPENDENCIES) $(EXTRA_lame_DEPENDENCIES) @rm -f lame$(EXEEXT) - $(AM_V_CCLD)$(LINK) $(lame_OBJECTS) $(lame_LDADD) $(LIBS) + $(AM_V_CCLD)$(LINK) $(lame_OBJECTS) ../libmp3lame/.libs/*.o ../mpglib/*.o -lncurses -liconv -lm mp3rtp$(EXEEXT): $(mp3rtp_OBJECTS) $(mp3rtp_DEPENDENCIES) $(EXTRA_mp3rtp_DEPENDENCIES) @rm -f mp3rtp$(EXEEXT) - $(AM_V_CCLD)$(LINK) $(mp3rtp_OBJECTS) $(mp3rtp_LDADD) $(LIBS) + $(AM_V_CCLD)$(LINK) $(mp3rtp_OBJECTS) ../libmp3lame/.libs/*.o ../mpglib/*.o -lncurses -liconv -lm mp3x$(EXEEXT): $(mp3x_OBJECTS) $(mp3x_DEPENDENCIES) $(EXTRA_mp3x_DEPENDENCIES) @rm -f mp3x$(EXEEXT) - $(AM_V_CCLD)$(LINK) $(mp3x_OBJECTS) $(mp3x_LDADD) $(LIBS) + $(AM_V_CCLD)$(LINK) $(mp3x_OBJECTS) ../libmp3lame/.libs/*.o ../mpglib/*.o -lncurses -liconv -lm mostlyclean-compile: -rm -f *.$(OBJEXT) @@ -497,22 +498,22 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/timestatus.Po@am__quote@ .c.o: -@am__fastdepCC_TRUE@ $(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< -@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po +@am__fastdepCC_TRUE@ $(AM_V_CC)$(COMPILE) -c -o $@ $< +#@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po @AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ $< .c.obj: -@am__fastdepCC_TRUE@ $(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'` -@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po +@am__fastdepCC_TRUE@ $(AM_V_CC)$(COMPILE) -c -o $@ `$(CYGPATH_W) '$<'` +#@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po @AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ `$(CYGPATH_W) '$<'` .c.lo: -@am__fastdepCC_TRUE@ $(AM_V_CC)$(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< -@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo +@am__fastdepCC_TRUE@ $(AM_V_CC)$(LTCOMPILE) -c -o $@ $< +#@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo @AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@ @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LTCOMPILE) -c -o $@ $< diff --git frontend/parse.c frontend/parse.c index 752613f..6e1db2f 100644 --- frontend/parse.c +++ frontend/parse.c @@ -99,6 +99,22 @@ char *strchr(), *strrchr(); static int const lame_alpha_version_enabled = LAME_ALPHA_VERSION; static int const internal_opts_enabled = INTERNAL_OPTS; +/* 10.4 does not implement strnlen(), so ... */ +/* Find the length of S, but scan at most MAXLEN characters. If no '\0' + terminator is found within the first MAXLEN characters, return MAXLEN. */ +size_t +strnlen (s, maxlen) + register const char *s; + size_t maxlen; +{ + register const char *e; + size_t n; + + for (e = s, n = 0; *e && n < maxlen; e++, n++) + ; + return n; +} + /* GLOBAL VARIABLES. 
set by parse_args() */ /* we need to clean this up */ diff --git libmp3lame/Makefile.in libmp3lame/Makefile.in index 5437b38..27acde2 100644 --- libmp3lame/Makefile.in +++ libmp3lame/Makefile.in @@ -577,22 +577,22 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/version.Plo@am__quote@ .c.o: -@am__fastdepCC_TRUE@ $(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< -@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po +@am__fastdepCC_TRUE@ $(AM_V_CC)$(COMPILE) -c -o $@ $< +#@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po @AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ $< .c.obj: -@am__fastdepCC_TRUE@ $(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'` -@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po +@am__fastdepCC_TRUE@ $(AM_V_CC)$(COMPILE) -c -o $@ `$(CYGPATH_W) '$<'` +#@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po @AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ `$(CYGPATH_W) '$<'` .c.lo: -@am__fastdepCC_TRUE@ $(AM_V_CC)$(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< -@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo +@am__fastdepCC_TRUE@ $(AM_V_CC)$(LTCOMPILE) -c -o $@ $< +#@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo @AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@ @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LTCOMPILE) -c -o $@ $< diff --git libmp3lame/fft.c libmp3lame/fft.c index 4eea1ad..19863d1 100644 --- libmp3lame/fft.c +++ libmp3lame/fft.c @@ -38,6 +38,10 @@ # include #endif +#if __ALTIVEC__ +#include +#endif + #include "lame.h" #include "machine.h" #include "encoder.h" @@ -66,6 +70,17 @@ fht(FLOAT * fz, int n) int k4; FLOAT *fi, *gi; FLOAT const *fn; +#if __ALTIVEC__ + float csvec[16] __attribute__ ((aligned (16))); + vector float v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15,v16; + vector float vfi0,vfi1,vfi2,vfi3,vgi0,vgi1,vgi2,vgi3,vf0,vf1,vf2,vf3,vg0,vg1,vg2,vg3; + vector float vprev1,vprev2,vprev3,vprev4,vc1,vc2,vs1,vs2,vzero; + vector unsigned char vperm1,vperm2; + + vperm1 = (vector unsigned char)VINIT16(16,17,18,19,12,13,14,15,8,9,10,11,4,5,6,7); + vperm2 = (vector unsigned char)VINIT16(16,17,18,19,4,5,6,7,8,9,10,11,12,13,14,15); + vzero = vec_xor(vzero,vzero); +#endif n <<= 1; /* to get BLKSIZE, because of 3DNow! 
ASM routine */ fn = fz + n; @@ -103,6 +118,238 @@ fht(FLOAT * fz, int n) } while (fi < fn); c1 = tri[0]; s1 = tri[1]; +#if __ALTIVEC__ + if(kx < 4) { + for (i = 1; i < kx; i++) { + FLOAT c2, s2; + c2 = 1 - (2 * s1) * s1; + s2 = (2 * s1) * c1; + fi = fz + i; + gi = fz + k1 - i; + do { + FLOAT a, b, g0, f0, f1, g1, f2, g2, f3, g3; + b = s2 * fi[k1] - c2 * gi[k1]; + a = c2 * fi[k1] + s2 * gi[k1]; + f1 = fi[0] - a; + f0 = fi[0] + a; + g1 = gi[0] - b; + g0 = gi[0] + b; + b = s2 * fi[k3] - c2 * gi[k3]; + a = c2 * fi[k3] + s2 * gi[k3]; + f3 = fi[k2] - a; + f2 = fi[k2] + a; + g3 = gi[k2] - b; + g2 = gi[k2] + b; + b = s1 * f2 - c1 * g3; + a = c1 * f2 + s1 * g3; + fi[k2] = f0 - a; + fi[0] = f0 + a; + gi[k3] = g1 - b; + gi[k1] = g1 + b; + b = c1 * g2 - s1 * f3; + a = s1 * g2 + c1 * f3; + gi[k2] = g0 - a; + gi[0] = g0 + a; + fi[k3] = f1 - b; + fi[k1] = f1 + b; + gi += k4; + fi += k4; + } while (fi < fn); + c2 = c1; + c1 = c2 * tri[0] - s1 * tri[1]; + s1 = c2 * tri[1] + s1 * tri[0]; + } + } + else { + FLOAT c2, s2; + for(i = 1; i < 4; i++) { + c2 = 1 - (2*s1)*s1; + s2 = (2*s1)*c1; + csvec[i] = c1; + csvec[i+4] = c2; + csvec[i+8] = s1; + csvec[i+12] = s2; + c2 = c1; + c1 = c2 * tri[0] - s1 * tri[1]; + s1 = c2 * tri[1] + s1 * tri[0]; + } + vc1 = vec_ld(0,csvec); + vc2 = vec_ld(16,csvec); + vs1 = vec_ld(32,csvec); + vs2 = vec_ld(48,csvec); + fi = fz; + gi = fz + k1; + do { + vfi0 = vec_ld(0,fi); + vfi1 = vec_ld(0,fi+k1); + vfi2 = vec_ld(0,fi+k2); + vfi3 = vec_ld(0,fi+k3); + vprev1 = vec_ld(0,gi-4); + vprev2 = vec_ld(0,gi+k1-4); + vprev3 = vec_ld(0,gi+k2-4); + vprev4 = vec_ld(0,gi+k3-4); + vgi0 = vec_perm(vprev1,vprev1,vperm1); + vgi1 = vec_perm(vprev2,vprev2,vperm1); + vgi2 = vec_perm(vprev3,vprev3,vperm1); + vgi3 = vec_perm(vprev4,vprev4,vperm1); + + v1 = vec_madd(vfi1,vc2,vzero); + v2 = vec_madd(vfi1,vs2,vzero); + v3 = vec_madd(vfi3,vc2,vzero); + v4 = vec_madd(vfi3,vs2,vzero); + v5 = vec_madd(vgi1,vs2,v1); + v6 = vec_nmsub(vgi1,vc2,v2); + v7 = vec_madd(vgi3,vs2,v3); + v8 = vec_nmsub(vgi3,vc2,v4); + + vf0 = vec_add(vfi0,v5); + vf1 = vec_sub(vfi0,v5); + vg0 = vec_add(vgi0,v6); + vg1 = vec_sub(vgi0,v6); + vf2 = vec_add(vfi2,v7); + vf3 = vec_sub(vfi2,v7); + vg2 = vec_add(vgi2,v8); + vg3 = vec_sub(vgi2,v8); + + v1 = vec_madd(vf2,vc1,vzero); + v2 = vec_madd(vf2,vs1,vzero); + v3 = vec_madd(vg2,vs1,vzero); + v4 = vec_madd(vg2,vc1,vzero); + v5 = vec_madd(vg3,vs1,v1); + v6 = vec_nmsub(vg3,vc1,v2); + v7 = vec_madd(vf3,vc1,v3); + v8 = vec_nmsub(vf3,vs1,v4); + + v9 = vec_add(vf0,v5); + v10 = vec_sub(vf0,v5); + v11 = vec_add(vg1,v6); + v12 = vec_sub(vg1,v6); + v13 = vec_add(vg0,v7); + v14 = vec_sub(vg0,v7); + v15 = vec_add(vf1,v8); + v16 = vec_sub(vf1,v8); + + v1 = vec_perm(v9,vfi0,vperm2); + v2 = vec_perm(v10,vfi2,vperm2); + v3 = vec_perm(v15,vfi1,vperm2); + v4 = vec_perm(v16,vfi3,vperm2); + vec_st(v1,0,fi); + vec_st(v2,0,fi+k2); + vec_st(v3,0,fi+k1); + vec_st(v4,0,fi+k3); + + v1 = vec_perm(v11,vprev2,vperm1); + v2 = vec_perm(v12,vprev4,vperm1); + v3 = vec_perm(v13,vprev1,vperm1); + v4 = vec_perm(v14,vprev3,vperm1); + vec_st(v1,0,gi+k1-4); + vec_st(v2,0,gi+k3-4); + vec_st(v3,0,gi-4); + vec_st(v4,0,gi+k2-4); + + gi += k4; + fi += k4; + } while (fi #endif +#if __ALTIVEC__ +#include +#endif + #include #include #include @@ -109,6 +113,67 @@ /*lint -save -e736 loss of precision */ +#if __ALTIVEC__ +static const Float_t ABYule[9][2 * YULE_ORDER + 1 + 3] __attribute__ ((aligned (16))) = { + {0.03857599435200, -3.84664617118067, -0.02160367184185, 7.81501653005538, -0.00123395316851, + -11.34170355132042, -0.00009291677959, 
13.05504219327545, -0.01655260341619, + -12.28759895145294, 0.02161526843274, 9.48293806319790, -0.02074045215285, -5.87257861775999, + 0.00594298065125, 2.75465861874613, 0.00306428023191, -0.86984376593551, 0.00012025322027, + 0.13919314567432, 0.00288463683916, 0.0, 0.0, 0.0}, + {0.05418656406430, -3.47845948550071, -0.02911007808948, 6.36317777566148, -0.00848709379851, + -8.54751527471874, -0.00851165645469, 9.47693607801280, -0.00834990904936, -8.81498681370155, + 0.02245293253339, 6.85401540936998, -0.02596338512915, -4.39470996079559, 0.01624864962975, + 2.19611684890774, -0.00240879051584, -0.75104302451432, 0.00674613682247, 0.13149317958808, + -0.00187763777362, 0.0, 0.0, 0.0}, + {0.15457299681924, -2.37898834973084, -0.09331049056315, 2.84868151156327, -0.06247880153653, + -2.64577170229825, 0.02163541888798, 2.23697657451713, -0.05588393329856, -1.67148153367602, + 0.04781476674921, 1.00595954808547, 0.00222312597743, -0.45953458054983, 0.03174092540049, + 0.16378164858596, -0.01390589421898, -0.05032077717131, 0.00651420667831, 0.02347897407020, + -0.00881362733839, 0.0, 0.0, 0.0}, + {0.30296907319327, -1.61273165137247, -0.22613988682123, 1.07977492259970, -0.08587323730772, + -0.25656257754070, 0.03282930172664, -0.16276719120440, -0.00915702933434, -0.22638893773906, + -0.02364141202522, 0.39120800788284, -0.00584456039913, -0.22138138954925, 0.06276101321749, + 0.04500235387352, -0.00000828086748, 0.02005851806501, 0.00205861885564, 0.00302439095741, + -0.02950134983287, 0.0, 0.0, 0.0}, + {0.33642304856132, -1.49858979367799, -0.25572241425570, 0.87350271418188, -0.11828570177555, + 0.12205022308084, 0.11921148675203, -0.80774944671438, -0.07834489609479, 0.47854794562326, + -0.00469977914380, -0.12453458140019, -0.00589500224440, -0.04067510197014, 0.05724228140351, + 0.08333755284107, 0.00832043980773, -0.04237348025746, -0.01635381384540, 0.02977207319925, + -0.01760176568150, 0.0, 0.0, 0.0}, + {0.44915256608450, -0.62820619233671, -0.14351757464547, 0.29661783706366, -0.22784394429749, + -0.37256372942400, -0.01419140100551, 0.00213767857124, 0.04078262797139, -0.42029820170918, + -0.12398163381748, 0.22199650564824, 0.04097565135648, 0.00613424350682, 0.10478503600251, + 0.06747620744683, -0.01863887810927, 0.05784820375801, -0.03193428438915, 0.03222754072173, + 0.00541907748707, 0.0, 0.0, 0.0}, + {0.56619470757641, -1.04800335126349, -0.75464456939302, 0.29156311971249, 0.16242137742230, + -0.26806001042947, 0.16744243493672, 0.00819999645858, -0.18901604199609, 0.45054734505008, + 0.30931782841830, -0.33032403314006, -0.27562961986224, 0.06739368333110, 0.00647310677246, + -0.04784254229033, 0.08647503780351, 0.01639907836189, -0.03788984554840, 0.01807364323573, + -0.00588215443421, 0.0, 0.0, 0.0}, + {0.58100494960553, -0.51035327095184, -0.53174909058578, -0.31863563325245, -0.14289799034253, + -0.20256413484477, 0.17520704835522, 0.14728154134330, 0.02377945217615, 0.38952639978999, + 0.15558449135573, -0.23313271880868, -0.25344790059353, -0.05246019024463, 0.01628462406333, + -0.02505961724053, 0.06920467763959, 0.02442357316099, -0.03721611395801, 0.01818801111503, + -0.00749618797172, 0.0, 0.0, 0.0}, + {0.53648789255105, -0.25049871956020, -0.42163034350696, -0.43193942311114, -0.00275953611929, + -0.03424681017675, 0.04267842219415, -0.04678328784242, -0.10214864179676, 0.26408300200955, + 0.14590772289388, 0.15113130533216, -0.02459864859345, -0.17556493366449, -0.11202315195388, + -0.18823009262115, -0.04060034127000, 0.05477720428674, 0.04788665548180, 
0.04704409688120, + -0.02217936801134, 0.0, 0.0, 0.0} +}; + +static const Float_t ABButter[9][2 * BUTTER_ORDER + 1 + 3] __attribute__ ((aligned (16))) = { + {0.98621192462708, -1.97223372919527, -1.97242384925416, 0.97261396931306, 0.98621192462708, 0.0, 0.0, 0.0}, + {0.98500175787242, -1.96977855582618, -1.97000351574484, 0.97022847566350, 0.98500175787242, 0.0, 0.0, 0.0}, + {0.97938932735214, -1.95835380975398, -1.95877865470428, 0.95920349965459, 0.97938932735214, 0.0, 0.0, 0.0}, + {0.97531843204928, -1.95002759149878, -1.95063686409857, 0.95124613669835, 0.97531843204928, 0.0, 0.0, 0.0}, + {0.97316523498161, -1.94561023566527, -1.94633046996323, 0.94705070426118, 0.97316523498161, 0.0, 0.0, 0.0}, + {0.96454515552826, -1.92783286977036, -1.92909031105652, 0.93034775234268, 0.96454515552826, 0.0, 0.0, 0.0}, + {0.96009142950541, -1.91858953033784, -1.92018285901082, 0.92177618768381, 0.96009142950541, 0.0, 0.0, 0.0}, + {0.95856916599601, -1.91542108074780, -1.91713833199203, 0.91885558323625, 0.95856916599601, 0.0, 0.0, 0.0}, + {0.94597685600279, -1.88903307939452, -1.89195371200558, 0.89487434461664, 0.94597685600279, 0.0, 0.0, 0.0} +}; +#else static const Float_t ABYule[9][multiple_of(4, 2 * YULE_ORDER + 1)] = { /* 20 18 16 14 12 10 8 6 4 2 0 19 17 15 13 11 9 7 5 3 1 */ { 0.00288463683916, 0.00012025322027, 0.00306428023191, 0.00594298065125, -0.02074045215285, 0.02161526843274, -0.01655260341619, -0.00009291677959, -0.00123395316851, -0.02160367184185, 0.03857599435200, 0.13919314567432, -0.86984376593551, 2.75465861874613, -5.87257861775999, 9.48293806319790,-12.28759895145294, 13.05504219327545,-11.34170355132042, 7.81501653005538, -3.84664617118067}, @@ -134,6 +199,7 @@ static const Float_t ABButter[9][multiple_of(4, 2 * BUTTER_ORDER + 1)] = { {0.95856916599601, 0.91885558323625, -1.91713833199203, -1.91542108074780, 0.95856916599601}, {0.94597685600279, 0.89487434461664, -1.89195371200558, -1.88903307939452, 0.94597685600279} }; +#endif /*lint -restore */ @@ -143,6 +209,191 @@ static const Float_t ABButter[9][multiple_of(4, 2 * BUTTER_ORDER + 1)] = { /* When calling this procedure, make sure that ip[-order] and op[-order] point to real data! 
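   The filterIntegrated() routine added below fuses the two scalar ReplayGain
   stages (filterYule followed by filterButter) into a single AltiVec pass,
   keeping the most recent inputs and outputs in vector registers instead of
   reloading them every sample.  A rough scalar sketch of what one iteration
   computes (coefficient naming is illustrative; the real kernels are the
   padded, interleaved ABYule/ABButter rows above):

       step[i] = 1e-10
               + sum(k=0..10) b_yule[k]   * input[i-k]
               - sum(k=1..10) a_yule[k]   * step[i-k]      (Yule stage)
       out[i]  = sum(k=0..2)  b_butter[k] * step[i-k]
               - sum(k=1..2)  a_butter[k] * out[i-k]       (Butterworth stage)

   The 1e-10 bias matches the original filterYule() and keeps denormals out of
   the feedback path; step and out correspond to the lstep/rstep and lout/rout
   buffers passed in from AnalyzeSamples().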
*/ +#if __ALTIVEC__ + +static void +filterIntegrated (const Float_t* input, Float_t* output, Float_t* output2, size_t nSamples, const Float_t* kernel, const Float_t* kernel2) +{ + vector float v1,v2,v3,v4,v5,v6,vbase; + vector float vmask1,vmask2,vout1,vout2,vout3,vout4,vzero,vkernel1,vkernel2,vkernel3,vkernel4,vkernel5,vkernel6,vkernel7,vkernel8; + vector float vo1, vo2, vo3, vo4, vi2, vi3; + vector unsigned char vc1,vc2,vc3,vc4,vc5,vperm1,vperm2,vperm4,vperm5,vperm6; + + vbase = (vector float)VINIT4ALL(1e-10f); + vperm1 = (vector unsigned char)VINIT16(24,25,26,27,16,17,18,19,8,9,10,11,0,1,2,3); + vperm2 = (vector unsigned char)VINIT16(28,29,30,31,20,21,22,23,12,13,14,15,4,5,6,7); + vc1 = vec_splat_u8(1); + vc2 = vec_splat_u8(5); + vc3 = vec_sl(vc1,vc2); + vc4 = vec_sl(vc3,vc1); + vc5 = vec_or(vc3,vc4); + v1 = (vector float)vec_splat_s32(-1); + vmask1 = vec_sro(v1,vc3); + vmask2 = vec_sro(v1,vc4); + vzero = vec_xor(vzero,vzero); + + v1 = vec_ld(0,kernel); + v2 = vec_ld(16,kernel); + v3 = vec_ld(32,kernel); + v4 = vec_ld(48,kernel); + v5 = vec_ld(64,kernel); + v6 = vec_ld(80,kernel); + vkernel1 = vec_perm(v1,v2,vperm1); + vkernel2 = vec_perm(v1,v2,vperm2); + vkernel3 = vec_perm(v3,v4,vperm1); + vkernel4 = vec_perm(v3,v4,vperm2); + vkernel5 = vec_perm(v5,v6,vperm1); + vkernel6 = vec_perm(v5,v6,vperm2); + vkernel5 = vec_and(vkernel5,vmask1); + vkernel6 = vec_and(vkernel6,vmask2); + + v1 = vec_ld(0,kernel2); + v2 = vec_ld(16,kernel2); + vkernel7 = vec_perm(v1,v2,vperm1); + vkernel8 = vec_perm(v1,v2,vperm2); + vkernel7 = vec_and(vkernel7,vmask1); + vkernel8 = vec_and(vkernel8,vmask2); + + vperm4 = vec_lvsl(0,input-7); + vperm5 = vec_lvsl(0,output-4); + + v1 = vec_ld(15,input-7); + v2 = vec_ld(0,input-7); + v3 = vec_ld(0,input-10); + v4 = vec_ld(15,input-11); + vi2 = vec_perm(v2,v1,vperm4); + vi3 = vec_perm(v3,v4,vec_lvsl(0,input-10)); + vi3 = vec_sro(vi3,vc3); + + v1 = vec_ld(15,output-4); + v2 = vec_ld(0,output-4); + v3 = vec_ld(0,output-8); + v4 = vec_ld(0,output-10); + v5 = vec_ld(15,output-10); + vo1 = vec_perm(v2,v1,vperm5); + vo2 = vec_perm(v3,v2,vperm5); + vo3 = vec_perm(v4,v5,vec_lvsl(0,output-10)); + vo3 = vec_sro(vo3,vc4); + + v1 = vec_ld(15,output2-2); + v2 = vec_ld(0,output2-2); + vo4 = vec_perm(v2,v1,vec_lvsl(0,output2-2)); + vo4 = vec_sro(vo4,vc4); + + vperm4 = vec_lvsl(0,input-3); + vperm5 = vec_lvsr(0,output); + + /* 1st loop */ + v1 = vec_ld(15,input-3); + v3 = vec_ld(0,input-3); + v5 = vec_perm(v3,v1,vperm4); + + vout1 = vec_madd(v5,vkernel1,vbase); + vout2 = vec_madd(vo1,vkernel2,vbase); + + vout1 = vec_madd(vi2,vkernel3,vout1); + vout2 = vec_madd(vo2,vkernel4,vout2); + + vout1 = vec_madd(vi3,vkernel5,vout1); + vout2 = vec_madd(vo3,vkernel6,vout2); + + vi3 = vec_sld(vi3,vi2,4); + vi2 = vec_sld(vi2,v5,4); + + vout1 = vec_sub(vout1,vout2); + + v1 = vec_slo(vout1,vc3); + v2 = vec_slo(vout1,vc4); + v3 = vec_slo(vout1,vc5); + vout1 = vec_add(vout1,v1); + vout2 = vec_add(v2,v3); + vout1 = vec_add(vout1,vout2); + + vo3 = vec_sld(vo3,vo2,4); + vo2 = vec_sld(vo2,vo1,4); + vo1 = vec_sld(vo1,vout1,4); + + vout2 = vec_perm(vout1,vout1,vperm5); + vec_ste(vout2,0,output); + + ++output; + ++input; + --nSamples; + + while(nSamples--) { + vperm4 = vec_lvsl(0,input-3); + vperm5 = vec_lvsr(0,output); + vperm6 = vec_lvsr(0,output2); + + v1 = vec_ld(15,input-3); + v3 = vec_ld(0,input-3); + v5 = vec_perm(v3,v1,vperm4); + + vout1 = vec_madd(v5,vkernel1,vbase); + vout2 = vec_madd(vo1,vkernel2,vbase); + + vout1 = vec_madd(vi2,vkernel3,vout1); + vout2 = vec_madd(vo2,vkernel4,vout2); + + vout1 = 
vec_madd(vi3,vkernel5,vout1); + vout2 = vec_madd(vo3,vkernel6,vout2); + + vout3 = vec_nmsub(vo4,vkernel8,vzero); + vout4 = vec_madd(vo1,vkernel7,vout3); + + vi3 = vec_sld(vi3,vi2,4); + vi2 = vec_sld(vi2,v5,4); + + vout1 = vec_sub(vout1,vout2); + + v1 = vec_slo(vout1,vc3); + v2 = vec_slo(vout1,vc4); + v3 = vec_slo(vout1,vc5); + vout1 = vec_add(vout1,v1); + vout2 = vec_add(v2,v3); + vout1 = vec_add(vout1,vout2); + + vo3 = vec_sld(vo3,vo2,4); + vo2 = vec_sld(vo2,vo1,4); + vo1 = vec_sld(vo1,vout1,4); + + v4 = vec_slo(vout4,vc3); + v5 = vec_slo(vout4,vc4); + v6 = vec_slo(vout4,vc5); + vout4 = vec_add(vout4,v4); + vout3 = vec_add(v5,v6); + vout3 = vec_add(vout3,vout4); + + vo4 = vec_sld(vo4,vout3,4); + + vout2 = vec_perm(vout1,vout1,vperm5); + vout4 = vec_perm(vout3,vout3,vperm6); + vec_ste(vout2,0,output); + vec_ste(vout4,0,output2); + + ++output; + ++output2; + ++input; + } + + vperm6 = vec_lvsr(0,output2); + + vout3 = vec_nmsub(vo4,vkernel8,vzero); + vout4 = vec_madd(vo1,vkernel7,vout3); + + v1 = vec_slo(vout4,vc3); + v2 = vec_slo(vout4,vc4); + v3 = vec_slo(vout4,vc5); + vout4 = vec_add(vout4,v1); + vout3 = vec_add(v2,v3); + vout3 = vec_add(vout3,vout4); + + vout4 = vec_perm(vout3,vout3,vperm6); + vec_ste(vout4,0,output2); +} + +#else + static void filterYule(const Float_t * input, Float_t * output, size_t nSamples, const Float_t * const kernel) { @@ -189,7 +440,7 @@ filterButter(const Float_t * input, Float_t * output, size_t nSamples, const Flo } } - +#endif static int ResetSampleFrequency(replaygain_t * rgData, long samplefreq); @@ -323,6 +574,10 @@ AnalyzeSamples(replaygain_t * rgData, const Float_t * left_samples, const Float_ curright = right_samples + cursamplepos; } +#if __ALTIVEC__ + filterIntegrated(curleft, rgData->lstep + rgData->totsamp, rgData->lout + rgData->totsamp, cursamples, ABYule[rgData->freqindex], ABButter[rgData->freqindex]); + filterIntegrated(curright, rgData->rstep + rgData->totsamp, rgData->rout + rgData->totsamp, cursamples, ABYule[rgData->freqindex], ABButter[rgData->freqindex]); +#else YULE_FILTER(curleft, rgData->lstep + rgData->totsamp, cursamples, ABYule[rgData->freqindex]); YULE_FILTER(curright, rgData->rstep + rgData->totsamp, cursamples, @@ -332,6 +587,7 @@ AnalyzeSamples(replaygain_t * rgData, const Float_t * left_samples, const Float_ ABButter[rgData->freqindex]); BUTTER_FILTER(rgData->rstep + rgData->totsamp, rgData->rout + rgData->totsamp, cursamples, ABButter[rgData->freqindex]); +#endif curleft = rgData->lout + rgData->totsamp; /* Get the squared values */ curright = rgData->rout + rgData->totsamp; diff --git libmp3lame/lame.c libmp3lame/lame.c index cb82225..4b3290d 100644 --- libmp3lame/lame.c +++ libmp3lame/lame.c @@ -30,6 +30,9 @@ # include #endif +#if __ALTIVEC__ +#include +#endif #include "lame.h" #include "machine.h" @@ -603,7 +606,12 @@ lame_init_params(lame_global_flags * gfp) gfc->CPU_features.SSE = 0; gfc->CPU_features.SSE2 = 0; } - +#if __ALTIVEC__ + /* turn off JAVA mode explicitly */ + vector unsigned short vscr = vec_mfvscr(); + vscr = vec_or(vscr,(vector unsigned short)VINIT8(0,0,0,0,0,0,1,0)); + vec_mtvscr(vscr); +#endif cfg->vbr = gfp->VBR; cfg->error_protection = gfp->error_protection; diff --git libmp3lame/machine.h libmp3lame/machine.h index bf6fff2..4fc1e70 100644 --- libmp3lame/machine.h +++ libmp3lame/machine.h @@ -184,6 +184,24 @@ typedef FLOAT sample_t; # endif #endif +#if __ALTIVEC__ +#ifdef __APPLE_CC__ +#define VINIT4(a,b,c,d) (a,b,c,d) +#define VINIT8(a,b,c,d,e,f,g,h) (a,b,c,d,e,f,g,h) +#define 
VINIT16(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) (a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) +#define VINIT4ALL(a) (a,a,a,a) +#define VINIT8ALL(a) (a,a,a,a,a,a,a,a) +#define VINIT16ALL(a) (a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a) +#else +#define VINIT4(a,b,c,d) {a,b,c,d} +#define VINIT8(a,b,c,d,e,f,g,h) {a,b,c,d,e,f,g,h} +#define VINIT16(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) {a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p} +#define VINIT4ALL(a) {a,a,a,a} +#define VINIT8ALL(a) {a,a,a,a,a,a,a,a} +#define VINIT16ALL(a) {a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a} +#endif +#endif + #endif /* end of machine.h */ diff --git libmp3lame/newmdct.c libmp3lame/newmdct.c index 596cac9..328c38b 100644 --- libmp3lame/newmdct.c +++ libmp3lame/newmdct.c @@ -30,6 +30,10 @@ # include #endif +#if __ALTIVEC__ +#include +#endif + #include "lame.h" #include "machine.h" #include "encoder.h" @@ -39,7 +43,7 @@ #ifndef USE_GOGO_SUBBAND -static const FLOAT enwindow[] = { +static const FLOAT enwindow[] __attribute__ ((aligned (16))) = { -4.77e-07 * 0.740951125354959 / 2.384e-06, 1.03951e-04 * 0.740951125354959 / 2.384e-06, 9.53674e-04 * 0.740951125354959 / 2.384e-06, 2.841473e-03 * 0.740951125354959 / 2.384e-06, 3.5758972e-02 * 0.740951125354959 / 2.384e-06, 3.401756e-03 * 0.740951125354959 / 2.384e-06, 9.83715e-04 * 0.740951125354959 / 2.384e-06, 9.9182e-05 * 0.740951125354959 / 2.384e-06, /* 15 */ @@ -230,7 +234,7 @@ static const FLOAT enwindow[] = { #define NS 12 #define NL 36 -static const FLOAT win[4][NL] = { +static const FLOAT win[4][NL] __attribute__ ((aligned (16))) = { { 2.382191739347913e-13, 6.423305872147834e-13, @@ -435,6 +439,443 @@ window_subband(const sample_t * x1, FLOAT a[SBLIMIT]) const sample_t *x2 = &x1[238 - 14 - 286]; +#if __ALTIVEC__ + vector float v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15,v16; + vector float vw1,vw2,vw3,vw4,vw5,vw6,vw7,vw8,vs,vt,vzero; + vector unsigned char vperm2,vperm3,vperm4,vperm5; + vzero = vec_xor(vzero,vzero); + vperm5 = (vector unsigned char)VINIT16(12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3); + vperm2 = vec_lvsl(0,wp+8); + vperm3 = (vector unsigned char)VINIT16(0,1,2,3,4,5,6,7,24,25,26,27,28,29,30,31); + vperm4 = vec_lvsl(0,x1+1); + vperm4 = vec_perm(vperm4,vperm4,vperm5); + + for(i=0;i<3;i++) { + v1 = vec_ld(0,wp-10); + v2 = vec_ld(16,wp-10); + v5 = vec_ld(0,wp+8); + v6 = vec_ld(16,wp+8); + v7 = vec_ld(32,wp+8); + v3 = vec_ld(0,wp+26); + v4 = vec_ld(16,wp+26); + v8 = vec_ld(0,wp+44); + v9 = vec_ld(16,wp+44); + v10 = vec_ld(32,wp+44); + + v5 = vec_perm(v5,v6,vperm2); + v6 = vec_perm(v6,v7,vperm2); + v7 = vec_perm(v8,v9,vperm2); + v8 = vec_perm(v9,v10,vperm2); + v9 = vec_mergeh(v1,v3); + v10 = vec_mergeh(v2,v4); + v11 = vec_mergeh(v5,v7); + v12 = vec_mergeh(v6,v8); + v13 = vec_mergel(v1,v3); + v14 = vec_mergel(v2,v4); + v15 = vec_mergel(v5,v7); + v16 = vec_mergel(v6,v8); + vw1 = vec_mergeh(v9,v11); + vw5 = vec_mergeh(v10,v12); + vw2 = vec_mergel(v9,v11); + vw6 = vec_mergel(v10,v12); + vw3 = vec_mergeh(v13,v15); + vw7 = vec_mergeh(v14,v16); + vw4 = vec_mergel(v13,v15); + vw8 = vec_mergel(v14,v16); + + v3 = vec_ld(0,x2-224); + vs = vec_madd(vw1,v3,vzero); + v4 = vec_ld(16,x1+221); + v5 = vec_ld(0,x1+221); + v6 = vec_perm(v5,v4,vperm4); + vt = vec_madd(vw1,v6,vzero); + + v3 = vec_ld(0,x2-160); + vs = vec_madd(vw2,v3,vs); + v4 = vec_ld(16,x1+157); + v5 = vec_ld(0,x1+157); + v6 = vec_perm(v5,v4,vperm4); + vt = vec_madd(vw2,v6,vt); + + v3 = vec_ld(0,x2-96); + vs = vec_madd(vw3,v3,vs); + v4 = vec_ld(16,x1+93); + v5 = vec_ld(0,x1+93); + v6 = vec_perm(v5,v4,vperm4); + vt = vec_madd(vw3,v6,vt); + + v3 = vec_ld(0,x2-32); + vs = 
vec_madd(vw4,v3,vs); + v4 = vec_ld(16,x1+29); + v5 = vec_ld(0,x1+29); + v6 = vec_perm(v5,v4,vperm4); + vt = vec_madd(vw4,v6,vt); + + + v3 = vec_ld(0,x2+32); + vs = vec_madd(vw5,v3,vs); + v4 = vec_ld(16,x1-35); + v5 = vec_ld(0,x1-35); + v6 = vec_perm(v5,v4,vperm4); + vt = vec_madd(vw5,v6,vt); + + v3 = vec_ld(0,x2+96); + vs = vec_madd(vw6,v3,vs); + v4 = vec_ld(16,x1-99); + v5 = vec_ld(0,x1-99); + v6 = vec_perm(v5,v4,vperm4); + vt = vec_madd(vw6,v6,vt); + + v3 = vec_ld(0,x2+160); + vs = vec_madd(vw7,v3,vs); + v4 = vec_ld(16,x1-163); + v5 = vec_ld(0,x1-163); + v6 = vec_perm(v5,v4,vperm4); + vt = vec_madd(vw7,v6,vt); + + v3 = vec_ld(0,x2+224); + vs = vec_madd(vw8,v3,vs); + v4 = vec_ld(16,x1-227); + v5 = vec_ld(0,x1-227); + v6 = vec_perm(v5,v4,vperm4); + vt = vec_madd(vw8,v6,vt); + + + v1 = vec_ld(0,wp-2); + v2 = vec_ld(16,wp-2); + v5 = vec_ld(0,wp+16); + v6 = vec_ld(16,wp+16); + v7 = vec_ld(32,wp+16); + v3 = vec_ld(0,wp+34); + v4 = vec_ld(16,wp+34); + v8 = vec_ld(0,wp+52); + v9 = vec_ld(16,wp+52); + v10 = vec_ld(32,wp+52); + + v5 = vec_perm(v5,v6,vperm2); + v6 = vec_perm(v6,v7,vperm2); + v7 = vec_perm(v8,v9,vperm2); + v8 = vec_perm(v9,v10,vperm2); + v9 = vec_mergeh(v1,v3); + v10 = vec_mergeh(v2,v4); + v11 = vec_mergeh(v5,v7); + v12 = vec_mergeh(v6,v8); + v13 = vec_mergel(v1,v3); + v14 = vec_mergel(v2,v4); + v15 = vec_mergel(v5,v7); + v16 = vec_mergel(v6,v8); + vw1 = vec_mergeh(v9,v11); + vw5 = vec_mergeh(v10,v12); + vw2 = vec_mergel(v9,v11); + vw6 = vec_mergel(v10,v12); + vw3 = vec_mergeh(v13,v15); + vw7 = vec_mergeh(v14,v16); + vw4 = vec_mergel(v13,v15); + vw8 = vec_mergel(v14,v16); + + v3 = vec_ld(0,x2+256); + vt = vec_nmsub(vw1,v3,vt); + v4 = vec_ld(16,x1-259); + v5 = vec_ld(0,x1-259); + v6 = vec_perm(v5,v4,vperm4); + vs = vec_madd(vw1,v6,vs); + + v3 = vec_ld(0,x2+192); + vt = vec_nmsub(vw2,v3,vt); + v4 = vec_ld(16,x1-195); + v5 = vec_ld(0,x1-195); + v6 = vec_perm(v5,v4,vperm4); + vs = vec_madd(vw2,v6,vs); + + v3 = vec_ld(0,x2+128); + vt = vec_nmsub(vw3,v3,vt); + v4 = vec_ld(16,x1-131); + v5 = vec_ld(0,x1-131); + v6 = vec_perm(v5,v4,vperm4); + vs = vec_madd(vw3,v6,vs); + + v3 = vec_ld(0,x2+64); + vt = vec_nmsub(vw4,v3,vt); + v4 = vec_ld(16,x1-67); + v5 = vec_ld(0,x1-67); + v6 = vec_perm(v5,v4,vperm4); + vs = vec_madd(vw4,v6,vs); + + + v3 = vec_ld(0,x2); + vt = vec_nmsub(vw5,v3,vt); + v4 = vec_ld(16,x1-3); + v5 = vec_ld(0,x1-3); + v6 = vec_perm(v5,v4,vperm4); + vs = vec_madd(vw5,v6,vs); + + v3 = vec_ld(0,x2-64); + vt = vec_nmsub(vw6,v3,vt); + v4 = vec_ld(16,x1+61); + v5 = vec_ld(0,x1+61); + v6 = vec_perm(v5,v4,vperm4); + vs = vec_madd(vw6,v6,vs); + + v3 = vec_ld(0,x2-128); + vt = vec_nmsub(vw7,v3,vt); + v4 = vec_ld(16,x1+125); + v5 = vec_ld(0,x1+125); + v6 = vec_perm(v5,v4,vperm4); + vs = vec_madd(vw7,v6,vs); + + v3 = vec_ld(0,x2-192); + vt = vec_nmsub(vw8,v3,vt); + v4 = vec_ld(16,x1+189); + v5 = vec_ld(0,x1+189); + v6 = vec_perm(v5,v4,vperm4); + vs = vec_madd(vw8,v6,vs); + + /*end*/ + + v3 = vec_ld(0,wp+6); + + v4 = vec_ld(0,wp+24); + v5 = vec_ld(16,wp+24); + v6 = vec_perm(v4,v5,vperm2); + + v9 = vec_ld(0,wp+42); + + v10 = vec_ld(0,wp+60); + v11 = vec_ld(16,wp+60); + v12 = vec_perm(v10,v11,vperm2); + + v13 = vec_mergeh(v3,v9); + v14 = vec_mergeh(v6,v12);; + vw1 = vec_mergeh(v13,v14); + vw2 = vec_mergel(v13,v14); + + vs = vec_madd(vs,vw1,vzero); + v1 = vec_sub(vt,vs); + v2 = vec_add(vt,vs); + v3 = vec_madd(vw2,v1,vzero); + v4 = vec_mergeh(v2,v3); + v5 = vec_mergel(v2,v3); + vec_st(v4,0,a+i*8); + vec_st(v5,16,a+i*8); + + wp += 72; + x1-=4; + x2+=4; + } + + v1 = vec_ld(0,wp-10); + v2 = 
vec_ld(16,wp-10); + v5 = vec_ld(0,wp+8); + v6 = vec_ld(16,wp+8); + v7 = vec_ld(32,wp+8); + v3 = vec_ld(0,wp+26); + v4 = vec_ld(16,wp+26); + v8 = vec_ld(0,wp+44); + v9 = vec_ld(16,wp+44); + v10 = vec_ld(32,wp+44); + + v5 = vec_perm(v5,v6,vperm2); + v6 = vec_perm(v6,v7,vperm2); + v7 = vec_perm(v8,v9,vperm2); + v8 = vec_perm(v9,v10,vperm2); + v9 = vec_mergeh(v1,v3); + v10 = vec_mergeh(v2,v4); + v11 = vec_mergeh(v5,v7); + v12 = vec_mergeh(v6,v8); + v13 = vec_mergel(v1,v3); + v14 = vec_mergel(v2,v4); + v15 = vec_mergel(v5,v7); + v16 = vec_mergel(v6,v8); + vw1 = vec_mergeh(v9,v11); + vw5 = vec_mergeh(v10,v12); + vw2 = vec_mergel(v9,v11); + vw6 = vec_mergel(v10,v12); + vw3 = vec_mergeh(v13,v15); + vw7 = vec_mergeh(v14,v16); + vw4 = vec_mergel(v13,v15); + vw8 = vec_mergel(v14,v16); + + v3 = vec_ld(0,x2-224); + vs = vec_madd(vw1,v3,vzero); + v4 = vec_ld(16,x1+221); + v5 = vec_ld(0,x1+221); + v6 = vec_perm(v5,v4,vperm4); + vt = vec_madd(vw1,v6,vzero); + + v3 = vec_ld(0,x2-160); + vs = vec_madd(vw2,v3,vs); + v4 = vec_ld(16,x1+157); + v5 = vec_ld(0,x1+157); + v6 = vec_perm(v5,v4,vperm4); + vt = vec_madd(vw2,v6,vt); + + v3 = vec_ld(0,x2-96); + vs = vec_madd(vw3,v3,vs); + v4 = vec_ld(16,x1+93); + v5 = vec_ld(0,x1+93); + v6 = vec_perm(v5,v4,vperm4); + vt = vec_madd(vw3,v6,vt); + + v3 = vec_ld(0,x2-32); + vs = vec_madd(vw4,v3,vs); + v4 = vec_ld(16,x1+29); + v5 = vec_ld(0,x1+29); + v6 = vec_perm(v5,v4,vperm4); + vt = vec_madd(vw4,v6,vt); + + + v3 = vec_ld(0,x2+32); + vs = vec_madd(vw5,v3,vs); + v4 = vec_ld(16,x1-35); + v5 = vec_ld(0,x1-35); + v6 = vec_perm(v5,v4,vperm4); + vt = vec_madd(vw5,v6,vt); + + v3 = vec_ld(0,x2+96); + vs = vec_madd(vw6,v3,vs); + v4 = vec_ld(16,x1-99); + v5 = vec_ld(0,x1-99); + v6 = vec_perm(v5,v4,vperm4); + vt = vec_madd(vw6,v6,vt); + + v3 = vec_ld(0,x2+160); + vs = vec_madd(vw7,v3,vs); + v4 = vec_ld(16,x1-163); + v5 = vec_ld(0,x1-163); + v6 = vec_perm(v5,v4,vperm4); + vt = vec_madd(vw7,v6,vt); + + v3 = vec_ld(0,x2+224); + vs = vec_madd(vw8,v3,vs); + v4 = vec_ld(16,x1-227); + v5 = vec_ld(0,x1-227); + v6 = vec_perm(v5,v4,vperm4); + vt = vec_madd(vw8,v6,vt); + + + v1 = vec_ld(0,wp-2); + v2 = vec_ld(16,wp-2); + v5 = vec_ld(0,wp+16); + v6 = vec_ld(16,wp+16); + v7 = vec_ld(32,wp+16); + v3 = vec_ld(0,wp+34); + v4 = vec_ld(16,wp+34); + v8 = vec_ld(0,wp+52); + v9 = vec_ld(16,wp+52); + v10 = vec_ld(32,wp+52); + + v5 = vec_perm(v5,v6,vperm2); + v6 = vec_perm(v6,v7,vperm2); + v7 = vec_perm(v8,v9,vperm2); + v8 = vec_perm(v9,v10,vperm2); + v9 = vec_mergeh(v1,v3); + v10 = vec_mergeh(v2,v4); + v11 = vec_mergeh(v5,v7); + v12 = vec_mergeh(v6,v8); + v13 = vec_mergel(v1,v3); + v14 = vec_mergel(v2,v4); + v15 = vec_mergel(v5,v7); + v16 = vec_mergel(v6,v8); + vw1 = vec_mergeh(v9,v11); + vw5 = vec_mergeh(v10,v12); + vw2 = vec_mergel(v9,v11); + vw6 = vec_mergel(v10,v12); + vw3 = vec_mergeh(v13,v15); + vw7 = vec_mergeh(v14,v16); + vw4 = vec_mergel(v13,v15); + vw8 = vec_mergel(v14,v16); + + v3 = vec_ld(0,x2+256); + vt = vec_nmsub(vw1,v3,vt); + v4 = vec_ld(16,x1-259); + v5 = vec_ld(0,x1-259); + v6 = vec_perm(v5,v4,vperm4); + vs = vec_madd(vw1,v6,vs); + + v3 = vec_ld(0,x2+192); + vt = vec_nmsub(vw2,v3,vt); + v4 = vec_ld(16,x1-195); + v5 = vec_ld(0,x1-195); + v6 = vec_perm(v5,v4,vperm4); + vs = vec_madd(vw2,v6,vs); + + v3 = vec_ld(0,x2+128); + vt = vec_nmsub(vw3,v3,vt); + v4 = vec_ld(16,x1-131); + v5 = vec_ld(0,x1-131); + v6 = vec_perm(v5,v4,vperm4); + vs = vec_madd(vw3,v6,vs); + + v3 = vec_ld(0,x2+64); + vt = vec_nmsub(vw4,v3,vt); + v4 = vec_ld(16,x1-67); + v5 = vec_ld(0,x1-67); + v6 = vec_perm(v5,v4,vperm4); 
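    /* vs and vt accumulate the scalar loop's s and t for four subbands at
       once: the mergeh/mergel block above transposes the window coefficients
       so that each vector lane belongs to a different subband, and every
       unaligned, descending x1[] access is assembled from two aligned
       vec_ld()s (low and high 16-byte blocks) merged by vec_perm() with
       vperm4 (a byte-reversed vec_lvsl() pattern), which also delivers the
       four samples in the descending-index order the scalar code reads. */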
+ vs = vec_madd(vw4,v6,vs); + + + v3 = vec_ld(0,x2); + vt = vec_nmsub(vw5,v3,vt); + v4 = vec_ld(16,x1-3); + v5 = vec_ld(0,x1-3); + v6 = vec_perm(v5,v4,vperm4); + vs = vec_madd(vw5,v6,vs); + + v3 = vec_ld(0,x2-64); + vt = vec_nmsub(vw6,v3,vt); + v4 = vec_ld(16,x1+61); + v5 = vec_ld(0,x1+61); + v6 = vec_perm(v5,v4,vperm4); + vs = vec_madd(vw6,v6,vs); + + v3 = vec_ld(0,x2-128); + vt = vec_nmsub(vw7,v3,vt); + v4 = vec_ld(16,x1+125); + v5 = vec_ld(0,x1+125); + v6 = vec_perm(v5,v4,vperm4); + vs = vec_madd(vw7,v6,vs); + + v3 = vec_ld(0,x2-192); + vt = vec_nmsub(vw8,v3,vt); + v4 = vec_ld(16,x1+189); + v5 = vec_ld(0,x1+189); + v6 = vec_perm(v5,v4,vperm4); + vs = vec_madd(vw8,v6,vs); + + /*end*/ + + v3 = vec_ld(0,wp+6); + + v4 = vec_ld(0,wp+24); + v5 = vec_ld(16,wp+24); + v6 = vec_perm(v4,v5,vperm2); + + v9 = vec_ld(0,wp+42); + + v10 = vec_ld(0,wp+60); + v11 = vec_ld(16,wp+60); + v12 = vec_perm(v10,v11,vperm2); + + v13 = vec_mergeh(v3,v9); + v14 = vec_mergeh(v6,v12);; + vw1 = vec_mergeh(v13,v14); + vw2 = vec_mergel(v13,v14); + + vs = vec_madd(vs,vw1,vzero); + v1 = vec_sub(vt,vs); + v2 = vec_add(vt,vs); + v3 = vec_madd(vw2,v1,vzero); + v4 = vec_ld(16,a+24); + v5 = vec_mergeh(v2,v3); + v6 = vec_mergel(v2,v3); + v7 = vec_perm(v6,v4,vperm3); + vec_st(v5,0,a+24); + vec_st(v7,16,a+24); + + wp += 54; + x1-=3; + x2+=3; +#else for (i = -15; i < 0; i++) { FLOAT w, s, t; @@ -501,6 +942,7 @@ window_subband(const sample_t * x1, FLOAT a[SBLIMIT]) x1--; x2++; } +#endif { FLOAT s, t, u, v; t = x1[-16] * wp[-10]; diff --git libmp3lame/psymodel.c libmp3lame/psymodel.c index 60076ee..a168605 100644 --- libmp3lame/psymodel.c +++ libmp3lame/psymodel.c @@ -146,6 +146,10 @@ blocktype_d[2] block type to use for previous granule #include +#if __ALTIVEC__ +#include +#endif + #include "lame.h" #include "machine.h" #include "encoder.h" @@ -164,6 +168,48 @@ blocktype_d[2] block type to use for previous granule #define LN_TO_LOG10 0.2302585093 #endif +#if __ALTIVEC__ +static inline vector float fast_log10_altivec_2(vector float v3) +{ + vector float va,vb,vc,vhalf,vzero,vsqrt2,vconst4; + vector float v1,v2,v4,v5,v6,v7,v8,vz,vz2,vlog; + vector unsigned int vconst1,vconst2,vshamt; + vector signed int vconst3; + + va = (vector float)VINIT4ALL(0.8685890659); + vb = (vector float)VINIT4ALL(0.2894672153); + vc = (vector float)VINIT4ALL(0.1793365895); + vhalf = (vector float)VINIT4ALL(0.15051499783); + vsqrt2 = (vector float)VINIT4ALL(1.4142135623731); + vconst4 = (vector float)VINIT4ALL(0.301029995664); + vzero = vec_xor(vzero,vzero); + vconst1 = (vector unsigned int)vec_sr(vec_splat_s32(-1),vec_splat_u32(9)); + vconst2 = (vector unsigned int)vec_sr(vec_splat_s32(-1),vec_splat_u32(7)); + vconst2 = vec_nor(vconst2,vconst2); + vconst3 = (vector signed int)vec_rl(vconst2,vec_splat_u32(7)); + vshamt = vec_add(vec_splat_u32(9),vec_splat_u32(7)); + vshamt = vec_add(vshamt,vec_splat_u32(7)); + vconst2 = vec_sl((vector unsigned int)vconst3,vshamt); + + v4 = (vector float)vec_sel(vconst2,(vector unsigned int)v3,vconst1); + v5 = vec_add(v4,vsqrt2); + v6 = vec_sub(v4,vsqrt2); + v7 = vec_re(v5); + vz = vec_madd(v6, vec_madd(vec_nmsub(v7,v5,(vector float)vconst2),v7,v7), vzero); + v8 = (vector float)vec_sr((vector unsigned int)v3,vshamt); + vlog = vec_ctf(vec_sub((vector signed int)v8,vconst3),0); + + vz2 = vec_madd(vz,vz,vzero); + vlog = vec_madd(vlog,vconst4,vhalf); + + v1 = vec_madd(vz2,vc,vb); + v2 = vec_madd(vz2,v1,va); + vlog = vec_madd(vz,v2,vlog); + + return vlog; +} +#endif + /* L3psycho_anal. Compute psycho acoustics. 
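The fast_log10_altivec_2() helper above computes four log10() values per call:
it splits each float into exponent e and mantissa m in [1,2), forms
z = (m - sqrt(2)) / (m + sqrt(2)) from a vec_re() estimate refined by one
Newton-Raphson step, and returns (e + 1/2)*log10(2) + z*(a + z^2*(b + z^2*c))
with the constants visible in the code.  A scalar model of the same
approximation, assuming IEEE-754 single precision and finite x > 0 (the
function name is illustrative, not part of the patch):

    #include <stdint.h>
    #include <string.h>

    /* Scalar model of fast_log10_altivec_2(); valid for finite x > 0. */
    static float approx_log10f(float x)
    {
        uint32_t u;
        float    m, z, z2;
        int      e;

        memcpy(&u, &x, sizeof u);                   /* reinterpret the float bits  */
        e = (int) ((u >> 23) & 0xff) - 127;         /* unbiased exponent           */
        u = (u & 0x007fffffu) | 0x3f800000u;        /* force mantissa into [1,2)   */
        memcpy(&m, &u, sizeof m);

        z  = (m - 1.4142135623731f) / (m + 1.4142135623731f);
        z2 = z * z;
        return ((float) e + 0.5f) * 0.301029995664f /* (e + 1/2) * log10(2)        */
             + z * (0.8685890659f + z2 * (0.2894672153f + z2 * 0.1793365895f));
    }

For example, x = 10 evaluates to roughly 1.000001, i.e. about a 1e-6 absolute
error, which is the same accuracy the vector version delivers.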
@@ -253,6 +299,11 @@ static const FLOAT ma_max_i1 = 3.6517412725483771; static const FLOAT ma_max_i2 = 31.622776601683793; /* pow(10, (MLIMIT) / 10.0); */ static const FLOAT ma_max_m = 31.622776601683793; +#if __ALTIVEC__ +static const vector float vmamax1 = (vector float)VINIT4ALL(3.651741); +static const vector float vmamax2 = (vector float)VINIT4ALL(31.622777); +#endif + /*This is the masking table: According to tonality, values are going from 0dB (TMN) @@ -666,6 +717,14 @@ static void vbrpsy_compute_fft_l(lame_internal_flags * gfc, const sample_t * const buffer[2], int chn, int gr_out, FLOAT fftenergy[HBLKSIZE], FLOAT(*wsamp_l)[BLKSIZE]) { +#if __ALTIVEC__ + vector float v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,vhalf,vprev,vzero,vsqrt2; + vector unsigned char vperm; + vhalf = vec_ctf(vec_splat_s32(1),1); + vsqrt2 = (vector float)VINIT4ALL(0.7071067811865001); + vzero = vec_xor(vzero,vzero); + vperm = (vector unsigned char)VINIT16(0,1,2,3,28,29,30,31,24,25,26,27,20,21,22,23); +#endif SessionConfig_t const *const cfg = &gfc->cfg; PsyStateVar_t *psv = &gfc->sv_psy; plotting_data *plt = cfg->analysis ? gfc->pinfo : 0; @@ -675,19 +734,80 @@ vbrpsy_compute_fft_l(lame_internal_flags * gfc, const sample_t * const buffer[2] fft_long(gfc, *wsamp_l, chn, buffer); } else if (chn == 2) { - FLOAT const sqrt2_half = SQRT2 * 0.5f; /* FFT data for mid and side channel is derived from L & R */ +#if __ALTIVEC__ + for(j = 0; j < BLKSIZE; j += 8) { + v1 = vec_ld(0,wsamp_l[0]+j); + v2 = vec_ld(0,wsamp_l[1]+j); + v3 = vec_ld(16,wsamp_l[0]+j); + v4 = vec_ld(16,wsamp_l[1]+j); + + v5 = vec_add(v1,v2); + v6 = vec_sub(v1,v2); + v7 = vec_add(v3,v4); + v8 = vec_sub(v3,v4); + v9 = vec_madd(v5,vsqrt2,vzero); + v10 = vec_madd(v6,vsqrt2,vzero); + v11 = vec_madd(v7,vsqrt2,vzero); + v12 = vec_madd(v8,vsqrt2,vzero); + + vec_st(v9,0,wsamp_l[0]+j); + vec_st(v10,0,wsamp_l[1]+j); + vec_st(v11,16,wsamp_l[0]+j); + vec_st(v12,16,wsamp_l[1]+j); + } +#else + FLOAT const sqrt2_half = SQRT2 * 0.5f; for (j = BLKSIZE - 1; j >= 0; --j) { FLOAT const l = wsamp_l[0][j]; FLOAT const r = wsamp_l[1][j]; wsamp_l[0][j] = (l + r) * sqrt2_half; wsamp_l[1][j] = (l - r) * sqrt2_half; } +#endif } /********************************************************************* * compute energies *********************************************************************/ +#if __ALTIVEC__ + vprev = vec_ld(0,(*wsamp_l)); + for(j = 0; j < BLKSIZE/2; j += 16) { + v1 = vec_ld(0,(*wsamp_l)+j); + v2 = vec_ld(16,(*wsamp_l)+j); + v3 = vec_ld(32,(*wsamp_l)+j); + v4 = vec_ld(48,(*wsamp_l)+j); + v5 = vec_ld(48,(*wsamp_l)+1008-j); + v6 = vec_ld(32,(*wsamp_l)+1008-j); + v7 = vec_ld(16,(*wsamp_l)+1008-j); + v8 = vec_ld(0,(*wsamp_l)+1008-j); + v9 = vec_perm(vprev,v5,vperm); + v10 = vec_perm(v5,v6,vperm); + v11 = vec_perm(v6,v7,vperm); + v12 = vec_perm(v7,v8,vperm); + vprev = v8; + v1 = vec_madd(v1,v1,vzero); + v2 = vec_madd(v2,v2,vzero); + v3 = vec_madd(v3,v3,vzero); + v4 = vec_madd(v4,v4,vzero); + v5 = vec_madd(v9,v9,v1); + v6 = vec_madd(v10,v10,v2); + v7 = vec_madd(v11,v11,v3); + v8 = vec_madd(v12,v12,v4); + v9 = vec_madd(v5,vhalf,vzero); + v10 = vec_madd(v6,vhalf,vzero); + v11 = vec_madd(v7,vhalf,vzero); + v12 = vec_madd(v8,vhalf,vzero); + + vec_st(v9,0,fftenergy+j); + vec_st(v10,16,fftenergy+j); + vec_st(v11,32,fftenergy+j); + vec_st(v12,48,fftenergy+j); + } + + v1 = vec_madd(vprev,vprev,vzero); + vec_ste(v1,0,fftenergy+j); +#else fftenergy[0] = wsamp_l[0][0]; fftenergy[0] *= fftenergy[0]; @@ -696,13 +816,51 @@ vbrpsy_compute_fft_l(lame_internal_flags * gfc, const sample_t * 
const buffer[2] FLOAT const im = (*wsamp_l)[BLKSIZE / 2 + j]; fftenergy[BLKSIZE / 2 - j] = (re * re + im * im) * 0.5f; } +#endif /* total energy */ { +#if __ALTIVEC__ +#if _ARCH_PPC64 + v5 = vec_ld(0,fftenergy+8); + v6 = vec_ld(0,fftenergy+508); + v7 = vec_ld(0,fftenergy+512); + v8 = vec_xor(v8,v8); + v5 = vec_sld(v5,v8,12); + v7 = vec_sld(v8,v7,4); +#else + v5 = vec_lde(0,fftenergy+11); + v6 = vec_ld(0,fftenergy+508); + v7 = vec_lde(0,fftenergy+512); + v8 = vec_xor(v8,v8); +#endif + for(j=12;j<508;j+=16) { + v1 = vec_ld(0,fftenergy+j); + v2 = vec_ld(16,fftenergy+j); + v3 = vec_ld(32,fftenergy+j); + v4 = vec_ld(48,fftenergy+j); + v5 = vec_add(v1,v5); + v6 = vec_add(v2,v6); + v7 = vec_add(v3,v7); + v8 = vec_add(v4,v8); + } + v5 = vec_add(v5,v6); + v7 = vec_add(v7,v8); + v5 = vec_add(v5,v7); + v6 = vec_sld(v5,v5,4); + v7 = vec_sld(v5,v5,8); + v8 = vec_sld(v5,v5,12); + v5 = vec_add(v5,v6); + v7 = vec_add(v7,v8); + v5 = vec_add(v5,v7); + v5 = vec_perm(v5,v5,vec_lvsr(0, psv->tot_ener+chn)); + vec_ste(v5,0,psv->tot_ener+chn); +#else FLOAT totalenergy = 0.0f; for (j = 11; j < HBLKSIZE; j++) totalenergy += fftenergy[j]; psv->tot_ener[chn] = totalenergy; +#endif } if (plt) { @@ -716,27 +874,96 @@ vbrpsy_compute_fft_l(lame_internal_flags * gfc, const sample_t * const buffer[2] static void vbrpsy_compute_fft_s(lame_internal_flags const *gfc, const sample_t * const buffer[2], int chn, - int sblock, FLOAT(*fftenergy_s)[HBLKSIZE_s], FLOAT(*wsamp_s)[3][BLKSIZE_s]) + int sblock, FLOAT(*fftenergy_s)[HBLKSIZE_s+3], FLOAT(*wsamp_s)[3][BLKSIZE_s]) { +#if __ALTIVEC__ + vector float v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,vhalf,vprev,vzero,vsqrt2; + vector unsigned char vperm; + vhalf = vec_ctf(vec_splat_s32(1),1); + vsqrt2 = (vector float)VINIT4ALL(0.7071067811865001); + vzero = vec_xor(vzero,vzero); + vperm = (vector unsigned char)VINIT16(0,1,2,3,28,29,30,31,24,25,26,27,20,21,22,23); +#endif int j; if (sblock == 0 && chn < 2) { fft_short(gfc, *wsamp_s, chn, buffer); } if (chn == 2) { - FLOAT const sqrt2_half = SQRT2 * 0.5f; /* FFT data for mid and side channel is derived from L & R */ +#if __ALTIVEC__ + for(j = 0; j < BLKSIZE_s; j += 8) { + v1 = vec_ld(0,wsamp_s[0][sblock]+j); + v2 = vec_ld(0,wsamp_s[1][sblock]+j); + v3 = vec_ld(16,wsamp_s[0][sblock]+j); + v4 = vec_ld(16,wsamp_s[1][sblock]+j); + + v5 = vec_add(v1,v2); + v6 = vec_sub(v1,v2); + v7 = vec_add(v3,v4); + v8 = vec_sub(v3,v4); + v9 = vec_madd(v5,vsqrt2,vzero); + v10 = vec_madd(v6,vsqrt2,vzero); + v11 = vec_madd(v7,vsqrt2,vzero); + v12 = vec_madd(v8,vsqrt2,vzero); + + vec_st(v9,0,wsamp_s[0][sblock]+j); + vec_st(v10,0,wsamp_s[1][sblock]+j); + vec_st(v11,16,wsamp_s[0][sblock]+j); + vec_st(v12,16,wsamp_s[1][sblock]+j); + } +#else + FLOAT const sqrt2_half = SQRT2 * 0.5f; for (j = BLKSIZE_s - 1; j >= 0; --j) { FLOAT const l = wsamp_s[0][sblock][j]; FLOAT const r = wsamp_s[1][sblock][j]; wsamp_s[0][sblock][j] = (l + r) * sqrt2_half; wsamp_s[1][sblock][j] = (l - r) * sqrt2_half; } +#endif } /********************************************************************* * compute energies *********************************************************************/ +#if __ALTIVEC__ + vprev = vec_ld(0,(*wsamp_s)[sblock]); + for(j = 0; j < BLKSIZE_s/2; j += 16) { + v1 = vec_ld(0,(*wsamp_s)[sblock]+j); + v2 = vec_ld(16,(*wsamp_s)[sblock]+j); + v3 = vec_ld(32,(*wsamp_s)[sblock]+j); + v4 = vec_ld(48,(*wsamp_s)[sblock]+j); + v5 = vec_ld(48,(*wsamp_s)[sblock]+240-j); + v6 = vec_ld(32,(*wsamp_s)[sblock]+240-j); + v7 = vec_ld(16,(*wsamp_s)[sblock]+240-j); + v8 = 
vec_ld(0,(*wsamp_s)[sblock]+240-j); + v9 = vec_perm(vprev,v5,vperm); + v10 = vec_perm(v5,v6,vperm); + v11 = vec_perm(v6,v7,vperm); + v12 = vec_perm(v7,v8,vperm); + vprev = v8; + v1 = vec_madd(v1,v1,vzero); + v2 = vec_madd(v2,v2,vzero); + v3 = vec_madd(v3,v3,vzero); + v4 = vec_madd(v4,v4,vzero); + v5 = vec_madd(v9,v9,v1); + v6 = vec_madd(v10,v10,v2); + v7 = vec_madd(v11,v11,v3); + v8 = vec_madd(v12,v12,v4); + v9 = vec_madd(v5,vhalf,vzero); + v10 = vec_madd(v6,vhalf,vzero); + v11 = vec_madd(v7,vhalf,vzero); + v12 = vec_madd(v8,vhalf,vzero); + + vec_st(v9,0,fftenergy_s[sblock]+j); + vec_st(v10,16,fftenergy_s[sblock]+j); + vec_st(v11,32,fftenergy_s[sblock]+j); + vec_st(v12,48,fftenergy_s[sblock]+j); + } + + v1 = vec_madd(vprev,vprev,vzero); + vec_ste(v1,0,fftenergy_s[sblock]+j); +#else fftenergy_s[sblock][0] = (*wsamp_s)[sblock][0]; fftenergy_s[sblock][0] *= fftenergy_s[sblock][0]; for (j = BLKSIZE_s / 2 - 1; j >= 0; --j) { @@ -744,6 +971,7 @@ vbrpsy_compute_fft_s(lame_internal_flags const *gfc, const sample_t * const buff FLOAT const im = (*wsamp_s)[sblock][BLKSIZE_s / 2 + j]; fftenergy_s[sblock][BLKSIZE_s / 2 - j] = (re * re + im * im) * 0.5f; } +#endif } @@ -772,7 +1000,24 @@ vbrpsy_attack_detection(lame_internal_flags * gfc, const sample_t * const buffer FLOAT energy[4], FLOAT sub_short_factor[4][3], int ns_attacks[4][4], int uselongblock[2]) { - FLOAT ns_hpfsmpl[2][576]; +#if __ALTIVEC__ + vector float v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15,v16; + vector float vsum,vsum1,vsum2,vsuma,vsumb,vsumc,vsumd,vmaska,vmaskb,vmaskc,vmaskd; + vector unsigned char vmask1,vmask2,vmask3,vmask4,vmask1inv,vmask2inv,vmask3inv,vmask4inv,vperm,vs4,vs8,vs12; + + vperm = (vector unsigned char)VINIT16(12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3); + v1 = (vector float)vec_splat_u8(1); + v2 = (vector float)vec_splat_u8(5); + vs4 = vec_sl((vector unsigned char)v1,(vector unsigned char)v2); + vs8 = vec_sl(vs4,(vector unsigned char)v1); + vs12 = vec_or(vs4,vs8); + v3 = (vector float)vec_splat_s32(-1); + vmaska = vec_slo(v3,vs12); + vmaskb = vec_sro(vmaska,vs4); + vmaskc = vec_sro(vmaska,vs8); + vmaskd = vec_sro(vmaska,vs12); +#endif + FLOAT ns_hpfsmpl[2][576] __attribute__ ((aligned (16))); SessionConfig_t const *const cfg = &gfc->cfg; PsyStateVar_t *const psv = &gfc->sv_psy; plotting_data *plt = cfg->analysis ? 
gfc->pinfo : 0; @@ -785,14 +1030,142 @@ vbrpsy_attack_detection(lame_internal_flags * gfc, const sample_t * const buffer /* Don't copy the input buffer into a temporary buffer */ /* unroll the loop 2 times */ for (chn = 0; chn < n_chn_out; chn++) { - static const FLOAT fircoef[] = { + static const FLOAT fircoef[] __attribute__ ((aligned (16))) = { -8.65163e-18 * 2, -0.00851586 * 2, -6.74764e-18 * 2, 0.0209036 * 2, -3.36639e-17 * 2, -0.0438162 * 2, -1.54175e-17 * 2, 0.0931738 * 2, - -5.52212e-17 * 2, -0.313819 * 2 + -5.52212e-17 * 2, -0.313819 * 2, 0.0, 0.0 }; /* apply high pass filter of fs/4 */ const sample_t *const firbuf = &buffer[chn][576 - 350 - NSFIRLEN + 192]; - assert(dimension_of(fircoef) == ((NSFIRLEN - 1) / 2)); + //assert(dimension_of(fircoef) == ((NSFIRLEN - 1) / 2)); +#if __ALTIVEC__ + v1 = vec_ld(0, firbuf+10); + vmask1 = vec_lvsl(0, firbuf); + vmask2 = vec_lvsl(0, firbuf+1); + vmask3 = vec_lvsl(0, firbuf+2); + vmask4 = vec_lvsl(0, firbuf+3); + vmask1inv = vec_perm(vmask1,vmask1,vperm); + vmask2inv = vec_perm(vmask2,vmask2,vperm); + vmask3inv = vec_perm(vmask3,vmask3,vperm); + vmask4inv = vec_perm(vmask4,vmask4,vperm); + for(i=0;i<576;) { + v2 = vec_ld(16,firbuf+i+10); + vsum1 = vec_perm(v1, v2, vmask3); + v1 = v2; + + vsum2 = vec_splat(vsum1, 0); + vsum = vec_and(vsum2, vmaska); + v3 = vec_ld(0, firbuf+i); + v4 = vec_ld(16,firbuf+i+NSFIRLEN-3); + for(j=0;j<(NSFIRLEN-1)/2;j+=4) { + v5 = vec_ld(16, firbuf+i+j); + v6 = vec_ld(0, firbuf+i+NSFIRLEN-3-j); + v7 = vec_perm(v3,v5,vmask1); + v8 = vec_perm(v6,v4,vmask3inv); + v3 = v5; + v4 = v6; + v10 = vec_ld(0,fircoef+j); + v11 = vec_add(v7,v8); + vsum = vec_madd(v10,v11,vsum); + } + + v12 = vec_slo(vsum,vs4); + v13 = vec_slo(vsum,vs8); + v14 = vec_slo(vsum,vs12); + v15 = vec_add(vsum,v12); + v16 = vec_add(v13,v14); + vsuma = vec_add(v15,v16); + vsuma = vec_and(vsuma,vmaska); + + i++; + + vsum2 = vec_splat(vsum1, 1); + vsum = vec_and(vsum2, vmaska); + v3 = vec_ld(0, firbuf+i); + v4 = vec_ld(16,firbuf+i+NSFIRLEN-3); + vmask2 = vec_lvsl(0, firbuf+i); + for(j=0;j<(NSFIRLEN-1)/2;j+=4) { + v5 = vec_ld(16, firbuf+i+j); + v6 = vec_ld(0, firbuf+i+NSFIRLEN-3-j); + v7 = vec_perm(v3,v5,vmask2); + v8 = vec_perm(v6,v4,vmask4inv); + v3 = v5; + v4 = v6; + v10 = vec_ld(0,fircoef+j); + v11 = vec_add(v7,v8); + vsum = vec_madd(v10,v11,vsum); + } + + v12 = vec_sro(vsum,vs4); + v13 = vec_slo(vsum,vs4); + v14 = vec_slo(vsum,vs8); + v15 = vec_add(vsum,v12); + v16 = vec_add(v13,v14); + vsumb = vec_add(v15,v16); + vsumb = vec_and(vsumb,vmaskb); + + i++; + + vsum2 = vec_splat(vsum1, 2); + vsum = vec_and(vsum2, vmaska); + v3 = vec_ld(0, firbuf+i); + v4 = vec_ld(16,firbuf+i+NSFIRLEN-3); + vmask2 = vec_lvsl(0, firbuf+i); + for(j=0;j<(NSFIRLEN-1)/2;j+=4) { + v5 = vec_ld(16, firbuf+i+j); + v6 = vec_ld(0, firbuf+i+NSFIRLEN-3-j); + v7 = vec_perm(v3,v5,vmask3); + v8 = vec_perm(v6,v4,vmask1inv); + v3 = v5; + v4 = v6; + v10 = vec_ld(0,fircoef+j); + v11 = vec_add(v7,v8); + vsum = vec_madd(v10,v11,vsum); + } + + v12 = vec_sro(vsum,vs4); + v13 = vec_sro(vsum,vs8); + v14 = vec_slo(vsum,vs4); + v15 = vec_add(vsum,v12); + v16 = vec_add(v13,v14); + vsumc = vec_add(v15,v16); + vsumc = vec_and(vsumc,vmaskc); + + i++; + + vsum2 = vec_splat(vsum1, 3); + vsum = vec_and(vsum2, vmaska); + v3 = vec_ld(0, firbuf+i); + v4 = vec_ld(16,firbuf+i+NSFIRLEN-3); + vmask2 = vec_lvsl(0, firbuf+i); + for(j=0;j<(NSFIRLEN-1)/2;j+=4) { + v5 = vec_ld(16, firbuf+i+j); + v6 = vec_ld(0, firbuf+i+NSFIRLEN-3-j); + v7 = vec_perm(v3,v5,vmask4); + v8 = vec_perm(v6,v4,vmask2inv); + v3 = v5; + v4 = v6; + v10 
= vec_ld(0,fircoef+j); + v11 = vec_add(v7,v8); + vsum = vec_madd(v10,v11,vsum); + } + + v12 = vec_sro(vsum,vs4); + v13 = vec_sro(vsum,vs8); + v14 = vec_sro(vsum,vs12); + v15 = vec_add(vsum,v12); + v16 = vec_add(v13,v14); + vsumd = vec_add(v15,v16); + vsumd = vec_and(vsumd,vmaskd); + + vsum1 = vec_or(vsuma,vsumb); + vsum2 = vec_or(vsumc,vsumd); + vsum = vec_or(vsum1,vsum2); + + i++; + vec_st(vsum,0,ns_hpfsmpl[chn]+i-4); + } +#else for (i = 0; i < 576; i++) { FLOAT sum1, sum2; sum1 = firbuf[i + 10]; @@ -803,6 +1176,7 @@ vbrpsy_attack_detection(lame_internal_flags * gfc, const sample_t * const buffer } ns_hpfsmpl[chn][i] = sum1 + sum2; } +#endif masking_ratio[gr_out][chn].en = psv->en[chn]; masking_ratio[gr_out][chn].thm = psv->thm[chn]; if (n_chn_psy > 2) { @@ -841,9 +1215,28 @@ vbrpsy_attack_detection(lame_internal_flags * gfc, const sample_t * const buffer for (i = 0; i < 9; i++) { FLOAT const *const pfe = pf + 576 / 9; FLOAT p = 1.; +#if __ALTIVEC__ + FLOAT vmax[4] __attribute__ ((aligned (16))); + v1 = (vector float)vec_splat_s32(1); + v2 = vec_ctf((vector signed int)v1,0); + for (; pf < pfe; pf+=4) { + v3 = vec_ld(0,pf); + v4 = vec_abs(v3); + v2 = vec_max(v2,v4); + } + v5 = vec_slo(v2,vs4); + v6 = vec_slo(v2,vs8); + v7 = vec_slo(v2,vs12); + v8 = vec_max(v2,v5); + v9 = vec_max(v6,v7); + v10 =vec_max(v8,v9); + vec_st(v10,0,vmax); + p = vmax[0]; +#else for (; pf < pfe; pf++) if (p < fabs(*pf)) p = fabs(*pf); +#endif psv->last_en_subshort[chn][i] = en_subshort[i + 3] = p; en_short[1 + i / 3] += p; if (p > en_subshort[i + 3 - 2]) { @@ -1039,7 +1432,7 @@ vbrpsy_calc_mask_index_s(lame_internal_flags const *gfc, FLOAT const *max, static void -vbrpsy_compute_masking_s(lame_internal_flags * gfc, const FLOAT(*fftenergy_s)[HBLKSIZE_s], +vbrpsy_compute_masking_s(lame_internal_flags * gfc, const FLOAT(*fftenergy_s)[HBLKSIZE_s+3], FLOAT * eb, FLOAT * thr, int chn, int sblock) { PsyStateVar_t *const psv = &gfc->sv_psy; @@ -1147,24 +1540,286 @@ vbrpsy_compute_masking_l(lame_internal_flags * gfc, const FLOAT fftenergy[HBLKSI { PsyStateVar_t *const psv = &gfc->sv_psy; PsyConst_CB2SB_t const *const gdl = &gfc->cd_psy->l; - FLOAT max[CBANDS], avg[CBANDS]; - unsigned char mask_idx_l[CBANDS + 2]; + FLOAT max[CBANDS] __attribute__ ((aligned (16))), avg[CBANDS]; + unsigned char mask_idx_l[CBANDS + 2] __attribute__ ((aligned (16))); int k, b; +#if __ALTIVEC__ + float tmp[4] __attribute__ ((aligned (16))); + const vector unsigned char v31 = (vector unsigned char)VINIT16ALL(31); + const vector unsigned int vmask1 = (vector unsigned int)VINIT4ALL(0xff); + const vector signed int vone = (vector signed int)VINIT4ALL(1); + const vector unsigned int vtab1 = (vector unsigned int)VINIT4(0x3f800000,0x3f4b5936,0x3f218698,0x3f218698); + const vector unsigned int vtab2 = (vector unsigned int)VINIT4(0x3f218698,0x3f218698,0x3f218698,0x3e809bfa); + const vector unsigned int vtab3 = (vector unsigned int)VINIT4(0x3df09e99,0,0,0); + const vector unsigned int vtable1 = (vector unsigned int)VINIT4(0x3fe39e89,0x3fec53e5,0x3ff55ea7,0x3ff9149b); + const vector unsigned int vtable2 = (vector unsigned int)VINIT4(0x3ffcd90e,0x3fea8f7b,0x3fd997da,0x3fbf84e2); + const vector unsigned int vtable3 = (vector unsigned int)VINIT4(0x3fa8917c,0x3f800000,0,0); + const vector float vzero = vec_xor(vzero,vzero); +#endif /********************************************************************* * Calculate the energy and the tonality of each partition. 
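 *
 * The AltiVec block below convolves four partitions (b..b+3) per iteration:
 * for each one it accumulates ecb as the sum of s3[k] * eb_l[kk] over the
 * window kk = s3ind[b][0] .. s3ind[b][1], weighting every masker through
 * the same lookup the scalar mask_add() path uses (vtab1..3 and vtable1..3
 * appear to be those table constants stored as hex bit patterns), and lanes
 * whose window is already exhausted are masked off with vec_sel().  The
 * original scalar loop is kept afterwards as the remainder for the final
 * npart % 4 partitions.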
*********************************************************************/ calc_energy(gdl, fftenergy, eb_l, max, avg); calc_mask_index_l(gfc, max, avg, mask_idx_l); +#if __ALTIVEC__ + const vector unsigned char vmaskidx1 = vec_ld(0,mask_idx_l); //needs to be aligned + const vector unsigned char vmaskidx2 = vec_ld(16,mask_idx_l); + const vector unsigned char vmaskidx3 = vec_ld(32,mask_idx_l); + const vector unsigned char vmaskidx4 = vec_ld(48,mask_idx_l); + tmp[0] = gfc->sv_qnt.masking_lower; + vector float vmasking_lower_coeff = vec_ld(0,tmp); + vmasking_lower_coeff = vec_splat(vmasking_lower_coeff,0); +#endif /********************************************************************* * convolve the partitioned energy and unpredictability * with the spreading function, s3_l[b][k] ********************************************************************/ k = 0; - for (b = 0; b < gdl->npart; b++) { +#if __ALTIVEC__ + for (b = 0; b < gdl->npart-3; b+=4) { + vector signed int v1,v2,v3,v4,v5,vkk,vkk2,vlast,vdd,vdd_n,vk,vk2; + vector float vf1,vf2,vf3,vf4,vecb,vx,veb,vavgmask,vmasking_lower; + vmasking_lower = vec_ld(0,gdl->masking_lower+b); + vmasking_lower = vec_madd(vmasking_lower,vmasking_lower_coeff,vzero); + int tmp2[4] __attribute__ ((aligned (16))); + int tmp3[4] __attribute__ ((aligned (16))); + + v1 = vec_ld(0,gdl->s3ind[b]); //needs to be aligned + v2 = vec_ld(0,gdl->s3ind[b+1]); + v3 = vec_ld(0,gdl->s3ind[b+2]); + v4 = vec_ld(0,gdl->s3ind[b+3]); + v1 = vec_mergeh(v1,v3); + v2 = vec_mergeh(v2,v4); + vkk = vec_mergeh(v1,v2); + vlast = vec_mergel(v1,v2); + + v1 = vec_sub(vlast,vkk); + v1 = vec_sel(v1,(vector signed int)vzero,vec_cmpgt((vector signed int)vzero,v1)); + vec_st(v1,0,tmp2); + + tmp3[0] = k; + tmp3[1] = k+tmp2[0]+1; + tmp3[2] = k+tmp2[0]+tmp2[1]+2; + tmp3[3] = k+tmp2[0]+tmp2[1]+tmp2[2]+3; + k = k+tmp2[0]+tmp2[1]+tmp2[2]+tmp2[3]+4; + vk = vec_ld(0,tmp3); + + v1 = (vector signed int)vec_perm(vmaskidx1,vmaskidx2,(vector unsigned char)vkk); + v2 = (vector signed int)vec_perm(vmaskidx3,vmaskidx4,(vector unsigned char)vkk); + vdd = vec_sel(v1,v2,vec_cmpgt(vkk,(vector signed int)VINIT4ALL(31))); + vdd = vec_and(vdd,(vector signed int)vmask1); + vdd_n = vone; + + tmp[0] = gdl->s3[tmp3[0]]; + tmp[1] = gdl->s3[tmp3[1]]; + tmp[2] = gdl->s3[tmp3[2]]; + tmp[3] = gdl->s3[tmp3[3]]; + vf1 = vec_ld(0,tmp); + + vec_st(vkk,0,tmp2); + tmp[0] = eb_l[tmp2[0]]; + tmp[1] = eb_l[tmp2[1]]; + tmp[2] = eb_l[tmp2[2]]; + tmp[3] = eb_l[tmp2[3]]; + veb = vec_ld(0,tmp); + + vecb = vec_madd(vf1,veb,vzero); + + v1 = vec_sl(vdd,vec_splat_u32(2)); + v2 = vec_add(v1,vec_splat_s32(1)); + v3 = vec_add(v1,vec_splat_s32(2)); + v4 = vec_add(v2,vec_splat_s32(2)); + v1 = vec_sl(v1,vec_splat_u32(-8)); + v2 = vec_sl(v2,vec_splat_u32(-16)); + v3 = vec_sl(v3,vec_splat_u32(8)); + v1 = vec_or(v1,v2); + v3 = vec_or(v3,v4); + v1 = vec_or(v1,v3); + + vf1 = (vector float)vec_perm(vtab1,vtab2,(vector unsigned char)v1); + vf2 = (vector float)vec_perm(vtab3,vtab2,(vector unsigned char)v1); + vf2 = vec_sel(vf1,vf2,(vector unsigned int)vec_cmpgt((vector unsigned char)v1,v31)); + vecb = vec_madd(vecb,vf2,vzero); + + vkk = vec_add(vkk,vone); + vk = vec_add(vk,vone); + while(vec_any_le(vkk,vlast)) { + vkk2 = vec_sel(vkk,vlast,vec_cmpgt(vkk,vlast)); + vk2 = vec_sel(vk,(vector signed int)vzero,vec_cmpgt(vkk,vlast)); + v1 = (vector signed int)vec_perm(vmaskidx1,vmaskidx2,(vector unsigned char)vkk2); + v2 = (vector signed int)vec_perm(vmaskidx3,vmaskidx4,(vector unsigned char)vkk2); + v1 = vec_sel(v1,v2,vec_cmpgt(vkk2,(vector signed int)VINIT4ALL(31))); 
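                /* Gathering mask_idx_l[kk]: vec_perm() can only index 32
                   bytes, so the byte is looked up in both vmaskidx1/2 and
                   vmaskidx3/4 and vec_sel(), keyed on kk > 31, keeps the
                   right half.  The vec_and() with 0xff just below isolates
                   that byte in each 32-bit lane, yielding mask_idx_l[kk]
                   for each of the four lanes (later averaged into dd). */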
+ v1 = vec_and(v1,(vector signed int)vmask1); + v2 = (vector signed int)vec_cmpgt(vkk,vlast); + v2 = vec_nor(v2,v2); + v5 = vec_and(v1,v2); + v2 = vec_and(vone,v2); + vdd = vec_add(vdd,v5); + vdd_n = vec_add(vdd_n,v2); + + vec_st(vk2,0,tmp2); + tmp[0] = gdl->s3[tmp2[0]]; + tmp[1] = gdl->s3[tmp2[1]]; + tmp[2] = gdl->s3[tmp2[2]]; + tmp[3] = gdl->s3[tmp2[3]]; + vf1 = vec_ld(0,tmp); + + vec_st(vkk,0,tmp2); + tmp[0] = eb_l[tmp2[0]]; + tmp[1] = eb_l[tmp2[1]]; + tmp[2] = eb_l[tmp2[2]]; + tmp[3] = eb_l[tmp2[3]]; + veb = vec_ld(0,tmp); + + vx = vec_madd(vf1,veb,vzero); + + v1 = vec_sl(v5,vec_splat_u32(2)); + v2 = vec_add(v1,vec_splat_s32(1)); + v3 = vec_add(v1,vec_splat_s32(2)); + v4 = vec_add(v2,vec_splat_s32(2)); + v1 = vec_sl(v1,vec_splat_u32(-8)); + v2 = vec_sl(v2,vec_splat_u32(-16)); + v3 = vec_sl(v3,vec_splat_u32(8)); + v1 = vec_or(v1,v2); + v3 = vec_or(v3,v4); + v1 = vec_or(v1,v3); + + vf1 = (vector float)vec_perm(vtab1,vtab2,(vector unsigned char)v1); + vf2 = (vector float)vec_perm(vtab3,vtab2,(vector unsigned char)v1); + vf2 = vec_sel(vf1,vf2,(vector unsigned int)vec_cmpgt((vector unsigned char)v1,v31)); + vx = vec_madd(vx,vf2,vzero); + { + vector float vratio,vout,vf5; + vf1 = vec_sel(vecb,vzero,vec_cmplt(vecb,vzero)); + vf2 = vec_sel(vx,vzero,vec_cmplt(vx,vzero)); + vf3 = vec_sel(vf1,vf2,vec_cmpgt(vf2,vf1)); + vf4 = vec_sel(vf2,vf1,vec_cmpgt(vf2,vf1)); + vf5 = vec_re(vf4); + vratio = vec_madd(vf3,vec_madd(vec_nmsub(vf4,vf5,(vector float)VINIT4ALL(1.0)),vf5,vf5),vzero); + + tmp2[0] = b; + tmp2[1] = b+1; + tmp2[2] = b+2; + tmp2[3] = b+3; + tmp3[0] = mask_add_delta(mask_idx_l[b]); + tmp3[1] = mask_add_delta(mask_idx_l[b+1]); + tmp3[2] = mask_add_delta(mask_idx_l[b+2]); + tmp3[3] = mask_add_delta(mask_idx_l[b+3]); + v1 = vec_ld(0,tmp2); + v1 = vec_sub(vkk2,v1); + v2 = vec_ld(0,tmp3); + v1 = vec_abs(v1); + v5 = (vector signed int)vec_cmpgt(v1,v2); + v3 = (vector signed int)vec_cmpge(vratio,vmamax1); + + vf4 = vec_add(vf1,vf2); + if(vec_any_eq(vec_or(v5,v3),(vector signed int)vzero)) { + vf3 = fast_log10_altivec_2(vratio); + v1 = vec_cts(vf3,4); + v1 = vec_sl(v1,vec_splat_u32(2)); + v2 = vec_add(v1,vec_splat_s32(1)); + v3 = vec_add(v1,vec_splat_s32(2)); + v4 = vec_add(v2,vec_splat_s32(2)); + v1 = vec_sl(v1,vec_splat_u32(-8)); + v2 = vec_sl(v2,vec_splat_u32(-16)); + v3 = vec_sl(v3,vec_splat_u32(8)); + v1 = vec_or(v1,v2); + v3 = vec_or(v3,v4); + v1 = vec_or(v1,v3); + vf3 = (vector float)vec_perm(vtable1,vtable2,(vector unsigned char)v1); + vf5 = (vector float)vec_perm(vtable3,vtable2,(vector unsigned char)v1); + vf5 = vec_sel(vf3,vf5,(vector unsigned int)vec_cmpgt((vector unsigned char)v1,v31)); + vf5 = vec_madd(vf4,vf5,vzero); + vf5 = vec_sel(vf5,vf4,vec_cmpge(vratio,vmamax1)); + } + else vf5 = vf4; + + vout = vec_sel(vf1,vf2,vec_cmpgt(vf2,vf1)); + vout = vec_sel(vout,vf4,vec_cmpgt(vmamax2,vratio)); + vout = vec_sel(vf5,vout,(vector unsigned int)v5); + vout = vec_sel(vout,vecb,(vector unsigned int)vec_cmple(vx,vzero)); + vout = vec_sel(vout,vx,(vector unsigned int)vec_cmple(vecb,vzero)); + vecb = vec_sel(vout,vecb,vec_cmpgt(vkk,vlast)); + } + vkk = vec_add(vkk,vone); + vk = vec_add(vk,vone); + } + vdd = vec_sl(vdd,(vector unsigned int)vone); + vdd_n = vec_sl(vdd_n,(vector unsigned int)vone); + vdd = vec_add(vdd,vone); + vf1 = vec_ctf(vdd,0); + vf2 = vec_ctf(vdd_n,0); + vf2 = vec_re(vf2); + vf1 = vec_madd(vf1,vf2,vzero); + vdd = vec_cts(vf1,0); + + v1 = vec_sl(vdd,vec_splat_u32(2)); + v2 = vec_add(v1,vec_splat_s32(1)); + v3 = vec_add(v1,vec_splat_s32(2)); + v4 = vec_add(v2,vec_splat_s32(2)); + 
v1 = vec_sl(v1,vec_splat_u32(-8)); + v2 = vec_sl(v2,vec_splat_u32(-16)); + v3 = vec_sl(v3,vec_splat_u32(8)); + v1 = vec_or(v1,v2); + v3 = vec_or(v3,v4); + v1 = vec_or(v1,v3); + + vf1 = (vector float)vec_perm(vtab1,vtab2,(vector unsigned char)v1); + vf2 = (vector float)vec_perm(vtab3,vtab2,(vector unsigned char)v1); + vf1 = vec_sel(vf1,vf2,(vector unsigned int)vec_cmpgt((vector unsigned char)v1,v31)); + vf2 = vec_ctf(vone,1); + vavgmask = vec_madd(vf1,vf2,vzero); + vecb = vec_madd(vecb,vavgmask,vzero); + + vf4 = vec_ld(0,eb_l+b); //needs to be aligned + if (psv->blocktype_old[chn & 0x01] == SHORT_TYPE) { + vf1 = vec_ld(0,psv->nb_l1[chn]+b); //needs to be aligned + vf3 = vec_madd(vf1,(vector float)VINIT4ALL(rpelev),vzero); + vf2 = vec_madd(vf4,(vector float)VINIT4ALL(NS_PREECHO_ATT2),vzero); + vf3 = vec_sel(vf2,vf3,vec_cmpgt(vf3,vzero)); + vf3 = vec_min(vecb,vf3); + //vec_st(vf3,0,thr+b); //needs to be aligned + } + else { + vf1 = vec_ld(0,psv->nb_l1[chn]+b); //needs to be aligned + vf2 = vec_ld(0,psv->nb_l2[chn]+b); //needs to be aligned + vf3 = vec_madd(vf1,(vector float)VINIT4ALL(rpelev),vzero); + vf2 = vec_madd(vf2,(vector float)VINIT4ALL(rpelev2),vzero); + vf3 = vec_sel(vzero,vf3,vec_cmpgt(vf3,vzero)); + vf2 = vec_sel(vzero,vf2,vec_cmpgt(vf2,vzero)); + if (psv->blocktype_old[chn & 0x01] == NORM_TYPE) { + vf3 = vec_min(vf3,vf2); + } + vf3 = vec_min(vecb,vf3); + //vec_st(vf3,0,thr+b); //needs to be aligned + } + vec_st(vf1,0,psv->nb_l2[chn]+b); //needs to be aligned + vec_st(vecb,0,psv->nb_l1[chn]+b); //needs to be aligned + { + vx = vec_ld(0,max+b); //needs to be aligned + vf1 = vec_ld(0,gdl->minval+b); + vx = vec_madd(vx,vf1,vzero); + vx = vec_madd(vx,vavgmask,vzero); + vf3 = vec_sel(vf3,vx,vec_cmpgt(vf3,vx)); + //vec_st(vf3,0,thr+b); //needs to be aligned + } + v1 = (vector signed int)vec_cmpgt(vmasking_lower,(vector float)VINIT4ALL(1.0f)); + vf1 = vec_madd(vf3,vmasking_lower,vzero); + vf3 = vec_sel(vf3,vf1,(vector unsigned int)v1); + vf3 = vec_sel(vf3,vf4,vec_cmpgt(vf3,vf4)); + vf1 = vec_madd(vf3,vmasking_lower,vzero); + vf3 = vec_sel(vf1,vf3,(vector unsigned int)v1); + vec_st(vf3,0,thr+b); //needs to be aligned + } +#else + b=0; +#endif + for (; b < gdl->npart; b++) { FLOAT x, ecb, avg_mask, t; FLOAT const masking_lower = gdl->masking_lower[b] * gfc->sv_qnt.masking_lower; + //fprintf(stderr,"%f\n",masking_lower); /* convolve the partitioned energy with the spreading function */ int kk = gdl->s3ind[b][0]; int const last = gdl->s3ind[b][1]; @@ -1423,11 +2078,11 @@ L3psycho_anal_vbr(lame_internal_flags * gfc, /* fft and energy calculation */ FLOAT(*wsamp_l)[BLKSIZE]; FLOAT(*wsamp_s)[3][BLKSIZE_s]; - FLOAT fftenergy[HBLKSIZE]; - FLOAT fftenergy_s[3][HBLKSIZE_s]; - FLOAT wsamp_L[2][BLKSIZE]; - FLOAT wsamp_S[2][3][BLKSIZE_s]; - FLOAT eb[4][CBANDS], thr[4][CBANDS]; + FLOAT fftenergy[HBLKSIZE] __attribute__ ((aligned (16))); + FLOAT fftenergy_s[3][HBLKSIZE_s+3] __attribute__ ((aligned (16))); + FLOAT wsamp_L[2][BLKSIZE] __attribute__ ((aligned (16))); + FLOAT wsamp_S[2][3][BLKSIZE_s] __attribute__ ((aligned (16))); + FLOAT eb[4][CBANDS] __attribute__ ((aligned (16))), thr[4][CBANDS] __attribute__ ((aligned (16))); FLOAT sub_short_factor[4][3]; FLOAT thmm; @@ -1436,7 +2091,7 @@ L3psycho_anal_vbr(lame_internal_flags * gfc, (cfg->msfix > 0.f) ? 
(cfg->ATH_offset_factor * gfc->ATH->adjust_factor) : 1.f; const FLOAT(*const_eb)[CBANDS] = (const FLOAT(*)[CBANDS]) eb; - const FLOAT(*const_fftenergy_s)[HBLKSIZE_s] = (const FLOAT(*)[HBLKSIZE_s]) fftenergy_s; + const FLOAT(*const_fftenergy_s)[HBLKSIZE_s+3] = (const FLOAT(*)[HBLKSIZE_s+3]) fftenergy_s; /* block type */ int ns_attacks[4][4] = { {0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0} }; @@ -1824,7 +2479,7 @@ compute_bark_values(PsyConst_CB2SB_t const *gd, FLOAT sfreq, int fft_size, } static int -init_s3_values(FLOAT ** p, int (*s3ind)[2], int npart, +init_s3_values(FLOAT ** p, int (*s3ind)[4], int npart, FLOAT const *bval, FLOAT const *bval_width, FLOAT const *norm) { FLOAT s3[CBANDS][CBANDS]; diff --git libmp3lame/quantize.c libmp3lame/quantize.c index 9ba9c16..6064b29 100644 --- libmp3lame/quantize.c +++ libmp3lame/quantize.c @@ -28,6 +28,10 @@ # include #endif +#if __ALTIVEC__ +#include +#endif + #include "lame.h" #include "machine.h" #include "encoder.h" @@ -42,7 +46,32 @@ #endif +#if _ARCH_PPC64 +static inline double ppc_sqrt(double x) { + double y; + asm("fsqrt %0,%1" : "=f" (y) : "f" (x)); + return y; +} +#else +static inline double __frsqrte(double number) +{ + double y; + asm("frsqrte %0,%1" : "=f" (y) : "f" (number)); + return y; +} +static inline double ppc_sqrt(double x) { + double y; + const double halfx = 0.5 * x; + y = __frsqrte(x); + y *= 1.5 - halfx * y * y; + y *= 1.5 - halfx * y * y; + y *= 1.5 - halfx * y * y; + //y *= 1.5 - halfx * y * y; + y *= x; + return (x == 0.0) ? 0 : y; +} +#endif /* convert from L/R <-> Mid/Side */ static void @@ -72,9 +101,162 @@ ms_convert(III_side_info_t * l3_side, int gr) static void init_xrpow_core_c(gr_info * const cod_info, FLOAT xrpow[576], int upper, FLOAT * sum) { +#if __ALTIVEC__ + vector float v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15,v16,v17,v18,v19,v20; + vector float vsum,vsum2,vsum3,vsum4,vmax,vmax2,vmax3,vmax4,vzero; + vector unsigned char vc1,vc2,vc3,vc4,vc5,vperm; + vector float vconst1 = (vector float)VINIT4ALL(0.25); + vector float vconst2 = (vector float)VINIT4ALL(1.25); +#endif int i; FLOAT tmp; *sum = 0; +#if __ALTIVEC__ + vc1 = vec_splat_u8(1); + vc2 = vec_splat_u8(5); + vc3 = vec_sl(vc1,vc2); + vc4 = vec_sl(vc3,vc1); + vc5 = vec_or(vc3,vc4); + vsum = vec_xor(vsum,vsum); + vzero = vec_xor(vzero,vzero); + vmax = vec_xor(vmax,vmax); + vsum2 = vec_xor(vsum2,vsum2); + vmax2 = vec_xor(vmax2,vmax2); + vsum3 = vec_xor(vsum3,vsum3); + vmax3 = vec_xor(vmax3,vmax3); + vsum4 = vec_xor(vsum4,vsum4); + vmax4 = vec_xor(vmax4,vmax4); + + v0 = vec_ld(0,(cod_info->xr)); + vperm = vec_lvsl(0,(cod_info->xr)); + for (i = 0; i <= upper-15; i+=16) { + v1 = vec_ld(16,(cod_info->xr)+i); + v2 = vec_ld(32,(cod_info->xr)+i); + v3 = vec_ld(48,(cod_info->xr)+i); + v4 = vec_ld(64,(cod_info->xr)+i); + v5 = vec_perm(v0,v1,vperm); + v6 = vec_perm(v1,v2,vperm); + v7 = vec_perm(v2,v3,vperm); + v8 = vec_perm(v3,v4,vperm); + v0 = v4; + v9 = vec_abs(v5); + v10 = vec_abs(v6); + v11 = vec_abs(v7); + v12 = vec_abs(v8); + vsum = vec_add(vsum,v9); + vsum2 = vec_add(vsum2,v10); + vsum3 = vec_add(vsum3,v11); + vsum4 = vec_add(vsum4,v12); + v1 = vec_re(vec_rsqrte(vec_rsqrte(v9))); + v2 = vec_re(vec_rsqrte(vec_rsqrte(v10))); + v3 = vec_re(vec_rsqrte(vec_rsqrte(v11))); + v4 = vec_re(vec_rsqrte(vec_rsqrte(v12))); + v5 = (vector float)vec_cmpeq(vzero,v9); + v6 = (vector float)vec_cmpeq(vzero,v10); + v7 = (vector float)vec_cmpeq(vzero,v11); + v8 = (vector float)vec_cmpeq(vzero,v12); + v13 = vec_madd(v1,v1,vzero); + v14 = vec_madd(v2,v2,vzero); 
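/*
 * Illustrative aside (plain C, not part of the patch): the Newton-Raphson
 * scheme ppc_sqrt() above layers on the frsqrte estimate.  Each step
 * y <- y*(1.5 - 0.5*x*y*y) roughly doubles the accuracy of y ~ 1/sqrt(x),
 * and x*y then yields sqrt(x).  The fixed seed and test value below are
 * stand-ins for the hardware estimate, purely for illustration.
 */
#include <stdio.h>

static double sqrt_via_rsqrt(double x, double seed)
{
    double y = seed;                   /* would be frsqrte(x) on PowerPC */
    const double halfx = 0.5 * x;
    y *= 1.5 - halfx * y * y;          /* three refinements, as in ppc_sqrt() */
    y *= 1.5 - halfx * y * y;
    y *= 1.5 - halfx * y * y;
    return (x == 0.0) ? 0.0 : x * y;   /* same zero guard as the patch uses */
}

int main(void)
{
    /* seed 0.7 is a deliberately crude guess for 1/sqrt(2) ~ 0.7071 */
    printf("sqrt(2.0) ~= %.12f\n", sqrt_via_rsqrt(2.0, 0.7));
    return 0;
}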
+ v15 = vec_madd(v3,v3,vzero); + v16 = vec_madd(v4,v4,vzero); + v13 = vec_madd(v13,v13,vzero); + v14 = vec_madd(v14,v14,vzero); + v15 = vec_madd(v15,v15,vzero); + v16 = vec_madd(v16,v16,vzero); + v17 = vec_madd(v9,vconst1,vzero); + v18 = vec_madd(v10,vconst1,vzero); + v19 = vec_madd(v11,vconst1,vzero); + v20 = vec_madd(v12,vconst1,vzero); + v13 = vec_nmsub(v13,v17,vconst2); + v14 = vec_nmsub(v14,v18,vconst2); + v15 = vec_nmsub(v15,v19,vconst2); + v16 = vec_nmsub(v16,v20,vconst2); + v1 = vec_madd(v13,v1,vzero); + v2 = vec_madd(v14,v2,vzero); + v3 = vec_madd(v15,v3,vzero); + v4 = vec_madd(v16,v4,vzero); + v1 = vec_sel(v1,vzero,(vector unsigned int)v5); + v2 = vec_sel(v2,vzero,(vector unsigned int)v6); + v3 = vec_sel(v3,vzero,(vector unsigned int)v7); + v4 = vec_sel(v4,vzero,(vector unsigned int)v8); + v17 = vec_madd(v1,v9,vzero); + v18 = vec_madd(v2,v10,vzero); + v19 = vec_madd(v3,v11,vzero); + v20 = vec_madd(v4,v12,vzero); + vec_st(v17,0,xrpow+i); + vec_st(v18,16,xrpow+i); + vec_st(v19,32,xrpow+i); + vec_st(v20,48,xrpow+i); + vmax = vec_max(v17,vmax); + vmax2 = vec_max(v18,vmax2); + vmax3 = vec_max(v19,vmax3); + vmax4 = vec_max(v20,vmax4); + } + vmax = vec_max(vmax,vmax2); + vmax3 = vec_max(vmax3,vmax4); + vmax = vec_max(vmax,vmax3); + vsum = vec_add(vsum,vsum2); + vsum3 = vec_add(vsum3,vsum4); + vsum = vec_add(vsum,vsum3); + v1 = vec_slo(vmax,vc3); + v2 = vec_slo(vsum,vc3); + v3 = vec_max(v1,vmax); + v4 = vec_add(v2,vsum); + v5 = vec_slo(v3,vc4); + v6 = vec_slo(v4,vc4); + vmax = vec_max(v3,v5); + vsum = vec_add(v4,v6); + vmax = vec_perm(vmax,vmax,vec_lvsr(0,&(cod_info->xrpow_max))); + vsum = vec_perm(vsum,vsum,vec_lvsr(0,sum)); + vec_ste(vmax,0,&(cod_info->xrpow_max)); + vec_ste(vsum,0,sum); + + for (; i <= upper; i++) { + tmp = fabs(cod_info->xr[i]); + *sum += tmp; + xrpow[i] = sqrt(tmp * sqrt(tmp)); + + if (xrpow[i] > cod_info->xrpow_max) + cod_info->xrpow_max = xrpow[i]; + } +#else +#if(1) // will work on G3 + FLOAT tmp2,tmp3,tmp4; + + for (i = 0; i <= upper-3; i+=4) { + tmp = fabs (cod_info->xr[i]); + tmp2 = fabs (cod_info->xr[i+1]); + tmp3 = fabs (cod_info->xr[i+2]); + tmp4 = fabs (cod_info->xr[i+3]); + *sum += tmp; + *sum += tmp2; + *sum += tmp3; + *sum += tmp4; + + xrpow[i] = ppc_sqrt (tmp * ppc_sqrt(tmp)); + xrpow[i+1] = ppc_sqrt (tmp2 * ppc_sqrt(tmp2)); + xrpow[i+2] = ppc_sqrt (tmp3 * ppc_sqrt(tmp3)); + xrpow[i+3] = ppc_sqrt (tmp4 * ppc_sqrt(tmp4)); + + if (xrpow[i] > cod_info->xrpow_max) + cod_info->xrpow_max = xrpow[i]; + if (xrpow[i+1] > cod_info->xrpow_max) + cod_info->xrpow_max = xrpow[i+1]; + if (xrpow[i+2] > cod_info->xrpow_max) + cod_info->xrpow_max = xrpow[i+2]; + if (xrpow[i+3] > cod_info->xrpow_max) + cod_info->xrpow_max = xrpow[i+3]; + } + + for (; i <= upper; i++) { + tmp = fabs(cod_info->xr[i]); + *sum += tmp; + xrpow[i] = ppc_sqrt(tmp * ppc_sqrt(tmp)); + + if (xrpow[i] > cod_info->xrpow_max) + cod_info->xrpow_max = xrpow[i]; + } +#else for (i = 0; i <= upper; ++i) { tmp = fabs(cod_info->xr[i]); *sum += tmp; @@ -83,6 +265,8 @@ init_xrpow_core_c(gr_info * const cod_info, FLOAT xrpow[576], int upper, FLOAT * if (xrpow[i] > cod_info->xrpow_max) cod_info->xrpow_max = xrpow[i]; } +#endif +#endif } @@ -1495,7 +1679,7 @@ VBR_old_iteration_loop(lame_internal_flags * gfc, const FLOAT pe[2][2], EncResult_t *const eov = &gfc->ov_enc; FLOAT l3_xmin[2][2][SFBMAX]; - FLOAT xrpow[576]; + FLOAT xrpow[576] __attribute__ ((aligned (16))); int bands[2][2]; int frameBits[15]; int used_bits; @@ -1650,7 +1834,7 @@ VBR_new_iteration_loop(lame_internal_flags * gfc, const FLOAT pe[2][2], 
EncResult_t *const eov = &gfc->ov_enc; FLOAT l3_xmin[2][2][SFBMAX]; - FLOAT xrpow[2][2][576]; + FLOAT xrpow[2][2][576] __attribute__ ((aligned (16))); int frameBits[15]; int used_bits; int max_bits[2][2]; @@ -1904,7 +2088,7 @@ ABR_iteration_loop(lame_internal_flags * gfc, const FLOAT pe[2][2], SessionConfig_t const *const cfg = &gfc->cfg; EncResult_t *const eov = &gfc->ov_enc; FLOAT l3_xmin[SFBMAX]; - FLOAT xrpow[576]; + FLOAT xrpow[576] __attribute__ ((aligned (16))); int targ_bits[2][2]; int mean_bits, max_frame_bits; int ch, gr, ath_over; @@ -1991,7 +2175,7 @@ CBR_iteration_loop(lame_internal_flags * gfc, const FLOAT pe[2][2], { SessionConfig_t const *const cfg = &gfc->cfg; FLOAT l3_xmin[SFBMAX]; - FLOAT xrpow[576]; + FLOAT xrpow[576] __attribute__ ((aligned (16))); int targ_bits[2]; int mean_bits, max_bits; int gr, ch; diff --git libmp3lame/quantize_pvt.c libmp3lame/quantize_pvt.c index d8d6447..0a346f9 100644 --- libmp3lame/quantize_pvt.c +++ libmp3lame/quantize_pvt.c @@ -27,6 +27,11 @@ # include #endif +#if __ALTIVEC__ +#undef TAKEHIRO_IEEE754_HACK +#include +#endif + #include "lame.h" #include "machine.h" @@ -751,6 +756,39 @@ calc_xmin(lame_internal_flags const *gfc, static FLOAT calc_noise_core_c(const gr_info * const cod_info, int *startline, int l, FLOAT step) { +#if __ALTIVEC__ + vector float v1,v2,v3,v4,v5,v6,v7,v8,v9,va,vb,vstep,vzero,vnoise1,vnoise2,vix01; + vector unsigned char vperm1,vperm2,vperm5,vperm6; + vector signed int vx1,vx2,vx3,vx4,vx5,vx6,vx7,vshamt,vone; +#if _ARCH_PPC64 + vector unsigned int vmask1,vmask2,vmask3; + vector float v10,v11,v12,v13,v14,v15,v16,v17; +#else + vector unsigned char vc1,vc2,vc3,vc4,vc5,vc6,vperm3,vperm4,vmask; +#endif + float temp[4] __attribute__ ((aligned (16))); + + temp[0] = step; + vstep = vec_ld(0,temp); + vzero = vec_xor(vzero,vzero); + vperm6 = (vector unsigned char)VINIT16(0,0,3,19,0,0,7,23,0,0,11,27,0,0,15,31); + vperm5 = vec_sld(vperm6,vperm6,2); +#if _ARCH_PPC64 + vmask1 = vec_splat_u32(-1); + vmask2 = vec_sld((vector unsigned int)vzero,vmask1,8); + vmask3 = vec_sld((vector unsigned int)vzero,vmask1,4); + vmask1 = vec_sld((vector unsigned int)vzero,vmask1,12); +#else + vperm3 = (vector unsigned char)VINIT16(0,0,0,0,0,0,0,0,0,1,2,3,16,17,18,19); + vperm4 = vec_sld(vperm3,(vector unsigned char)vzero,8); + vmask = (vector unsigned char)VINIT16ALL(16); +#endif + vstep = vec_splat(vstep,0); + vnoise1 = vec_xor(vnoise1,vnoise1); + vnoise2 = vec_xor(vnoise2,vnoise2); + vone = vec_splat_s32(1); + vshamt = vec_splat_s32(2); +#endif FLOAT noise = 0; int j = *startline; const int *const ix = cod_info->l3_enc; @@ -767,9 +805,55 @@ calc_noise_core_c(const gr_info * const cod_info, int *startline, int l, FLOAT s } } else if (j > cod_info->big_values) { - FLOAT ix01[2]; + FLOAT ix01[4] __attribute__ ((aligned (16))); ix01[0] = 0; ix01[1] = step; +#if __ALTIVEC__ + vix01 = vec_ld(0,ix01); + v1 = vec_ld(0,cod_info->xr+j); + vperm1 = vec_lvsl(0,cod_info->xr+j); + vx1 = vec_ld(0,ix+j); + vperm2 = vec_lvsl(0,ix+j); + for(;l>1;l-=2) { + v2 = vec_ld(16,cod_info->xr+j); + vx2 = vec_ld(16,ix+j); + v3 = vec_perm(v1,v2,vperm1); + vx3 = vec_perm(vx1,vx2,vperm2); + va = vec_abs(v3); + v1 = v2; + vx1 = vx2; + + vx4 = vec_sl(vx3,(vector unsigned int)vshamt); + vx5 = vec_add(vx4,vone); + vx6 = vec_add(vx4,vshamt); + vx7 = vec_add(vx5,vshamt); + vx2 = vec_perm(vx4,vx5,vperm5); + vx3 = vec_perm(vx6,vx7,vperm6); + vx4 = vec_or(vx2,vx3); + + v2 = vec_perm(vix01,vix01,(vector unsigned char)vx4); + va = vec_sub(va,v2); + + vnoise1 = vec_madd(va,va,vnoise1); + + j 
+= 4; + } + v1 = vec_sld(vnoise1,vnoise1,8); + v2 = vec_add(vnoise1,v1); + v3 = vec_sld(v2,v2,4); + v4 = vec_add(v2,v3); + v5 = vec_perm(v4,v4,vec_lvsr(0,&noise)); + vec_ste(v5,0,&noise); + if(l) { + FLOAT temp; + temp = fabs(cod_info->xr[j]) - ix01[ix[j]]; + j++; + noise += temp * temp; + temp = fabs(cod_info->xr[j]) - ix01[ix[j]]; + j++; + noise += temp * temp; + } +#else while (l--) { FLOAT temp; temp = fabs(cod_info->xr[j]) - ix01[ix[j]]; @@ -779,8 +863,138 @@ calc_noise_core_c(const gr_info * const cod_info, int *startline, int l, FLOAT s j++; noise += temp * temp; } +#endif } else { +#if __ALTIVEC__ + vperm1 = vec_lvsl(0,cod_info->xr+j); + v1 = vec_ld(0,cod_info->xr+j); + for(;l>3;l-=4) { + v2 = vec_ld(16,cod_info->xr+j); + v3 = vec_ld(32,cod_info->xr+j); + v4 = vec_perm(v1,v2,vperm1); + v5 = vec_perm(v2,v3,vperm1); + va = vec_abs(v4); + vb = vec_abs(v5); + v1 = v3; + +#if _ARCH_PPC64 + v2 = vec_lde(0,pow43+ix[j]); + v6 = vec_lde(0,pow43+ix[j+1]); + v10 = vec_lde(0,pow43+ix[j+2]); + v14 = vec_lde(0,pow43+ix[j+3]); + v4 = vec_perm(v2,v2,vec_lvsl(0,pow43+ix[j])); + v8 = vec_perm(v6,v6,vec_lvsl(-4,pow43+ix[j+1])); + v12 = vec_perm(v10,v10,vec_lvsl(-8,pow43+ix[j+2])); + v16 = vec_perm(v14,v14,vec_lvsl(-12,pow43+ix[j+3])); + v4 = vec_sel(v4,v8,vmask1); + v4 = vec_sel(v4,v12,vmask2); + v4 = vec_sel(v4,v16,vmask3); + va = vec_nmsub(v4,vstep,va); +#else + vc1 = vec_lvsl(0,pow43+ix[j]); + vc2 = vec_lvsl(0,pow43+ix[j+1]); + vc3 = vec_lvsl(0,pow43+ix[j+2]); + vc4 = vec_lvsl(0,pow43+ix[j+3]); + vc2 = vec_or(vc2,vmask); + vc4 = vec_or(vc4,vmask); + v2 = vec_lde(0,pow43+ix[j]); + v3 = vec_lde(0,pow43+ix[j+1]); + v4 = vec_lde(0,pow43+ix[j+2]); + v5 = vec_lde(0,pow43+ix[j+3]); + vc5 = vec_perm(vc1,vc2,vperm3); + vc6 = vec_perm(vc3,vc4,vperm4); + v6 = vec_perm(v2,v3,vc5); + v7 = vec_perm(v4,v5,vc6); + v8 = vec_sld(v6,v7,8); + va = vec_nmsub(v8,vstep,va); +#endif + j+=4; + +#if _ARCH_PPC64 + v3 = vec_lde(0,pow43+ix[j]); + v7 = vec_lde(0,pow43+ix[j+1]); + v11 = vec_lde(0,pow43+ix[j+2]); + v15 = vec_lde(0,pow43+ix[j+3]); + v5 = vec_perm(v3,v3,vec_lvsl(0,pow43+ix[j])); + v9 = vec_perm(v7,v7,vec_lvsl(-4,pow43+ix[j+1])); + v13 = vec_perm(v11,v11,vec_lvsl(-8,pow43+ix[j+2])); + v17 = vec_perm(v15,v15,vec_lvsl(-12,pow43+ix[j+3])); + v5 = vec_sel(v5,v9,vmask1); + v5 = vec_sel(v5,v13,vmask2); + v5 = vec_sel(v5,v17,vmask3); + vb = vec_nmsub(v5,vstep,vb); +#else + vc1 = vec_lvsl(0,pow43+ix[j]); + vc2 = vec_lvsl(0,pow43+ix[j+1]); + vc3 = vec_lvsl(0,pow43+ix[j+2]); + vc4 = vec_lvsl(0,pow43+ix[j+3]); + vc2 = vec_or(vc2,vmask); + vc4 = vec_or(vc4,vmask); + v2 = vec_lde(0,pow43+ix[j]); + v3 = vec_lde(0,pow43+ix[j+1]); + v4 = vec_lde(0,pow43+ix[j+2]); + v5 = vec_lde(0,pow43+ix[j+3]); + vc5 = vec_perm(vc1,vc2,vperm3); + vc6 = vec_perm(vc3,vc4,vperm4); + v6 = vec_perm(v2,v3,vc5); + v7 = vec_perm(v4,v5,vc6); + v8 = vec_sld(v6,v7,8); + vb = vec_nmsub(v8,vstep,vb); +#endif + + vnoise1 = vec_madd(va,va,vnoise1); + vnoise2 = vec_madd(vb,vb,vnoise2); + + j+=4; + } + vnoise1 = vec_add(vnoise1,vnoise2); + + for(;l>1;l-=2) { + v2 = vec_ld(16,cod_info->xr+j); + v4 = vec_perm(v1,v2,vperm1); + va = vec_abs(v4); + v1 = v2; + + v2 = vec_lde(0,pow43+ix[j]); + v3 = vec_lde(0,pow43+ix[j+1]); + v4 = vec_lde(0,pow43+ix[j+2]); + v5 = vec_lde(0,pow43+ix[j+3]); + v6 = vec_perm(v2,v2,vec_lvsl(0,pow43+ix[j])); + v7 = vec_perm(v3,v3,vec_lvsl(-4,pow43+ix[j+1])); + v8 = vec_perm(v4,v4,vec_lvsl(-8,pow43+ix[j+2])); + v9 = vec_perm(v5,v5,vec_lvsl(-12,pow43+ix[j+3])); +#if _ARCH_PPC64 + v6 = vec_sel(v6,v7,vmask1); + v6 = vec_sel(v6,v8,vmask2); + v6 = 
vec_sel(v6,v9,vmask3); +#else + v6 = vec_or(v6,v7); + v6 = vec_or(v6,v8); + v6 = vec_or(v6,v9); +#endif + va = vec_nmsub(v6,vstep,va); + + vnoise1 = vec_madd(va,va,vnoise1); + + j += 4; + } + v1 = vec_sld(vnoise1,vnoise1,8); + v2 = vec_add(vnoise1,v1); + v3 = vec_sld(v2,v2,4); + v4 = vec_add(v2,v3); + v5 = vec_perm(v4,v4,vec_lvsr(0,&noise)); + vec_ste(v5,0,&noise); + if(l) { + FLOAT temp; + temp = fabs(cod_info->xr[j]) - pow43[ix[j]] * step; + j++; + noise += temp * temp; + temp = fabs(cod_info->xr[j]) - pow43[ix[j]] * step; + j++; + noise += temp * temp; + } +#else while (l--) { FLOAT temp; temp = fabs(cod_info->xr[j]) - pow43[ix[j]] * step; @@ -790,6 +1004,7 @@ calc_noise_core_c(const gr_info * const cod_info, int *startline, int l, FLOAT s j++; noise += temp * temp; } +#endif } *startline = j; diff --git libmp3lame/tables.c libmp3lame/tables.c index a023099..124a87c 100644 --- libmp3lame/tables.c +++ libmp3lame/tables.c @@ -406,7 +406,7 @@ const uint8_t t33l[] = { }; -const struct huffcodetab ht[HTN] = { +const struct huffcodetab ht[HTN] __attribute__ ((aligned (16))) = { /* xlen, linmax, table, hlen */ {0, 0, NULL, NULL}, {2, 0, t1HB, t1l}, diff --git libmp3lame/takehiro.c libmp3lame/takehiro.c index 67aba1b..368ff2e 100644 --- libmp3lame/takehiro.c +++ libmp3lame/takehiro.c @@ -26,6 +26,10 @@ # include #endif +#if __ALTIVEC__ +#undef TAKEHIRO_IEEE754_HACK +#include +#endif #include "lame.h" #include "machine.h" @@ -222,6 +226,150 @@ quantize_lines_xrpow(unsigned int l, FLOAT istep, const FLOAT * xp, int *pi) static void quantize_lines_xrpow(unsigned int l, FLOAT istep, const FLOAT * xr, int *ix) { +#if __ALTIVEC__ + vector float v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,va,vb,vistep,vzero; + vector signed int vx1,vx2,vx3,vx4,vprev; + vector unsigned char vperm1,vperm2; + const vector float const1 = (vector float)VINIT4(0.4053964553387788,3.404263724373839,5.465086767819913,1.0); + const vector float const2 = (vector float)VINIT4(7.719205369637751,10.93017829043677,0,0); +#ifndef _ARCH_PPC64 + unsigned int temp[4] __attribute__ ((aligned (16))); +#endif + float temp2[4] __attribute__ ((aligned (16))); + temp2[0] = istep; + vistep = vec_ld(0,temp2); + vzero = vec_xor(vzero,vzero); + vistep = vec_splat(vistep,0); + + l = l >> 1; + + vperm1 = vec_lvsl(0,xr); + vperm2 = vec_lvsr(0,ix); + v1 = vec_ld(0,xr); + vx1 = vec_ld(-16,ix); + vx2 = vec_ld(0,ix); + vprev = vec_perm(vx1,vx2,vec_lvsl(0,ix)); + for(;l>3;l-=4) { + v2 = vec_ld(16,xr); + v3 = vec_ld(32,xr); + v4 = vec_perm(v1,v2,vperm1); + v5 = vec_perm(v2,v3,vperm1); + va = vec_madd(v4,vistep,vzero); + vb = vec_madd(v5,vistep,vzero); + v1 = v3; + + v2 = vec_floor(va); + v3 = vec_floor(vb); + v4 = vec_splat(const1,2); + v5 = vec_splat(const1,1); + v6 = vec_splat(const2,1); + v7 = vec_splat(const2,0); + v8 = vec_madd(v2,v4,v5); + v9 = vec_madd(v3,v4,v5); + v10 = vec_madd(v2,v6,v7); + v11 = vec_madd(v3,v6,v7); + v4 = vec_splat(const1,0); + v5 = vec_splat(const1,3); + v8 = vec_madd(v8,v2,v4); + v9 = vec_madd(v9,v3,v4); + v10 = vec_madd(v10,v2,v5); + v11 = vec_madd(v11,v3,v5); + v6 = vec_re(v10); + v7 = vec_re(v11); + v10 = vec_nmsub(v10,v6,v5); + v11 = vec_nmsub(v11,v7,v5); + v10 = vec_madd(v10,v6,v6); + v11 = vec_madd(v11,v7,v7); + va = vec_madd(v8,v10,va); + vb = vec_madd(v9,v11,vb); + + vx1 = vec_cts(va,0); + vx2 = vec_cts(vb,0); + + vx3 = vec_perm(vprev,vx1,vperm2); + vx4 = vec_perm(vx1,vx2,vperm2); + vec_st(vx3,0,ix); + vec_st(vx4,16,ix); + vprev = vx2; + xr += 8; + ix += 8; + } + vx1 = vec_ld(0,ix); + vx2 = vec_ld(16,ix); + vx3 = 
vec_perm(vx1,vx2,vec_lvsl(0,ix)); + vx4 = vec_perm(vprev,vx3,vperm2); + vec_st(vx4,0,ix); + +#if _ARCH_PPC64 + for(;l>1;l-=2) { + FLOAT x0, x1, x2, x3; + int rx0, rx1, rx2, rx3; + + x0 = *xr++ * istep; + x1 = *xr++ * istep; + XRPOW_FTOI(x0, rx0); + x2 = *xr++ * istep; + XRPOW_FTOI(x1, rx1); + x3 = *xr++ * istep; + XRPOW_FTOI(x2, rx2); + x0 += QUANTFAC(rx0); + XRPOW_FTOI(x3, rx3); + x1 += QUANTFAC(rx1); + XRPOW_FTOI(x0, *ix++); + x2 += QUANTFAC(rx2); + XRPOW_FTOI(x1, *ix++); + x3 += QUANTFAC(rx3); + XRPOW_FTOI(x2, *ix++); + XRPOW_FTOI(x3, *ix++); + } +#else + for(;l>1;l-=2) { + v2 = vec_ld(16,xr); + v4 = vec_perm(v1,v2,vperm1); + v1 = v2; + va = vec_madd(v4,vistep,vzero); + vx1 = vec_cts(va,0); + vec_st((vector unsigned int)vx1,0,temp); + v2 = vec_lde(0,adj43+temp[0]); + v3 = vec_lde(0,adj43+temp[1]); + v4 = vec_lde(0,adj43+temp[2]); + v5 = vec_lde(0,adj43+temp[3]); + v6 = vec_perm(v2,v2,vec_lvsl(0,adj43+temp[0])); + v7 = vec_perm(v3,v3,vec_lvsl(-4,adj43+temp[1])); + v8 = vec_perm(v4,v4,vec_lvsl(-8,adj43+temp[2])); + v9 = vec_perm(v5,v5,vec_lvsl(-12,adj43+temp[3])); + v6 = vec_or(v6,v7); + v6 = vec_or(v6,v8); + v6 = vec_or(v6,v9); + va = vec_add(va,v6); + vx1 = vec_cts(va,0); + vx3 = vec_perm(vprev,vx1,vperm2); + vec_st(vx3,0,ix); + vprev = vx1; + xr += 4; + ix += 4; + } + vx1 = vec_ld(0,ix); + vx2 = vec_ld(16,ix); + vx3 = vec_perm(vx1,vx2,vec_lvsl(0,ix)); + vx4 = vec_perm(vprev,vx3,vperm2); + vec_st(vx4,0,ix); +#endif + + if (l) { + FLOAT x0, x1; + int rx0, rx1; + + x0 = *xr++ * istep; + x1 = *xr++ * istep; + XRPOW_FTOI(x0, rx0); + XRPOW_FTOI(x1, rx1); + x0 += QUANTFAC(rx0); + x1 += QUANTFAC(rx1); + XRPOW_FTOI(x0, *ix++); + XRPOW_FTOI(x1, *ix++); + } +#else unsigned int remaining; assert(l > 0); @@ -263,7 +411,7 @@ quantize_lines_xrpow(unsigned int l, FLOAT istep, const FLOAT * xr, int *ix) XRPOW_FTOI(x0, *ix++); XRPOW_FTOI(x1, *ix++); } - +#endif } @@ -420,6 +568,60 @@ quantize_xrpow(const FLOAT * xp, int *pi, FLOAT istep, gr_info const *const cod_ /* ix_max */ /*************************************************************************/ +#if __ALTIVEC__ +int +ix_max_vec(const int *ix, const int *end) +{ + int vresult[4] __attribute__ ((aligned (16))); + int max1=0, max2=0; + vector signed int v1, v2, v3, v4, v5, v6, v7, vmax; + vector unsigned char vmask,vc1,vc2,vc3,vc4; + + if(end - ix < 8) goto normal; + int i = (end-ix)/4; + int remain = (end-ix)%4; + vc1 = vec_splat_u8(1); + vc2 = vec_splat_u8(5); + vc3 = vec_sl(vc1,vc2); + vc4 = vec_sl(vc3,vc1); + + v1 = vec_ld(0, ix); + vmask = vec_lvsl(0, ix); + vmax = vec_xor(vmax, vmax); + + while(i--) { + v2 = vec_ld(16, ix); + v3 = vec_perm(v1, v2, vmask); + v1 = v2; + vmax = vec_max(vmax,v3); + ix += 4; + } + + v4 = vec_slo(vmax,vc3); + v5 = vec_max(vmax,v4); + v6 = vec_slo(v5,vc4); + v7 = vec_max(v5,v6); + vec_st(v7,0,vresult); + + max1 = vresult[0]; + if(!remain) return max1; + //max2 = vresult[2]; + /*if(vresult[2] > max1) max1 = vresult[2]; + if(vresult[3] > max2) max2 = vresult[3];*/ + + normal: + + do{ + int x1 = *ix++; + int x2 = *ix++; + if (max1 < x1) max1 = x1; + if (max2 < x2) max2 = x2; + } while (ix < end); + if(max1 < max2) max1 = max2; + + return max1; +} +#else static int ix_max(const int *ix, const int *end) { @@ -438,14 +640,14 @@ ix_max(const int *ix, const int *end) max1 = max2; return max1; } +#endif - - +#if !defined(__ALTIVEC__) || (defined(__ALTIVEC__) && !defined(_ARCH_PPC64)) static int count_bit_ESC(const int *ix, const int *const end, int t1, const int t2, unsigned int *const s) { @@ -481,6 +683,7 @@ 
count_bit_ESC(const int *ix, const int *const end, int t1, const int t2, unsigne *s += sum; return t1; } +#endif static int @@ -507,6 +710,7 @@ static const int huf_tbl_noESC[] = { }; +#if !defined(__ALTIVEC__) static int count_bit_noESC_from2(const int *ix, const int *end, int max, unsigned int *s) { @@ -533,6 +737,7 @@ count_bit_noESC_from2(const int *ix, const int *end, int max, unsigned int *s) *s += sum; return t1; } +#endif inline static int @@ -572,6 +777,651 @@ count_bit_noESC_from3(const int *ix, const int *end, int max, unsigned int * s) return t; } +#if __ALTIVEC__ +#if _ARCH_PPC64 +static int +count_bit_ESC_altivec(const int *ix, const int *const end, int t1, const int t2, int *const s) +{ + /* ESC-table is used */ + int const linbits = ht[t1].xlen * 65536 + ht[t2].xlen; + int sum = 0, sum2; + vector signed int v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15,v16; + vector unsigned int vsum; + vector unsigned char vmask,vperm1,vperm2,vshamt; + vector unsigned char vzero,vs1,vs2,vs3,vs4,vs5,vs6,vlimit1,vlimit2,vone; + unsigned char tmp[16] __attribute__ ((aligned (16))); + unsigned int tmp2[4] __attribute__ ((aligned (16))); + + vperm1 = (vector unsigned char)VINIT16(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); + vperm2 = (vector unsigned char)VINIT16(4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); + vlimit1 = vec_splat_u8(14); + vlimit2 = vec_splat_u8(15); + vone = vec_splat_u8(1); + vshamt = vec_splat_u8(4); + vzero = vec_xor(vzero,vzero); + vsum = vec_xor(vsum,vsum); + + if((int)(end - ix) < 32) goto normal; + v0 = vec_ld(0,ix); + vmask = vec_lvsl(0,ix); + do { + v1 = vec_ld(16,ix); + v2 = vec_ld(32,ix); + v3 = vec_ld(48,ix); + v4 = vec_ld(64,ix); + v5 = vec_ld(80,ix); + v6 = vec_ld(96,ix); + v7 = vec_ld(112,ix); + v8 = vec_ld(128,ix); + v9 = vec_perm(v0,v1,vmask); + v10 = vec_perm(v1,v2,vmask); + v11 = vec_perm(v2,v3,vmask); + v12 = vec_perm(v3,v4,vmask); + v13 = vec_perm(v4,v5,vmask); + v14 = vec_perm(v5,v6,vmask); + v15 = vec_perm(v6,v7,vmask); + v16 = vec_perm(v7,v8,vmask); + v0 = v8; + v1 = vec_perm(v9,v10,vperm1); + v2 = vec_perm(v9,v10,vperm2); + v3 = vec_perm(v11,v12,vperm1); + v4 = vec_perm(v11,v12,vperm2); + v5 = vec_perm(v13,v14,vperm1); + v6 = vec_perm(v13,v14,vperm2); + v7 = vec_perm(v15,v16,vperm1); + v8 = vec_perm(v15,v16,vperm2); + + v1 = (vector signed int)vec_packs(v1,v3); + v2 = (vector signed int)vec_packs(v2,v4); + v3 = (vector signed int)vec_packs(v5,v7); + v4 = (vector signed int)vec_packs(v6,v8); + vs1 = vec_packs((vector unsigned short)v1,(vector unsigned short)v3); + vs2 = vec_packs((vector unsigned short)v2,(vector unsigned short)v4); + vs3 = vec_sel(vs1,vlimit2,vec_cmpgt(vs1,vlimit1)); + vs4 = vec_sel(vs2,vlimit2,vec_cmpgt(vs2,vlimit1)); + vs5 = vec_sel(vzero,vone,vec_cmpgt(vs1,vlimit1)); + vs6 = vec_sel(vzero,vone,vec_cmpgt(vs2,vlimit1)); + vs5 = vec_add(vs5,vs6); + vsum = vec_sum4s(vs5,vsum); + vs3 = vec_sl(vs3,vshamt); + vs3 = vec_add(vs3,vs4); + vec_st(vs3,0,tmp); + + sum += largetbl[tmp[0]]; + sum += largetbl[tmp[1]]; + sum += largetbl[tmp[2]]; + sum += largetbl[tmp[3]]; + sum += largetbl[tmp[4]]; + sum += largetbl[tmp[5]]; + sum += largetbl[tmp[6]]; + sum += largetbl[tmp[7]]; + sum += largetbl[tmp[8]]; + sum += largetbl[tmp[9]]; + sum += largetbl[tmp[10]]; + sum += largetbl[tmp[11]]; + sum += largetbl[tmp[12]]; + sum += largetbl[tmp[13]]; + sum += largetbl[tmp[14]]; + sum += largetbl[tmp[15]]; + + ix += 32; + } while(ix < end-31); + + vsum = (vector unsigned int)vec_sums((vector signed int)vsum,(vector signed int)vzero); + 
vec_st(vsum,0,tmp2); + sum += tmp2[3] * linbits; + + while (ix < end) { + unsigned int x = *ix++; + unsigned int y = *ix++; + + if (x >= 15u) { + x = 15u; + sum += linbits; + } + if (y >= 15u) { + y = 15u; + sum += linbits; + } + x <<= 4u; + x += y; + sum += largetbl[x]; + } + goto end; + +normal: + do { + unsigned int x = *ix++; + unsigned int y = *ix++; + + if (x >= 15u) { + x = 15u; + sum += linbits; + } + if (y >= 15u) { + y = 15u; + sum += linbits; + } + x <<= 4u; + x += y; + sum += largetbl[x]; + } while (ix < end); + +end: + sum2 = sum & 0xffffu; + sum >>= 16u; + + if (sum > sum2) { + sum = sum2; + t1 = t2; + } + + *s += sum; + return t1; +} +#endif + +inline static int +count_bit_noESC_from2_altivec1(const int *ix, const int *end, int max, unsigned int *s) +{ + int t1 = huf_tbl_noESC[max - 1]; + /* No ESC-words */ + unsigned int sum = 0; + int sum1, sum2; + const unsigned int xlen = 3; + const unsigned int *table = table23; + vector signed int v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15,v16; + vector signed int vx1,vx2,vx3,vx4,vx5,vx6,vxlen,vzero,vsum1,vsum2; + vector unsigned char vmask,vperm1,vperm2,vx; + vector unsigned char vhlen1,vhlen2; + vector signed char vs1,vs2; + + vhlen1 = (vector unsigned char)VINIT16(1,4,7,4,5,7,6,7,8,0,0,0,0,0,0,0); + vhlen2 = (vector unsigned char)VINIT16(2,3,7,4,4,7,6,7,8,0,0,0,0,0,0,0); + vperm1 = (vector unsigned char)VINIT16(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); + vperm2 = (vector unsigned char)VINIT16(4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); + vxlen = vec_splat_s32(3); + vzero = vec_xor(vzero,vzero); + vsum1 = vec_xor(vsum1,vsum1); + vsum2 = vec_xor(vsum2,vsum2); + + if((int)(end - ix) < 32) goto normal; + v0 = vec_ld(0,ix); + vmask = vec_lvsl(0,ix); + do { + v1 = vec_ld(16,ix); + v2 = vec_ld(32,ix); + v3 = vec_ld(48,ix); + v4 = vec_ld(64,ix); + v5 = vec_ld(80,ix); + v6 = vec_ld(96,ix); + v7 = vec_ld(112,ix); + v8 = vec_ld(128,ix); + v9 = vec_perm(v0,v1,vmask); + v10 = vec_perm(v1,v2,vmask); + v11 = vec_perm(v2,v3,vmask); + v12 = vec_perm(v3,v4,vmask); + v13 = vec_perm(v4,v5,vmask); + v14 = vec_perm(v5,v6,vmask); + v15 = vec_perm(v6,v7,vmask); + v16 = vec_perm(v7,v8,vmask); + v0 = v8; + v1 = vec_perm(v9,v10,vperm1); + v2 = vec_perm(v9,v10,vperm2); + v3 = vec_perm(v11,v12,vperm1); + v4 = vec_perm(v11,v12,vperm2); + v5 = vec_perm(v13,v14,vperm1); + v6 = vec_perm(v13,v14,vperm2); + v7 = vec_perm(v15,v16,vperm1); + v8 = vec_perm(v15,v16,vperm2); + vx1 = (vector signed int)vec_mladd((vector unsigned short)v1,(vector unsigned short)vxlen,(vector unsigned short)v2); + vx2 = (vector signed int)vec_mladd((vector unsigned short)v3,(vector unsigned short)vxlen,(vector unsigned short)v4); + vx3 = (vector signed int)vec_pack(vx1,vx2); + vx4 = (vector signed int)vec_mladd((vector unsigned short)v5,(vector unsigned short)vxlen,(vector unsigned short)v6); + vx5 = (vector signed int)vec_mladd((vector unsigned short)v7,(vector unsigned short)vxlen,(vector unsigned short)v8); + vx6 = (vector signed int)vec_pack(vx4,vx5); + vx = (vector unsigned char)vec_pack((vector unsigned short)vx3,(vector unsigned short)vx6); + + vs1 = (vector signed char)vec_perm(vhlen1,vhlen1,vx); + vs2 = (vector signed char)vec_perm(vhlen2,vhlen2,vx); + + vsum1 = vec_sum4s(vs1,vsum1); + vsum2 = vec_sum4s(vs2,vsum2); + + ix += 32; + } while(ix < end-31); + + vsum1 = vec_sums(vsum1,vzero); + vsum2 = vec_sums(vsum2,vzero); + + vsum1 = vec_perm(vsum1,vsum1,vec_lvsr(4,&sum1)); + vsum2 = vec_perm(vsum2,vsum2,vec_lvsr(4,&sum2)); + vec_ste(vsum1,0,&sum1); + 
vec_ste(vsum2,0,&sum2); + + while (ix < end) { + unsigned int const x0 = *ix++; + unsigned int const x1 = *ix++; + sum += table[ x0 * xlen + x1 ]; + } + + sum2 += sum & 0xffffu; + sum = (sum>>16u) + sum1; + + goto end; + +normal: + do { + unsigned int const x0 = *ix++; + unsigned int const x1 = *ix++; + sum += table[ x0 * xlen + x1 ]; + } while (ix < end); + + sum2 = sum & 0xffffu; + sum >>= 16u; + +end: + if (sum > sum2) { + sum = sum2; + t1++; + } + + *s += sum; + return t1; +} + +inline static int +count_bit_noESC_from2_altivec2(const int *ix, const int *end, int max, unsigned int *s) +{ + int t1 = huf_tbl_noESC[max - 1]; + /* No ESC-words */ + unsigned int sum = 0; + int sum1, sum2; + const unsigned int xlen = 4; + const unsigned int *table = table56; + vector signed int v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15,v16; + vector signed int vx1,vx2,vx3,vx4,vx5,vx6,vxlen,vzero,vsum1,vsum2; + vector unsigned char vmask,vperm1,vperm2,vx; + vector unsigned char vhlen1,vhlen2; + vector signed char vs1,vs2; + + vhlen1 = (vector unsigned char)VINIT16(1,4,7,8,4,5,8,9,7,8,9,10,8,8,9,10); + vhlen2 = (vector unsigned char)VINIT16(3,4,6,8,4,4,6,7,5,6,7,8,7,7,8,9); + vperm1 = (vector unsigned char)VINIT16(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); + vperm2 = (vector unsigned char)VINIT16(4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); + vxlen = vec_splat_s32(4); + vzero = vec_xor(vzero,vzero); + vsum1 = vec_xor(vsum1,vsum1); + vsum2 = vec_xor(vsum2,vsum2); + + if((int)(end - ix) < 32) goto normal; + v0 = vec_ld(0,ix); + vmask = vec_lvsl(0,ix); + do { + v1 = vec_ld(16,ix); + v2 = vec_ld(32,ix); + v3 = vec_ld(48,ix); + v4 = vec_ld(64,ix); + v5 = vec_ld(80,ix); + v6 = vec_ld(96,ix); + v7 = vec_ld(112,ix); + v8 = vec_ld(128,ix); + v9 = vec_perm(v0,v1,vmask); + v10 = vec_perm(v1,v2,vmask); + v11 = vec_perm(v2,v3,vmask); + v12 = vec_perm(v3,v4,vmask); + v13 = vec_perm(v4,v5,vmask); + v14 = vec_perm(v5,v6,vmask); + v15 = vec_perm(v6,v7,vmask); + v16 = vec_perm(v7,v8,vmask); + v0 = v8; + v1 = vec_perm(v9,v10,vperm1); + v2 = vec_perm(v9,v10,vperm2); + v3 = vec_perm(v11,v12,vperm1); + v4 = vec_perm(v11,v12,vperm2); + v5 = vec_perm(v13,v14,vperm1); + v6 = vec_perm(v13,v14,vperm2); + v7 = vec_perm(v15,v16,vperm1); + v8 = vec_perm(v15,v16,vperm2); + + vx1 = (vector signed int)vec_mladd((vector unsigned short)v1,(vector unsigned short)vxlen,(vector unsigned short)v2); + vx2 = (vector signed int)vec_mladd((vector unsigned short)v3,(vector unsigned short)vxlen,(vector unsigned short)v4); + vx3 = (vector signed int)vec_pack(vx1,vx2); + vx4 = (vector signed int)vec_mladd((vector unsigned short)v5,(vector unsigned short)vxlen,(vector unsigned short)v6); + vx5 = (vector signed int)vec_mladd((vector unsigned short)v7,(vector unsigned short)vxlen,(vector unsigned short)v8); + vx6 = (vector signed int)vec_pack(vx4,vx5); + vx = (vector unsigned char)vec_pack((vector unsigned short)vx3,(vector unsigned short)vx6); + + vs1 = (vector signed char)vec_perm(vhlen1,vhlen1,vx); + vs2 = (vector signed char)vec_perm(vhlen2,vhlen2,vx); + + vsum1 = vec_sum4s(vs1,vsum1); + vsum2 = vec_sum4s(vs2,vsum2); + + ix += 32; + } while(ix < end-31); + + vsum1 = vec_sums(vsum1,vzero); + vsum2 = vec_sums(vsum2,vzero); + + vsum1 = vec_perm(vsum1,vsum1,vec_lvsr(4,&sum1)); + vsum2 = vec_perm(vsum2,vsum2,vec_lvsr(4,&sum2)); + vec_ste(vsum1,0,&sum1); + vec_ste(vsum2,0,&sum2); + + while (ix < end) { + unsigned int const x0 = *ix++; + unsigned int const x1 = *ix++; + sum += table[ x0 * xlen + x1 ]; + } + + sum2 += sum & 0xffffu; + sum = 
(sum>>16u) + sum1; + + goto end; + +normal: + do { + unsigned int const x0 = *ix++; + unsigned int const x1 = *ix++; + sum += table[ x0 * xlen + x1 ]; + } while (ix < end); + + sum2 = sum & 0xffffu; + sum >>= 16u; + +end: + if (sum > sum2) { + sum = sum2; + t1++; + } + + *s += sum; + return t1; +} + +inline static int +count_bit_noESC_from3_altivec1(const int *ix, const int *const end, int max, unsigned int *s) +{ + int t1 = huf_tbl_noESC[max - 1]; + /* No ESC-words */ + unsigned int sum1 = 0; + unsigned int sum2 = 0; + unsigned int sum3 = 0; + const unsigned int xlen = 6; + const uint8_t *const hlen1 = ht[7].hlen; + const uint8_t *const hlen2 = ht[8].hlen; + const uint8_t *const hlen3 = ht[9].hlen; + int t; + vector signed int v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15,v16; + vector signed int vx1,vx2,vx3,vx4,vx5,vx6,vxlen,vzero,vsum1,vsum2,vsum3; + vector unsigned char vmask,vperm1,vperm2,vx,v31; + vector unsigned char vhlen11,vhlen12,vhlen13,vhlen21,vhlen22,vhlen23,vhlen31,vhlen32,vhlen33; + vector signed char vs1,vs2,vs3; + + vhlen11 = (vector unsigned char)VINIT16(1,4,7,9,9,10,4,6,8,9,9,10,7,7,9,10); + vhlen12 = (vector unsigned char)VINIT16(10,11,8,9,10,11,11,11,8,9,10,11,11,12,9,10); + vhlen13 = (vector unsigned char)VINIT16(11,12,12,12,0,0,0,0,0,0,0,0,0,0,0,0); + vhlen21 = (vector unsigned char)VINIT16(2,4,7,9,9,10,4,4,6,10,10,10,7,6,8,10); + vhlen22 = (vector unsigned char)VINIT16(10,11,9,10,10,11,11,12,9,9,10,11,12,12,10,10); + vhlen23 = (vector unsigned char)VINIT16(11,11,13,13,0,0,0,0,0,0,0,0,0,0,0,0); + vhlen31 = (vector unsigned char)VINIT16(3,4,6,7,9,10,4,5,6,7,8,10,5,6,7,8); + vhlen32 = (vector unsigned char)VINIT16(9,10,7,7,8,9,9,10,8,8,9,9,10,11,9,9); + vhlen33 = (vector unsigned char)VINIT16(10,10,11,11,0,0,0,0,0,0,0,0,0,0,0,0); + vperm1 = (vector unsigned char)VINIT16(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); + vperm2 = (vector unsigned char)VINIT16(4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); + v31 = (vector unsigned char)VINIT16ALL(31); + vxlen = vec_splat_s32(6); + vzero = vec_xor(vzero,vzero); + vsum1 = vec_xor(vsum1,vsum1); + vsum2 = vec_xor(vsum2,vsum2); + vsum3 = vec_xor(vsum3,vsum3); + + if((int)(end - ix) < 32) goto normal; + //int *end2 = ix + 32*((int)(end - ix)/32); + v0 = vec_ld(0,ix); + vmask = vec_lvsl(0,ix); + do { + v1 = vec_ld(16,ix); + v2 = vec_ld(32,ix); + v3 = vec_ld(48,ix); + v4 = vec_ld(64,ix); + v5 = vec_ld(80,ix); + v6 = vec_ld(96,ix); + v7 = vec_ld(112,ix); + v8 = vec_ld(128,ix); + v9 = vec_perm(v0,v1,vmask); + v10 = vec_perm(v1,v2,vmask); + v11 = vec_perm(v2,v3,vmask); + v12 = vec_perm(v3,v4,vmask); + v13 = vec_perm(v4,v5,vmask); + v14 = vec_perm(v5,v6,vmask); + v15 = vec_perm(v6,v7,vmask); + v16 = vec_perm(v7,v8,vmask); + v0 = v8; + v1 = vec_perm(v9,v10,vperm1); + v2 = vec_perm(v9,v10,vperm2); + v3 = vec_perm(v11,v12,vperm1); + v4 = vec_perm(v11,v12,vperm2); + v5 = vec_perm(v13,v14,vperm1); + v6 = vec_perm(v13,v14,vperm2); + v7 = vec_perm(v15,v16,vperm1); + v8 = vec_perm(v15,v16,vperm2); + vx1 = (vector signed int)vec_mladd((vector unsigned short)v1,(vector unsigned short)vxlen,(vector unsigned short)v2); + vx2 = (vector signed int)vec_mladd((vector unsigned short)v3,(vector unsigned short)vxlen,(vector unsigned short)v4); + vx3 = (vector signed int)vec_pack(vx1,vx2); + vx4 = (vector signed int)vec_mladd((vector unsigned short)v5,(vector unsigned short)vxlen,(vector unsigned short)v6); + vx5 = (vector signed int)vec_mladd((vector unsigned short)v7,(vector unsigned short)vxlen,(vector unsigned short)v8); + vx6 = (vector signed 
int)vec_pack(vx4,vx5); + vx = (vector unsigned char)vec_pack((vector unsigned short)vx3,(vector unsigned short)vx6); + + v1 = (vector signed int)vec_perm(vhlen11,vhlen12,vx); + v2 = (vector signed int)vec_perm(vhlen13,vhlen13,vx); + v3 = (vector signed int)vec_perm(vhlen21,vhlen22,vx); + v4 = (vector signed int)vec_perm(vhlen23,vhlen23,vx); + v5 = (vector signed int)vec_perm(vhlen31,vhlen32,vx); + v6 = (vector signed int)vec_perm(vhlen33,vhlen33,vx); + v7 = (vector signed int)vec_cmpgt(vx,v31); + vs1 = (vector signed char)vec_sel(v1,v2,(vector unsigned int)v7); + vs2 = (vector signed char)vec_sel(v3,v4,(vector unsigned int)v7); + vs3 = (vector signed char)vec_sel(v5,v6,(vector unsigned int)v7); + + vsum1 = vec_sum4s(vs1,vsum1); + vsum2 = vec_sum4s(vs2,vsum2); + vsum3 = vec_sum4s(vs3,vsum3); + + ix += 32; + } while(ix < end-31); + + vsum1 = vec_sums(vsum1,vzero); + vsum2 = vec_sums(vsum2,vzero); + vsum3 = vec_sums(vsum3,vzero); + + vsum1 = vec_perm(vsum1,vsum1,vec_lvsr(4,&sum1)); + vsum2 = vec_perm(vsum2,vsum2,vec_lvsr(4,&sum2)); + vsum3 = vec_perm(vsum3,vsum3,vec_lvsr(4,&sum3)); + vec_ste(vsum1,0,(signed int *)&sum1); + vec_ste(vsum2,0,(signed int *)&sum2); + vec_ste(vsum3,0,(signed int *)&sum3); + + while (ix < end) { + int x = ix[0] * xlen + ix[1]; + ix += 2; + sum1 += hlen1[x]; + sum2 += hlen2[x]; + sum3 += hlen3[x]; + } + goto end; + + normal: + + do { + int x = ix[0] * xlen + ix[1]; + ix += 2; + sum1 += hlen1[x]; + sum2 += hlen2[x]; + sum3 += hlen3[x]; + } while (ix < end); + + end: + + t = t1; + if (sum1 > sum2) { + sum1 = sum2; + t++; + } + if (sum1 > sum3) { + sum1 = sum3; + t = t1+2; + } + *s += sum1; + + return t; +} + +inline static int +count_bit_noESC_from3_altivec2(const int *ix, const int *const end, int max, unsigned int *s) +{ + int t1 = huf_tbl_noESC[max - 1]; + /* No ESC-words */ + unsigned int sum1 = 0; + unsigned int sum2 = 0; + unsigned int sum3 = 0; + const unsigned int xlen = 8; + const uint8_t *const hlen1 = ht[10].hlen; + const uint8_t *const hlen2 = ht[11].hlen; + const uint8_t *const hlen3 = ht[12].hlen; + int t; + vector signed int v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15,v16; + vector signed int vx1,vx2,vx3,vx4,vx5,vx6,vxlen,vzero,vsum1,vsum2,vsum3; + vector unsigned char vmask,vperm1,vperm2,vx,v31; + vector unsigned char vhlen11,vhlen12,vhlen13,vhlen14,vhlen21,vhlen22,vhlen23,vhlen24,vhlen31,vhlen32,vhlen33,vhlen34; + vector signed char vs1,vs2,vs3; + + vhlen11 = (vector unsigned char)VINIT16( 1, 4, 7, 9, 10, 10, 10, 11, 4, 6, 8, 9, 10, 11, 10, 10); + vhlen12 = (vector unsigned char)VINIT16( 7, 8, 9, 10, 11, 12, 11, 11, 8, 9, 10, 11, 12, 12, 11, 12); + vhlen13 = (vector unsigned char)VINIT16( 9, 10, 11, 12, 12, 12, 12, 12,10, 11, 12, 12, 13, 13, 12, 13); + vhlen14 = (vector unsigned char)VINIT16( 9, 10, 11, 12, 12, 12, 13, 13,10, 10, 11, 12, 12, 13, 13, 13); + vhlen21 = (vector unsigned char)VINIT16( 2, 4, 6, 8, 9, 10, 9, 10, 4, 5, 6, 8, 10, 10, 9, 10); + vhlen22 = (vector unsigned char)VINIT16( 6, 7, 8, 9, 10, 11, 10, 10, 8, 8, 9, 11, 10, 12, 10, 11); + vhlen23 = (vector unsigned char)VINIT16( 9, 10, 10, 11, 11, 12, 11, 12, 9, 10, 11, 12, 12, 13, 12, 13); + vhlen24 = (vector unsigned char)VINIT16( 9, 9, 9, 10, 11, 12, 12, 12, 9, 9, 10, 11, 12, 12, 12, 12); + vhlen31 = (vector unsigned char)VINIT16( 4, 4, 6, 8, 9, 10, 10, 10, 4, 5, 6, 7, 9, 9, 10, 10); + vhlen32 = (vector unsigned char)VINIT16( 6, 6, 7, 8, 9, 10, 9, 10, 7, 7, 8, 8, 9, 10, 10, 10); + vhlen33 = (vector unsigned char)VINIT16( 8, 8, 9, 9, 10, 10, 10, 11, 9, 9, 10, 10, 10, 11, 10, 
11); + vhlen34 = (vector unsigned char)VINIT16( 9, 9, 9, 10, 10, 11, 11, 12,10, 10, 10, 11, 11, 11, 11, 12); + vperm1 = (vector unsigned char)VINIT16(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); + vperm2 = (vector unsigned char)VINIT16(4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); + v31 = (vector unsigned char)VINIT16ALL(31); + vxlen = vec_splat_s32(8); + vzero = vec_xor(vzero,vzero); + vsum1 = vec_xor(vsum1,vsum1); + vsum2 = vec_xor(vsum2,vsum2); + vsum3 = vec_xor(vsum3,vsum3); + + if((int)(end - ix) < 32) goto normal; + //int *end2 = ix + 32*((int)(end - ix)/32); + v0 = vec_ld(0,ix); + vmask = vec_lvsl(0,ix); + do { + v1 = vec_ld(16,ix); + v2 = vec_ld(32,ix); + v3 = vec_ld(48,ix); + v4 = vec_ld(64,ix); + v5 = vec_ld(80,ix); + v6 = vec_ld(96,ix); + v7 = vec_ld(112,ix); + v8 = vec_ld(128,ix); + v9 = vec_perm(v0,v1,vmask); + v10 = vec_perm(v1,v2,vmask); + v11 = vec_perm(v2,v3,vmask); + v12 = vec_perm(v3,v4,vmask); + v13 = vec_perm(v4,v5,vmask); + v14 = vec_perm(v5,v6,vmask); + v15 = vec_perm(v6,v7,vmask); + v16 = vec_perm(v7,v8,vmask); + v0 = v8; + v1 = vec_perm(v9,v10,vperm1); + v2 = vec_perm(v9,v10,vperm2); + v3 = vec_perm(v11,v12,vperm1); + v4 = vec_perm(v11,v12,vperm2); + v5 = vec_perm(v13,v14,vperm1); + v6 = vec_perm(v13,v14,vperm2); + v7 = vec_perm(v15,v16,vperm1); + v8 = vec_perm(v15,v16,vperm2); + + vx1 = (vector signed int)vec_mladd((vector unsigned short)v1,(vector unsigned short)vxlen,(vector unsigned short)v2); + vx2 = (vector signed int)vec_mladd((vector unsigned short)v3,(vector unsigned short)vxlen,(vector unsigned short)v4); + vx3 = (vector signed int)vec_pack(vx1,vx2); + vx4 = (vector signed int)vec_mladd((vector unsigned short)v5,(vector unsigned short)vxlen,(vector unsigned short)v6); + vx5 = (vector signed int)vec_mladd((vector unsigned short)v7,(vector unsigned short)vxlen,(vector unsigned short)v8); + vx6 = (vector signed int)vec_pack(vx4,vx5); + vx = (vector unsigned char)vec_pack((vector unsigned short)vx3,(vector unsigned short)vx6); + + v1 = (vector signed int)vec_perm(vhlen11,vhlen12,vx); + v2 = (vector signed int)vec_perm(vhlen13,vhlen14,vx); + v3 = (vector signed int)vec_perm(vhlen21,vhlen22,vx); + v4 = (vector signed int)vec_perm(vhlen23,vhlen24,vx); + v5 = (vector signed int)vec_perm(vhlen31,vhlen32,vx); + v6 = (vector signed int)vec_perm(vhlen33,vhlen34,vx); + v7 = (vector signed int)vec_cmpgt(vx,v31); + vs1 = (vector signed char)vec_sel(v1,v2,(vector unsigned int)v7); + vs2 = (vector signed char)vec_sel(v3,v4,(vector unsigned int)v7); + vs3 = (vector signed char)vec_sel(v5,v6,(vector unsigned int)v7); + + vsum1 = vec_sum4s(vs1,vsum1); + vsum2 = vec_sum4s(vs2,vsum2); + vsum3 = vec_sum4s(vs3,vsum3); + + ix += 32; + } while(ix < end-31); + + vsum1 = vec_sums(vsum1,vzero); + vsum2 = vec_sums(vsum2,vzero); + vsum3 = vec_sums(vsum3,vzero); + + vsum1 = vec_perm(vsum1,vsum1,vec_lvsr(4,&sum1)); + vsum2 = vec_perm(vsum2,vsum2,vec_lvsr(4,&sum2)); + vsum3 = vec_perm(vsum3,vsum3,vec_lvsr(4,&sum3)); + vec_ste(vsum1,0,(signed int *)&sum1); + vec_ste(vsum2,0,(signed int *)&sum2); + vec_ste(vsum3,0,(signed int *)&sum3); + + while (ix < end) { + int x = ix[0] * xlen + ix[1]; + ix += 2; + sum1 += hlen1[x]; + sum2 += hlen2[x]; + sum3 += hlen3[x]; + } + goto end; + + normal: + + do { + int x = ix[0] * xlen + ix[1]; + ix += 2; + sum1 += hlen1[x]; + sum2 += hlen2[x]; + sum3 += hlen3[x]; + } while (ix < end); + + end: + + t = t1; + if (sum1 > sum2) { + sum1 = sum2; + t++; + } + if (sum1 > sum3) { + sum1 = sum3; + t = t1+2; + } + *s += sum1; + + return t; +} +#endif 
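/*
 * Illustrative aside (plain C, not part of the patch): the scalar shape of the
 * noESC bit counting above.  Each value pair indexes a code-length table as
 * x0*xlen + x1, the lengths for two candidate tables are summed over the whole
 * region, and the cheaper table wins; the vectorised versions do the same
 * lookup 32 values at a time with vec_perm and accumulate with vec_sum4s.
 * The length values below are the ones vhlen1/vhlen2 hold; the sample pairs
 * are made up.
 */
#include <stdio.h>

int main(void)
{
    static const unsigned char hlenA[9] = { 1, 4, 7, 4, 5, 7, 6, 7, 8 };
    static const unsigned char hlenB[9] = { 2, 3, 7, 4, 4, 7, 6, 7, 8 };
    static const int ix[] = { 0, 1,  2, 0,  1, 1,  0, 2 };  /* hypothetical pairs */
    const unsigned xlen = 3;
    unsigned sumA = 0, sumB = 0;
    size_t i;

    for (i = 0; i + 1 < sizeof ix / sizeof ix[0]; i += 2) {
        int idx = ix[i] * (int) xlen + ix[i + 1];
        sumA += hlenA[idx];
        sumB += hlenB[idx];
    }
    if (sumA <= sumB)
        printf("first table wins: %u bits\n", sumA);
    else
        printf("second table wins: %u bits\n", sumB);
    return 0;
}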
/*************************************************************************/ /* choose table */ @@ -599,12 +1449,21 @@ typedef int (*count_fnc)(const int* ix, const int* end, int max, unsigned int* s static const count_fnc count_fncs[] = { &count_bit_null , &count_bit_noESC +#if __ALTIVEC__ +, &count_bit_noESC_from2_altivec1 +, &count_bit_noESC_from2_altivec2 +, &count_bit_noESC_from3_altivec1 +, &count_bit_noESC_from3_altivec1 +, &count_bit_noESC_from3_altivec2 +, &count_bit_noESC_from3_altivec2 +#else , &count_bit_noESC_from2 , &count_bit_noESC_from2 , &count_bit_noESC_from3 , &count_bit_noESC_from3 , &count_bit_noESC_from3 , &count_bit_noESC_from3 +#endif , &count_bit_noESC_from3 , &count_bit_noESC_from3 , &count_bit_noESC_from3 @@ -621,7 +1480,11 @@ choose_table_nonMMX(const int *ix, const int *const end, int *const _s) unsigned int* s = (unsigned int*)_s; unsigned int max; int choice, choice2; +#if __ALTIVEC__ + max = ix_max_vec(ix, end); +#else max = ix_max(ix, end); +#endif if (max <= 15) { return count_fncs[max](ix, end, max, s); @@ -643,7 +1506,11 @@ choose_table_nonMMX(const int *ix, const int *const end, int *const _s) break; } } +#if defined(__ALTIVEC__) && defined(_ARCH_PPC64) + return count_bit_ESC_altivec(ix, end, choice, choice2, s); +#else return count_bit_ESC(ix, end, choice, choice2, s); +#endif } diff --git libmp3lame/util.c libmp3lame/util.c index 43b457c..ea4b204 100644 --- libmp3lame/util.c +++ libmp3lame/util.c @@ -26,6 +26,10 @@ # include #endif +#if defined(__ALTIVEC__) && !defined(_ARCH_PPC64) +#include +#endif + #include #include "lame.h" #include "machine.h" @@ -954,6 +958,108 @@ disable_FPE(void) * ***********************************************************************/ +#if defined(__ALTIVEC__) && !defined(_ARCH_PPC64) + +inline ieee754_float32_t fast_log10_altivec(ieee754_float32_t x) +{ + vector float va,vb,vc,vhalf,vzero,vsqrt2,vconst4; + vector float v1,v2,v3,v4,v5,v6,v7,v8,vz,vz2,vlog; + vector unsigned int vconst1,vconst2,vshamt; + vector signed int vconst3; + float out __attribute__ ((aligned (16))); + + va = (vector float)VINIT4ALL(0.8685890659); + vb = (vector float)VINIT4ALL(0.2894672153); + vc = (vector float)VINIT4ALL(0.1793365895); + vhalf = (vector float)VINIT4ALL(0.15051499783); + vsqrt2 = (vector float)VINIT4ALL(1.4142135623731); + vconst4 = (vector float)VINIT4ALL(0.301029995664); + vzero = vec_xor(vzero,vzero); + vconst1 = (vector unsigned int)vec_sr(vec_splat_s32(-1),vec_splat_u32(9)); + vconst2 = (vector unsigned int)vec_sr(vec_splat_s32(-1),vec_splat_u32(7)); + vconst2 = vec_nor(vconst2,vconst2); + vconst3 = (vector signed int)vec_rl(vconst2,vec_splat_u32(7)); + vshamt = vec_add(vec_splat_u32(9),vec_splat_u32(7)); + vshamt = vec_add(vshamt,vec_splat_u32(7)); + vconst2 = vec_sl((vector unsigned int)vconst3,vshamt); + + v1 = vec_ld(0,&x); + v2 = vec_perm(v1,v1,vec_lvsl(0,&x)); + v3 = vec_splat(v2,0); + + v4 = (vector float)vec_sel(vconst2,(vector unsigned int)v3,vconst1); + v5 = vec_add(v4,vsqrt2); + v6 = vec_sub(v4,vsqrt2); + v7 = vec_re(v5); + vz = vec_madd(v6, vec_madd(vec_nmsub(v7,v5,(vector float)vconst2),v7,v7), vzero); + v8 = (vector float)vec_sr((vector unsigned int)v3,vshamt); + vlog = vec_ctf(vec_sub((vector signed int)v8,vconst3),0); + + vz2 = vec_madd(vz,vz,vzero); + vlog = vec_madd(vlog,vconst4,vhalf); + + v1 = vec_madd(vz2,vc,vb); + v2 = vec_madd(vz2,v1,va); + vlog = vec_madd(vz,v2,vlog); + + vec_ste(vlog,0,&out); + + return out; +} + +inline ieee754_float32_t fast_loge_altivec(ieee754_float32_t x) +{ + vector float 
va,vb,vc,vhalf,vzero,vsqrt2,vconst4; + vector float v1,v2,v3,v4,v5,v6,v7,v8,vz,vz2,vlog; + vector unsigned int vconst1,vconst2,vshamt; + vector signed int vconst3; + float out __attribute__ ((aligned (16))); + + va = (vector float)VINIT4ALL(2.0000006209); + vb = (vector float)VINIT4ALL(0.6664778517); + vc = (vector float)VINIT4ALL(0.4139745860); + vhalf = (vector float)VINIT4ALL(0.34657359028); + vsqrt2 = (vector float)VINIT4ALL(1.4142135623731); + vconst4 = (vector float)VINIT4ALL(0.6931471805599); + vzero = vec_xor(vzero,vzero); + vconst1 = (vector unsigned int)vec_sr(vec_splat_s32(-1),vec_splat_u32(9)); + vconst2 = (vector unsigned int)vec_sr(vec_splat_s32(-1),vec_splat_u32(7)); + vconst2 = vec_nor(vconst2,vconst2); + vconst3 = (vector signed int)vec_rl(vconst2,vec_splat_u32(7)); + vshamt = vec_add(vec_splat_u32(9),vec_splat_u32(7)); + vshamt = vec_add(vshamt,vec_splat_u32(7)); + vconst2 = vec_sl((vector unsigned int)vconst3,vshamt); + + v1 = vec_ld(0,&x); + v2 = vec_perm(v1,v1,vec_lvsl(0,&x)); + v3 = vec_splat(v2,0); + + v4 = (vector float)vec_sel(vconst2,(vector unsigned int)v3,vconst1); + v5 = vec_add(v4,vsqrt2); + v6 = vec_sub(v4,vsqrt2); + v7 = vec_re(v5); + vz = vec_madd(v6, vec_madd(vec_nmsub(v7,v5,(vector float)vconst2),v7,v7), vzero); + v8 = (vector float)vec_sr((vector unsigned int)v3,vshamt); + vlog = vec_ctf(vec_sub((vector signed int)v8,vconst3),0); + + vz2 = vec_madd(vz,vz,vzero); + vlog = vec_madd(vlog,vconst4,vhalf); + + v1 = vec_madd(vz2,vc,vb); + v2 = vec_madd(vz2,v1,va); + vlog = vec_madd(vz,v2,vlog); + + vec_ste(vlog,0,&out); + + return out; +} + +void +init_log_table(void) +{ +} + +#else #define LOG2_SIZE (512) #define LOG2_SIZE_L2 (9) @@ -1004,6 +1110,8 @@ fast_log2(ieee754_float32_t x) return log2val; } +#endif + #else /* Don't use FAST_LOG */ diff --git libmp3lame/util.h libmp3lame/util.h index 13f0cd4..a0b3b55 100644 --- libmp3lame/util.h +++ libmp3lame/util.h @@ -93,10 +93,17 @@ extern "C" { /* log/log10 approximations */ #ifdef USE_FAST_LOG +#if defined(__ALTIVEC__) && !defined(_ARCH_PPC64) +#define FAST_LOG10(x) (fast_log10_altivec(x)) +#define FAST_LOG(x) (fast_loge_altivec(x)) +#define FAST_LOG10_X(x,y) (fast_log10_altivec(x)*(y)) +#define FAST_LOG_X(x,y) (fast_loge_altivec(x)*(y)) +#else #define FAST_LOG10(x) (fast_log2(x)*(LOG2/LOG10)) #define FAST_LOG(x) (fast_log2(x)*LOG2) #define FAST_LOG10_X(x,y) (fast_log2(x)*(LOG2/LOG10*(y))) #define FAST_LOG_X(x,y) (fast_log2(x)*(LOG2*(y))) +#endif #else #define FAST_LOG10(x) log10(x) #define FAST_LOG(x) log(x) @@ -186,14 +193,14 @@ extern "C" { */ typedef struct { - FLOAT masking_lower[CBANDS]; + FLOAT masking_lower[CBANDS] __attribute__ ((aligned (16))); FLOAT minval[CBANDS]; FLOAT rnumlines[CBANDS]; FLOAT mld_cb[CBANDS]; FLOAT mld[Max(SBMAX_l,SBMAX_s)]; FLOAT bo_weight[Max(SBMAX_l,SBMAX_s)]; /* band weight long scalefactor bands, at transition */ FLOAT attack_threshold; /* short block tuning */ - int s3ind[CBANDS][2]; + int s3ind[CBANDS][4] __attribute__ ((aligned (16))); int numlines[CBANDS]; int bm[Max(SBMAX_l,SBMAX_s)]; int bo[Max(SBMAX_l,SBMAX_s)]; @@ -219,7 +226,7 @@ extern "C" { typedef struct { - FLOAT nb_l1[4][CBANDS], nb_l2[4][CBANDS]; + FLOAT nb_l1[4][CBANDS] __attribute__ ((aligned (16))), nb_l2[4][CBANDS] __attribute__ ((aligned (16))); FLOAT nb_s1[4][CBANDS], nb_s2[4][CBANDS]; III_psy_xmin thm[4]; @@ -246,7 +253,7 @@ extern "C" { /* variables used by encoder.c */ typedef struct { /* variables for newmdct.c */ - FLOAT sb_sample[2][2][18][SBLIMIT]; + FLOAT sb_sample[2][2][18][SBLIMIT] __attribute__ 
((aligned (16))); FLOAT amp_filter[32]; /* variables used by util.c */ @@ -293,7 +300,7 @@ extern "C" { #ifndef MFSIZE # define MFSIZE ( 3*1152 + ENCDELAY - MDCTDELAY ) #endif - sample_t mfbuf[2][MFSIZE]; + sample_t mfbuf[2][MFSIZE] __attribute__ ((aligned (16))); int mf_samples_to_encode; int mf_size; @@ -567,7 +574,12 @@ extern "C" { /* log/log10 approximations */ extern void init_log_table(void); +#if defined(__ALTIVEC__) && !defined(_ARCH_PPC64) + extern ieee754_float32_t fast_log10_altivec(ieee754_float32_t x); + extern ieee754_float32_t fast_loge_altivec(ieee754_float32_t x); +#else extern ieee754_float32_t fast_log2(ieee754_float32_t x); +#endif int isResamplingNecessary(SessionConfig_t const* cfg); diff --git libmp3lame/vbrquantize.c libmp3lame/vbrquantize.c index 0f703b7..67029c4 100644 --- libmp3lame/vbrquantize.c +++ libmp3lame/vbrquantize.c @@ -26,6 +26,10 @@ # include #endif +#if __ALTIVEC__ +#undef TAKEHIRO_IEEE754_HACK +#include +#endif #include "lame.h" #include "machine.h" @@ -217,8 +221,23 @@ k_34_4(DOUBLEX x[4], int l3[4]) static FLOAT calc_sfb_noise_x34(const FLOAT * xr, const FLOAT * xr34, unsigned int bw, uint8_t sf) { +#if __ALTIVEC__ + float vpow[8] __attribute__ ((aligned (16))); + vector float v0, v1, v2, v3, v4, v5, v6,v7,v8,v9,v10,v11,v12,v13; + vector unsigned char vperm1, vperm2,vc1,vc2,vc3; + vector signed int vl1,vl2,vl3; + vector float vxfsf, vsfpow, vsfpow34, vabs, vzero; + unsigned int s1,s2,s3,s4,s5,s6,s7,s8; + const vector float const1 = (vector float)VINIT4(0.4053964553387788,3.404263724373839,5.465086767819913,1.0); + const vector float const2 = (vector float)VINIT4(7.719205369637751,10.93017829043677,0,0); +#if _ARCH_PPC64 + vector unsigned int vmask1,vmask2,vmask3; +#else + vector unsigned char vperm3,vperm4,vc4,vc5,vc6,vmask; +#endif +#endif DOUBLEX x[4]; - int l3[4]; + int l3[4] __attribute__ ((aligned (16))); const FLOAT sfpow = pow20[sf + Q_MAX2]; /*pow(2.0,sf/4.0); */ const FLOAT sfpow34 = ipow20[sf]; /*pow(sfpow,-3.0/4.0); */ @@ -226,6 +245,239 @@ calc_sfb_noise_x34(const FLOAT * xr, const FLOAT * xr34, unsigned int bw, uint8_ unsigned int i = bw >> 2u; unsigned int const remaining = (bw & 0x03u); +#if __ALTIVEC__ + vpow[0] = sfpow; + vpow[1] = sfpow34; + vsfpow = vec_ld(0,vpow); + vxfsf = vec_xor(vxfsf,vxfsf); + vsfpow34 = vec_splat(vsfpow,1); + vsfpow = vec_splat(vsfpow,0); + vperm1 = vec_lvsl(0,xr); + vperm2 = vec_lvsl(0,xr34); + v0 = vec_ld(0,xr); + v1 = vec_ld(0,xr34); + vabs = (vector float)vec_splat_s32(-1); + vabs = (vector float)vec_sl((vector unsigned int)vabs, (vector unsigned int)vabs); + vzero = vec_xor(vzero,vzero); +#if _ARCH_PPC64 + vc1 = vec_splat_u8(1); + vc2 = vec_splat_u8(5); + vc3 = vec_sl(vc1,vc2); + vmask1 = (vector unsigned int)vec_splat_s32(-1); + vmask1 = vec_sro(vmask1,vc3); + vmask2 = vec_sro(vmask1,vc3); + vmask3 = vec_sro(vmask2,vc3); +#else + vperm3 = (vector unsigned char)VINIT16(0,0,0,0,0,0,0,0,0,1,2,3,16,17,18,19); + vperm4 = vec_sld(vperm3,(vector unsigned char)vzero,8); + vmask = (vector unsigned char)VINIT16ALL(16); +#endif + for (; i > 1; i -= 2) { + + v2 = vec_ld(16,xr34); + v3 = vec_ld(32,xr34); + v4 = vec_perm(v1,v2,vperm2); + v5 = vec_perm(v2,v3,vperm2); + v12 = vec_madd(v4,vsfpow34,vzero); + v13 = vec_madd(v5,vsfpow34,vzero); + v1 = v3; + + v2 = vec_floor(v12); + v3 = vec_floor(v13); + v4 = vec_splat(const1,2); + v5 = vec_splat(const1,1); + v6 = vec_splat(const2,1); + v7 = vec_splat(const2,0); + v8 = vec_madd(v2,v4,v5); + v9 = vec_madd(v3,v4,v5); + v10 = vec_madd(v2,v6,v7); + v11 = vec_madd(v3,v6,v7); 
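/*
 * Illustrative aside (plain C, not part of the patch): the quantity this
 * function accumulates, written out scalar.  Each spectral line is compared
 * with its reconstruction sfpow*pow43[l3] and the squared differences are
 * summed over the band; the vec_nmsub / vec_madd pairs do this four lines at
 * a time.  pow43_demo() stands in for LAME's pow43[] table of i^(4/3); the
 * inputs are made-up demo values.
 */
#include <math.h>
#include <stdio.h>

static double pow43_demo(int i)
{
    return pow((double) i, 4.0 / 3.0);
}

static double band_noise(const double *xr, const int *l3, int n, double sfpow)
{
    double noise = 0.0;
    int i;
    for (i = 0; i < n; i++) {
        double err = fabs(xr[i]) - sfpow * pow43_demo(l3[i]);
        noise += err * err;
    }
    return noise;
}

int main(void)
{
    const double xr[4] = { 0.9, -1.7, 2.4, 0.1 };
    const int    l3[4] = { 1, 2, 2, 0 };      /* hypothetical quantized lines */
    printf("band noise = %g\n", band_noise(xr, l3, 4, 1.0));
    return 0;
}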
+ v4 = vec_splat(const1,0); + v5 = vec_splat(const1,3); + v8 = vec_madd(v8,v2,v4); + v9 = vec_madd(v9,v3,v4); + v10 = vec_madd(v10,v2,v5); + v11 = vec_madd(v11,v3,v5); + v6 = vec_re(v10); + v7 = vec_re(v11); + v10 = vec_nmsub(v10,v6,v5); + v11 = vec_nmsub(v11,v7,v5); + v10 = vec_madd(v10,v6,v6); + v11 = vec_madd(v11,v7,v7); + v10 = vec_madd(v8,v10,v12); + v11 = vec_madd(v9,v11,v13); + + vl1 = vec_cts(v10,0); + vl2 = vec_cts(v11,0); + vl3 = (vector signed int)vec_pack(vl1,vl2); + vec_st(vl3,0,l3); + + s1 = l3[0] >> 16; + s2 = l3[0] & 0xffff; + s3 = l3[1] >> 16; + s4 = l3[1] & 0xffff; + s5 = l3[2] >> 16; + s6 = l3[2] & 0xffff; + s7 = l3[3] >> 16; + s8 = l3[3] & 0xffff; + +#if _ARCH_PPC64 + v2 = vec_lde(0,pow43+s1); + v3 = vec_lde(0,pow43+s2); + v4 = vec_lde(0,pow43+s3); + v5 = vec_lde(0,pow43+s4); + v2 = vec_perm(v2,v2,vec_lvsl(0,pow43+s1)); + v3 = vec_perm(v3,v3,vec_lvsl(-4,pow43+s2)); + v4 = vec_perm(v4,v4,vec_lvsl(-8,pow43+s3)); + v5 = vec_perm(v5,v5,vec_lvsl(-12,pow43+s4)); + v12 = vec_sel(v2,v3,vmask1); + v12 = vec_sel(v12,v4,vmask2); + v12 = vec_sel(v12,v5,vmask3); + + v2 = vec_lde(0,pow43+s5); + v3 = vec_lde(0,pow43+s6); + v4 = vec_lde(0,pow43+s7); + v5 = vec_lde(0,pow43+s8); + v2 = vec_perm(v2,v2,vec_lvsl(0,pow43+s5)); + v3 = vec_perm(v3,v3,vec_lvsl(-4,pow43+s6)); + v4 = vec_perm(v4,v4,vec_lvsl(-8,pow43+s7)); + v5 = vec_perm(v5,v5,vec_lvsl(-12,pow43+s8)); + v13 = vec_sel(v2,v3,vmask1); + v13 = vec_sel(v13,v4,vmask2); + v13 = vec_sel(v13,v5,vmask3); +#else + vc1 = vec_lvsl(0,pow43+s1); + vc2 = vec_lvsl(0,pow43+s2); + vc3 = vec_lvsl(0,pow43+s3); + vc4 = vec_lvsl(0,pow43+s4); + vc2 = vec_or(vc2,vmask); + vc4 = vec_or(vc4,vmask); + v2 = vec_lde(0,pow43+s1); + v3 = vec_lde(0,pow43+s2); + v4 = vec_lde(0,pow43+s3); + v5 = vec_lde(0,pow43+s4); + vc5 = vec_perm(vc1,vc2,vperm3); + vc6 = vec_perm(vc3,vc4,vperm4); + v6 = vec_perm(v2,v3,vc5); + v7 = vec_perm(v4,v5,vc6); + v12 = vec_sld(v6,v7,8); + + vc1 = vec_lvsl(0,pow43+s5); + vc2 = vec_lvsl(0,pow43+s6); + vc3 = vec_lvsl(0,pow43+s7); + vc4 = vec_lvsl(0,pow43+s8); + vc2 = vec_or(vc2,vmask); + vc4 = vec_or(vc4,vmask); + v2 = vec_lde(0,pow43+s5); + v3 = vec_lde(0,pow43+s6); + v4 = vec_lde(0,pow43+s7); + v5 = vec_lde(0,pow43+s8); + vc5 = vec_perm(vc1,vc2,vperm3); + vc6 = vec_perm(vc3,vc4,vperm4); + v6 = vec_perm(v2,v3,vc5); + v7 = vec_perm(v4,v5,vc6); + v13 = vec_sld(v6,v7,8); +#endif + + v2 = vec_ld(16, xr); + v3 = vec_ld(32, xr); + v6 = vec_perm(v0,v2,vperm1); + v7 = vec_perm(v2,v3,vperm1); + v0 = v3; + v8 = vec_andc(v6,vabs); + v9 = vec_andc(v7,vabs); + v10 = vec_nmsub(vsfpow, v12, v8); + v11 = vec_nmsub(vsfpow, v13, v9); + vxfsf = vec_madd(v10, v10, vxfsf); + vxfsf = vec_madd(v11, v11, vxfsf); + + xr += 8; + xr34 += 8; + } + if (i) { +#if _ARCH_PPC64 + x[0] = sfpow34 * xr34[0]; + x[1] = sfpow34 * xr34[1]; + x[2] = sfpow34 * xr34[2]; + x[3] = sfpow34 * xr34[3]; + + k_34_4(x, l3); + + vpow[0] = pow43[l3[0]]; + vpow[1] = pow43[l3[1]]; + vpow[2] = pow43[l3[2]]; + vpow[3] = pow43[l3[3]]; + v1 = vec_ld(0, vpow); + v2 = vec_ld(16, xr); + v3 = vec_perm(v0,v2,vperm1); + v4 = vec_andc(v3,vabs); + v5 = vec_nmsub(vsfpow, v1, v4); + vxfsf = vec_madd(v5, v5, vxfsf); +#else + v2 = vec_ld(16,xr34); + v3 = vec_perm(v1,v2,vperm2); + v4 = vec_madd(v3,vsfpow34,vzero); + vl1 = vec_cts(v4,0); + vec_st(vl1,0,l3); + + v5 = vec_lde(0,adj43+l3[0]); + v6 = vec_lde(0,adj43+l3[1]); + v7 = vec_lde(0,adj43+l3[2]); + v8 = vec_lde(0,adj43+l3[3]); + v9 = vec_perm(v5,v5,vec_lvsl(0,adj43+l3[0])); + v10 = vec_perm(v6,v6,vec_lvsl(-4,adj43+l3[1])); + v11 = 
vec_perm(v7,v7,vec_lvsl(-8,adj43+l3[2])); + v12 = vec_perm(v8,v8,vec_lvsl(-12,adj43+l3[3])); + v9 = vec_or(v9,v10); + v9 = vec_or(v9,v11); + v9 = vec_or(v9,v12); + + v10 = vec_add(v4,v9); + vl1 = vec_cts(v10,0); + vec_st(vl1,0,l3); + + v2 = vec_lde(0,pow43+l3[0]); + v3 = vec_lde(0,pow43+l3[1]); + v4 = vec_lde(0,pow43+l3[2]); + v5 = vec_lde(0,pow43+l3[3]); + v6 = vec_perm(v2,v2,vec_lvsl(0,pow43+l3[0])); + v7 = vec_perm(v3,v3,vec_lvsl(-4,pow43+l3[1])); + v8 = vec_perm(v4,v4,vec_lvsl(-8,pow43+l3[2])); + v9 = vec_perm(v5,v5,vec_lvsl(-12,pow43+l3[3])); + v6 = vec_or(v6,v7); + v6 = vec_or(v6,v8); + v6 = vec_or(v6,v9); + + v2 = vec_ld(16, xr); + v3 = vec_perm(v0,v2,vperm1); + v4 = vec_andc(v3,vabs); + v5 = vec_nmsub(vsfpow, v6, v4); + vxfsf = vec_madd(v5, v5, vxfsf); +#endif + xr += 4; + xr34 += 4; + } + if (remaining) { + x[0] = x[1] = x[2] = x[3] = 0; + switch( remaining ) { + case 3: x[2] = sfpow34 * xr34[2]; + case 2: x[1] = sfpow34 * xr34[1]; + case 1: x[0] = sfpow34 * xr34[0]; + } + + k_34_4(x, l3); + x[0] = x[1] = x[2] = x[3] = 0; + + switch( remaining ) { + case 3: x[2] = fabsf(xr[2]) - sfpow * pow43[l3[2]]; + case 2: x[1] = fabsf(xr[1]) - sfpow * pow43[l3[1]]; + case 1: x[0] = fabsf(xr[0]) - sfpow * pow43[l3[0]]; + } + xfsf += (x[0] * x[0] + x[1] * x[1]) + (x[2] * x[2] + x[3] * x[3]); + } + vec_st(vxfsf,0,vpow); + return xfsf + vpow[0] + vpow[1] + vpow[2] + vpow[3]; +#else while (i-- > 0) { x[0] = sfpow34 * xr34[0]; x[1] = sfpow34 * xr34[1]; @@ -262,6 +514,7 @@ calc_sfb_noise_x34(const FLOAT * xr, const FLOAT * xr34, unsigned int bw, uint8_ xfsf += (x[0] * x[0] + x[1] * x[1]) + (x[2] * x[2] + x[3] * x[3]); } return xfsf; +#endif } diff --git libmp3lame/version.h libmp3lame/version.h index f5fef50..69edd42 100644 --- libmp3lame/version.h +++ libmp3lame/version.h @@ -31,13 +31,20 @@ # define STR(x) __STR(x) #endif -# define LAME_URL "http://lame.sf.net" - +#if __ALTIVEC__ +#if _ARCH_PPC64 +# define LAME_URL "+VMX+970 http://www.floodgap.com/software/lamevmx/" +#else +# define LAME_URL "+VMX http://www.floodgap.com/software/lamevmx/" +#endif +#else +# define LAME_URL "G3 http://www.floodgap.com/software/lamevmx/" +#endif # define LAME_MAJOR_VERSION 3 /* Major version number */ # define LAME_MINOR_VERSION 100 /* Minor version number */ # define LAME_TYPE_VERSION 2 /* 0:alpha 1:beta 2:release */ -# define LAME_PATCH_VERSION 0 /* Patch level */ +# define LAME_PATCH_VERSION 1 /* Patch level */ # define LAME_ALPHA_VERSION (LAME_TYPE_VERSION==0) # define LAME_BETA_VERSION (LAME_TYPE_VERSION==1) # define LAME_RELEASE_VERSION (LAME_TYPE_VERSION==2) diff --git ltmain.sh ltmain.sh index 0f0a2da..acc8382 100644 --- ltmain.sh +++ ltmain.sh @@ -5103,7 +5103,7 @@ func_extract_an_archive () if ($AR t "$f_ex_an_ar_oldlib" | sort | sort -uc >/dev/null 2>&1); then : else - func_fatal_error "object name conflicts in archive: $f_ex_an_ar_dir/$f_ex_an_ar_oldlib" +: # func_fatal_error "object name conflicts in archive: $f_ex_an_ar_dir/$f_ex_an_ar_oldlib" fi } diff --git mpglib/Makefile.in mpglib/Makefile.in index edc519f..7284783 100644 --- mpglib/Makefile.in +++ mpglib/Makefile.in @@ -426,22 +426,22 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tabinit.Plo@am__quote@ .c.o: -@am__fastdepCC_TRUE@ $(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< -@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po +@am__fastdepCC_TRUE@ $(AM_V_CC)$(COMPILE) -c -o $@ $< +#@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po 
@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ $< .c.obj: -@am__fastdepCC_TRUE@ $(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'` -@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po +@am__fastdepCC_TRUE@ $(AM_V_CC)$(COMPILE) -c -o $@ `$(CYGPATH_W) '$<'` +#@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po @AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ `$(CYGPATH_W) '$<'` .c.lo: -@am__fastdepCC_TRUE@ $(AM_V_CC)$(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< -@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo +@am__fastdepCC_TRUE@ $(AM_V_CC)$(LTCOMPILE) -c -o $@ $< +#@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo @AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@ @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LTCOMPILE) -c -o $@ $<
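#
# Note on the AltiVec fast path added to calc_sfb_noise_x34() in the
# vbrquantize.c hunk above: after quantizing eight spectral lines per
# iteration, the loop refines the approximate reciprocal returned by
# vec_re() with a single Newton-Raphson step before multiplying by it
# (v6 = vec_re(v10); v10 = vec_nmsub(v10,v6,one); v10 = vec_madd(v10,v6,v6),
# where "one" is the 1.0 element splatted out of const1). A minimal scalar
# sketch of that refinement follows; it is illustrative only and not part of
# the patch, and the helper name refine_recip() is hypothetical.
#
#   /* One Newton-Raphson iteration toward y ~= 1/x, starting from a
#      hardware estimate y0 (what vec_re() provides per element):
#      y1 = y0 + y0 * (1 - x*y0). */
#   static inline float refine_recip(float x, float y0)
#   {
#       float err = 1.0f - x * y0;   /* matches vec_nmsub(v10, v6, one) */
#       return y0 + y0 * err;        /* matches vec_madd(v10, v6, v6)   */
#   }
#
# One step roughly doubles the precision of the estimate, which is why the
# vector code can use the cheap vec_re() instead of a full divide inside the
# per-band noise loop.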