From ff8aca6fe38933d063cae3cbda1de5ce69424aeb Mon Sep 17 00:00:00 2001 From: Giovanni Mariani Date: Mon, 5 Mar 2018 12:13:56 +0100 Subject: [PATCH 01/32] Updated to release 1.5.3 and added P0 to fix CVE-2017-15232 --- .abf.yml | 2 +- libjpeg-turbo-1.5.2-CVE-2017-15232.patch | 53 ++++++++++++++++++++++++ libjpeg-turbo.spec | 29 ++++++++----- 3 files changed, 72 insertions(+), 12 deletions(-) create mode 100644 libjpeg-turbo-1.5.2-CVE-2017-15232.patch diff --git a/.abf.yml b/.abf.yml index 08f063d..ce19d84 100644 --- a/.abf.yml +++ b/.abf.yml @@ -1,2 +1,2 @@ sources: - libjpeg-turbo-1.5.0.tar.gz: 9adc21b927e48e4c6889e77079f6c1f3eecf98ab + libjpeg-turbo-1.5.3.tar.gz: 87ebf4cab2bb27fcb8e7ccb18ec4eb680e1f2c2d diff --git a/libjpeg-turbo-1.5.2-CVE-2017-15232.patch b/libjpeg-turbo-1.5.2-CVE-2017-15232.patch new file mode 100644 index 0000000..f67b1bf --- /dev/null +++ b/libjpeg-turbo-1.5.2-CVE-2017-15232.patch @@ -0,0 +1,53 @@ +From 1ecd9a5729d78518397889a630e3534bd9d963a8 Mon Sep 17 00:00:00 2001 +From: Kornel +Date: Sat, 30 Sep 2017 12:05:53 +0100 +Subject: [PATCH] Handle NULL buffer when discarding rows + +--- + jdpostct.c | 5 +++++ + jquant1.c | 4 ++++ + 2 files changed, 9 insertions(+) + +diff --git a/jdpostct.c b/jdpostct.c +index 601fc2a79..a24202ca9 100644 +--- a/jdpostct.c ++++ b/jdpostct.c +@@ -132,6 +132,11 @@ post_process_1pass (j_decompress_ptr cinfo, + my_post_ptr post = (my_post_ptr) cinfo->post; + JDIMENSION num_rows, max_rows; + ++ /* read_and_discard_scanlines may call it with rows "available", but no buffer */ ++ if (output_buf == NULL) { ++ return; ++ } ++ + /* Fill the buffer, but not more than what we can dump out in one go. */ + /* Note we rely on the upsampler to detect bottom of image. */ + max_rows = out_rows_avail - *out_row_ctr; +diff --git a/jquant1.c b/jquant1.c +index e7814815e..ba2ea9b80 100644 +--- a/jquant1.c ++++ b/jquant1.c +@@ -531,6 +531,10 @@ quantize_ord_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf, + JDIMENSION col; + JDIMENSION width = cinfo->output_width; + ++ if (output_buf == NULL && num_rows) { ++ ERREXIT(cinfo, JERR_BAD_PARAM); ++ } ++ + for (row = 0; row < num_rows; row++) { + /* Initialize output values to 0 so can process components separately */ + jzero_far((void *) output_buf[row], (size_t) (width * sizeof(JSAMPLE))); +Index: libjpeg-turbo-1.5.2/jerror.h +=================================================================== +--- libjpeg-turbo-1.5.2.orig/jerror.h 2017-07-07 22:31:10.000000000 +0200 ++++ libjpeg-turbo-1.5.2/jerror.h 2017-10-12 13:24:01.349954012 +0200 +@@ -208,6 +208,7 @@ JMESSAGE(JERR_NO_ARITH_TABLE, "Arithmeti + JMESSAGE(JWRN_ARITH_BAD_CODE, "Corrupt JPEG data: bad arithmetic code") + #endif + #endif ++JMESSAGE(JERR_BAD_PARAM, "Bogus parameter") + + #ifdef JMAKE_ENUM_LIST + diff --git a/libjpeg-turbo.spec b/libjpeg-turbo.spec index a9c528a..e2d9ff5 100644 --- a/libjpeg-turbo.spec +++ b/libjpeg-turbo.spec @@ -1,20 +1,20 @@ -%define major 8 -%define libname %mklibname jpeg %{major} -%define devname %mklibname jpeg -d -%define sdevname %mklibname jpeg -d -s +%define major 8 +%define libname %mklibname jpeg %{major} +%define devname %mklibname jpeg -d +%define sdevname %mklibname jpeg -d -s -%define majorturbo 0 -%define libturbo %mklibname turbojpeg %{majorturbo} +%define majorturbo 0 +%define libturbo %mklibname turbojpeg %{majorturbo} -%define major62 62 -%define libname62 %mklibname jpeg %{major62} +%define major62 62 +%define libname62 %mklibname jpeg %{major62} Summary: A MMX/SSE2 accelerated library for manipulating JPEG 
image files Name: libjpeg-turbo Epoch: 1 -Version: 1.5.0 -Release: 3 -License: wxWidgets Library License +Version: 1.5.3 +Release: 1 +License: wxWindows Library License Group: System/Libraries Url: http://www.libjpeg-turbo.org Source0: https://sourceforge.net/projects/libjpeg-turbo/files/%{version}/%{name}-%{version}.tar.gz @@ -25,6 +25,7 @@ Source0: https://sourceforge.net/projects/libjpeg-turbo/files/%{version}/%{name} Source2: http://jpegclub.org/jpegexiforient.c Source3: http://jpegclub.org/exifautotran.txt Patch0: jpeg-6b-c++fixes.patch +Patch1: %{name}-1.5.2-CVE-2017-15232.patch BuildRequires: libtool >= 1.4 %ifarch %{ix86} x86_64 BuildRequires: nasm @@ -62,6 +63,7 @@ This package contains the library needed to run programs dynamically linked with libjpeg. %files -n %{libname62} +%doc LICENSE.md %{_libdir}/libjpeg.so.%{major62}* #---------------------------------------------------------------------------- @@ -75,6 +77,7 @@ This package contains the library needed to run programs dynamically linked with libturbojpeg. %files -n %{libturbo} +%doc LICENSE.md %{_libdir}/libturbojpeg.so.%{majorturbo}* #---------------------------------------------------------------------------- @@ -120,6 +123,7 @@ for developing programs which will manipulate JPEG files using the libjpeg library. %files -n %{sdevname} +%doc LICENSE.md %{_libdir}/libjpeg.a %{_libdir}/libturbojpeg.a @@ -154,10 +158,12 @@ have orientation markings in the EXIF data. %prep %setup -q %patch0 -p0 +%patch1 -p1 cp %{SOURCE2} jpegexiforient.c cp %{SOURCE3} exifautotran + %build CONFIGURE_TOP="$PWD" @@ -182,6 +188,7 @@ popd %__cc %{optflags} %{ldflags} -o jpegexiforient jpegexiforient.c + %install make install-libLTLIBRARIES DESTDIR=%{buildroot} -C jpeg62 %makeinstall_std -C jpeg8 From d96ac9ad59604eba10ab63900fd05550cd0a9d9f Mon Sep 17 00:00:00 2001 From: Giovanni Mariani Date: Wed, 1 Aug 2018 16:27:39 +0200 Subject: [PATCH 02/32] Updated to release 2.0.0, rediffed P1 and added P2 to fix library path, fixed file lists --- .abf.yml | 2 +- libjpeg-turbo-1.5.2-CVE-2017-15232.patch | 53 ------------------------ libjpeg-turbo-2.0.0-CVE-2017-15232.patch | 40 ++++++++++++++++++ libjpeg-turbo-2.0.0-fix-lib-path.patch | 12 ++++++ libjpeg-turbo.spec | 35 +++++++++------- 5 files changed, 73 insertions(+), 69 deletions(-) delete mode 100644 libjpeg-turbo-1.5.2-CVE-2017-15232.patch create mode 100644 libjpeg-turbo-2.0.0-CVE-2017-15232.patch create mode 100644 libjpeg-turbo-2.0.0-fix-lib-path.patch diff --git a/.abf.yml b/.abf.yml index ce19d84..51872f1 100644 --- a/.abf.yml +++ b/.abf.yml @@ -1,2 +1,2 @@ sources: - libjpeg-turbo-1.5.3.tar.gz: 87ebf4cab2bb27fcb8e7ccb18ec4eb680e1f2c2d + libjpeg-turbo-2.0.0.tar.gz: fe49aea935617748c21ecbe46c986d6c1b98f39b diff --git a/libjpeg-turbo-1.5.2-CVE-2017-15232.patch b/libjpeg-turbo-1.5.2-CVE-2017-15232.patch deleted file mode 100644 index f67b1bf..0000000 --- a/libjpeg-turbo-1.5.2-CVE-2017-15232.patch +++ /dev/null @@ -1,53 +0,0 @@ -From 1ecd9a5729d78518397889a630e3534bd9d963a8 Mon Sep 17 00:00:00 2001 -From: Kornel -Date: Sat, 30 Sep 2017 12:05:53 +0100 -Subject: [PATCH] Handle NULL buffer when discarding rows - ---- - jdpostct.c | 5 +++++ - jquant1.c | 4 ++++ - 2 files changed, 9 insertions(+) - -diff --git a/jdpostct.c b/jdpostct.c -index 601fc2a79..a24202ca9 100644 ---- a/jdpostct.c -+++ b/jdpostct.c -@@ -132,6 +132,11 @@ post_process_1pass (j_decompress_ptr cinfo, - my_post_ptr post = (my_post_ptr) cinfo->post; - JDIMENSION num_rows, max_rows; - -+ /* read_and_discard_scanlines may call it with rows 
"available", but no buffer */ -+ if (output_buf == NULL) { -+ return; -+ } -+ - /* Fill the buffer, but not more than what we can dump out in one go. */ - /* Note we rely on the upsampler to detect bottom of image. */ - max_rows = out_rows_avail - *out_row_ctr; -diff --git a/jquant1.c b/jquant1.c -index e7814815e..ba2ea9b80 100644 ---- a/jquant1.c -+++ b/jquant1.c -@@ -531,6 +531,10 @@ quantize_ord_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf, - JDIMENSION col; - JDIMENSION width = cinfo->output_width; - -+ if (output_buf == NULL && num_rows) { -+ ERREXIT(cinfo, JERR_BAD_PARAM); -+ } -+ - for (row = 0; row < num_rows; row++) { - /* Initialize output values to 0 so can process components separately */ - jzero_far((void *) output_buf[row], (size_t) (width * sizeof(JSAMPLE))); -Index: libjpeg-turbo-1.5.2/jerror.h -=================================================================== ---- libjpeg-turbo-1.5.2.orig/jerror.h 2017-07-07 22:31:10.000000000 +0200 -+++ libjpeg-turbo-1.5.2/jerror.h 2017-10-12 13:24:01.349954012 +0200 -@@ -208,6 +208,7 @@ JMESSAGE(JERR_NO_ARITH_TABLE, "Arithmeti - JMESSAGE(JWRN_ARITH_BAD_CODE, "Corrupt JPEG data: bad arithmetic code") - #endif - #endif -+JMESSAGE(JERR_BAD_PARAM, "Bogus parameter") - - #ifdef JMAKE_ENUM_LIST - diff --git a/libjpeg-turbo-2.0.0-CVE-2017-15232.patch b/libjpeg-turbo-2.0.0-CVE-2017-15232.patch new file mode 100644 index 0000000..22cb61b --- /dev/null +++ b/libjpeg-turbo-2.0.0-CVE-2017-15232.patch @@ -0,0 +1,40 @@ +diff -rupN libjpeg-turbo-2.0.0.old/jdpostct.c libjpeg-turbo-2.0.0/jdpostct.c +--- libjpeg-turbo-2.0.0.old/jdpostct.c 2018-07-27 18:47:48.000000000 +0200 ++++ libjpeg-turbo-2.0.0/jdpostct.c 2018-08-01 16:10:01.647692751 +0200 +@@ -137,6 +137,11 @@ post_process_1pass(j_decompress_ptr cinf + my_post_ptr post = (my_post_ptr)cinfo->post; + JDIMENSION num_rows, max_rows; + ++ /* read_and_discard_scanlines may call it with rows "available", but no buffer */ ++ if (output_buf == NULL) { ++ return; ++ } ++ + /* Fill the buffer, but not more than what we can dump out in one go. */ + /* Note we rely on the upsampler to detect bottom of image. 
*/ + max_rows = out_rows_avail - *out_row_ctr; +diff -rupN libjpeg-turbo-2.0.0.old/jerror.h libjpeg-turbo-2.0.0/jerror.h +--- libjpeg-turbo-2.0.0.old/jerror.h 2018-07-27 18:47:48.000000000 +0200 ++++ libjpeg-turbo-2.0.0/jerror.h 2018-08-01 16:11:31.554054434 +0200 +@@ -207,6 +207,7 @@ JMESSAGE(JWRN_ARITH_BAD_CODE, "Corrupt J + #endif + #endif + JMESSAGE(JWRN_BOGUS_ICC, "Corrupt JPEG data: bad ICC marker") ++JMESSAGE(JERR_BAD_PARAM, "Bogus parameter") + + #ifdef JMAKE_ENUM_LIST + +diff -rupN libjpeg-turbo-2.0.0.old/jquant1.c libjpeg-turbo-2.0.0/jquant1.c +--- libjpeg-turbo-2.0.0.old/jquant1.c 2018-07-27 18:47:48.000000000 +0200 ++++ libjpeg-turbo-2.0.0/jquant1.c 2018-08-01 16:12:44.437913930 +0200 +@@ -532,6 +532,10 @@ quantize_ord_dither(j_decompress_ptr cin + JDIMENSION col; + JDIMENSION width = cinfo->output_width; + ++ if (output_buf == NULL && num_rows) { ++ ERREXIT(cinfo, JERR_BAD_PARAM); ++ } ++ + for (row = 0; row < num_rows; row++) { + /* Initialize output values to 0 so can process components separately */ + jzero_far((void *)output_buf[row], (size_t)(width * sizeof(JSAMPLE))); diff --git a/libjpeg-turbo-2.0.0-fix-lib-path.patch b/libjpeg-turbo-2.0.0-fix-lib-path.patch new file mode 100644 index 0000000..d0ba187 --- /dev/null +++ b/libjpeg-turbo-2.0.0-fix-lib-path.patch @@ -0,0 +1,12 @@ +diff -rupN libjpeg-turbo-2.0.0.old/CMakeLists.txt libjpeg-turbo-2.0.0/CMakeLists.txt +--- libjpeg-turbo-2.0.0.old/CMakeLists.txt 2018-07-27 18:47:48.000000000 +0200 ++++ libjpeg-turbo-2.0.0/CMakeLists.txt 2018-08-01 16:24:47.625312813 +0200 +@@ -102,7 +102,7 @@ if(CMAKE_INSTALL_PREFIX STREQUAL "${CMAK + if(BITS EQUAL 64) + set(CMAKE_INSTALL_DEFAULT_LIBDIR "lib64") + else() +- set(CMAKE_INSTALL_DEFAULT_LIBDIR "lib32") ++ set(CMAKE_INSTALL_DEFAULT_LIBDIR "lib") + endif() + endif() + endif() diff --git a/libjpeg-turbo.spec b/libjpeg-turbo.spec index e2d9ff5..9e12857 100644 --- a/libjpeg-turbo.spec +++ b/libjpeg-turbo.spec @@ -12,7 +12,7 @@ Summary: A MMX/SSE2 accelerated library for manipulating JPEG image files Name: libjpeg-turbo Epoch: 1 -Version: 1.5.3 +Version: 2.0.0 Release: 1 License: wxWindows Library License Group: System/Libraries @@ -25,7 +25,9 @@ Source0: https://sourceforge.net/projects/libjpeg-turbo/files/%{version}/%{name} Source2: http://jpegclub.org/jpegexiforient.c Source3: http://jpegclub.org/exifautotran.txt Patch0: jpeg-6b-c++fixes.patch -Patch1: %{name}-1.5.2-CVE-2017-15232.patch +Patch1: %{name}-2.0.0-CVE-2017-15232.patch +Patch2: %{name}-2.0.0-fix-lib-path.patch +BuildRequires: cmake BuildRequires: libtool >= 1.4 %ifarch %{ix86} x86_64 BuildRequires: nasm @@ -99,7 +101,7 @@ developing programs which will manipulate JPEG files using the libjpeg library. %files -n %{devname} -%doc coderules.txt example.c jconfig.txt libjpeg.txt structure.txt +%doc coderules.txt jconfig.txt libjpeg.txt structure.txt %{_libdir}/libjpeg.so %{_libdir}/libturbojpeg.so %{_includedir}/*.h @@ -159,39 +161,42 @@ have orientation markings in the EXIF data. %setup -q %patch0 -p0 %patch1 -p1 +%patch2 -p1 cp %{SOURCE2} jpegexiforient.c cp %{SOURCE3} exifautotran %build -CONFIGURE_TOP="$PWD" - +# Prepare build dirs mkdir -p jpeg8 +mkdir -p jpeg62 + +# Build jpeg v8 API pushd jpeg8 CFLAGS="%{optflags} -Ofast -funroll-loops" \ -%configure2_5x \ - --enable-shared \ - --enable-static \ - --with-jpeg8 +%cmake ../.. -DWITH_JPEG8="True" + %make popd -mkdir -p jpeg62 +# Build jpeg v6.2 API pushd jpeg62 CFLAGS="%{optflags} -Ofast -funroll-loops" \ -%configure2_5x \ - --enable-shared \ - --disable-static +%cmake ../.. 
\ + -DWITH_ARITH_DEC="True" \ + -DWITH_ARITH_ENC="True" + %make popd +# Build jpegexiforient binary %__cc %{optflags} %{ldflags} -o jpegexiforient jpegexiforient.c %install -make install-libLTLIBRARIES DESTDIR=%{buildroot} -C jpeg62 -%makeinstall_std -C jpeg8 +%makeinstall_std -C jpeg8/build +%makeinstall_std -C jpeg62/build install -m755 jpegexiforient -D %{buildroot}%{_bindir}/jpegexiforient install -m755 exifautotran -D %{buildroot}%{_bindir}/exifautotran From 9b6ee512d6232b9ea08f6210fec8240984a854b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BB=D0=B5=D0=BA=D1=81=D0=B5=D0=B9-=D0=97?= Date: Sun, 12 Aug 2018 11:57:01 +0300 Subject: [PATCH 03/32] Added requires %{libname62} in %{devname} --- libjpeg-turbo.spec | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libjpeg-turbo.spec b/libjpeg-turbo.spec index 9e12857..9a3e9cc 100644 --- a/libjpeg-turbo.spec +++ b/libjpeg-turbo.spec @@ -13,7 +13,7 @@ Summary: A MMX/SSE2 accelerated library for manipulating JPEG image files Name: libjpeg-turbo Epoch: 1 Version: 2.0.0 -Release: 1 +Release: 2 License: wxWindows Library License Group: System/Libraries Url: http://www.libjpeg-turbo.org @@ -88,12 +88,12 @@ with libturbojpeg. Summary: Development tools for programs which will use the libjpeg library Group: Development/C Requires: %{libname} = %{EVRD} +Requires: %{libname62} = %{EVRD} Requires: %{libturbo} = %{EVRD} Provides: jpeg-devel = %{EVRD} Conflicts: jpeg6-devel Conflicts: %{_lib}turbojpeg < 1:1.3.0 Obsoletes: %{_lib}turbojpeg < 1:1.3.0 -Obsoletes: %{mklibname jpeg 62 -d} < 6b-45 %description -n %{devname} The libjpeg-turbo devel package includes the header files necessary for From 10ceb7e23e0e8acc7c970dcb96b498ce1d4adc85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BB=D0=B7=D0=B8=D0=BC?= Date: Tue, 13 Nov 2018 00:45:03 +0300 Subject: [PATCH 04/32] Updated to 2.0.1 --- .abf.yml | 2 +- libjpeg-turbo.spec | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.abf.yml b/.abf.yml index 51872f1..ebd867b 100644 --- a/.abf.yml +++ b/.abf.yml @@ -1,2 +1,2 @@ sources: - libjpeg-turbo-2.0.0.tar.gz: fe49aea935617748c21ecbe46c986d6c1b98f39b + libjpeg-turbo-2.0.1.tar.gz: 7ea4a288bccbb5a2d5bfad5fb328d4a839853f4e diff --git a/libjpeg-turbo.spec b/libjpeg-turbo.spec index 9a3e9cc..1224f2d 100644 --- a/libjpeg-turbo.spec +++ b/libjpeg-turbo.spec @@ -1,7 +1,7 @@ -%define major 8 -%define libname %mklibname jpeg %{major} -%define devname %mklibname jpeg -d -%define sdevname %mklibname jpeg -d -s +%define major 8 +%define libname %mklibname jpeg %{major} +%define devname %mklibname jpeg -d +%define sdevname %mklibname jpeg -d -s %define majorturbo 0 %define libturbo %mklibname turbojpeg %{majorturbo} @@ -12,8 +12,8 @@ Summary: A MMX/SSE2 accelerated library for manipulating JPEG image files Name: libjpeg-turbo Epoch: 1 -Version: 2.0.0 -Release: 2 +Version: 2.0.1 +Release: 1 License: wxWindows Library License Group: System/Libraries Url: http://www.libjpeg-turbo.org @@ -177,7 +177,7 @@ pushd jpeg8 CFLAGS="%{optflags} -Ofast -funroll-loops" \ %cmake ../.. 
-DWITH_JPEG8="True" -%make +%make -s popd # Build jpeg v6.2 API @@ -187,7 +187,7 @@ CFLAGS="%{optflags} -Ofast -funroll-loops" \ -DWITH_ARITH_DEC="True" \ -DWITH_ARITH_ENC="True" -%make +%make -s popd # Build jpegexiforient binary From 79a3e80f5ad4030b12293a0e0cc0854f5622aedf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BB=D0=B7=D0=B8=D0=BC?= Date: Thu, 14 Feb 2019 13:20:22 +0300 Subject: [PATCH 05/32] Updated to 2.0.2 --- .abf.yml | 4 +++- libjpeg-turbo.spec | 6 +++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/.abf.yml b/.abf.yml index ebd867b..55b3dbf 100644 --- a/.abf.yml +++ b/.abf.yml @@ -1,2 +1,4 @@ -sources: +removed_sources: libjpeg-turbo-2.0.1.tar.gz: 7ea4a288bccbb5a2d5bfad5fb328d4a839853f4e +sources: + libjpeg-turbo-2.0.2.tar.gz: 1cff52d50b81755d0bdcf9055eb22157f39a1695 diff --git a/libjpeg-turbo.spec b/libjpeg-turbo.spec index 1224f2d..cc00bb8 100644 --- a/libjpeg-turbo.spec +++ b/libjpeg-turbo.spec @@ -12,7 +12,7 @@ Summary: A MMX/SSE2 accelerated library for manipulating JPEG image files Name: libjpeg-turbo Epoch: 1 -Version: 2.0.1 +Version: 2.0.2 Release: 1 License: wxWindows Library License Group: System/Libraries @@ -184,8 +184,8 @@ popd pushd jpeg62 CFLAGS="%{optflags} -Ofast -funroll-loops" \ %cmake ../.. \ - -DWITH_ARITH_DEC="True" \ - -DWITH_ARITH_ENC="True" + -DWITH_ARITH_DEC="True" \ + -DWITH_ARITH_ENC="True" %make -s popd From 2c3dd24fe0bc4a0bc8d5d08f1e6fd9513e0ee885 Mon Sep 17 00:00:00 2001 From: Andrey Bondrov Date: Thu, 4 Apr 2019 17:15:44 +1000 Subject: [PATCH 06/32] Bump release --- libjpeg-turbo.spec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libjpeg-turbo.spec b/libjpeg-turbo.spec index cc00bb8..95de94c 100644 --- a/libjpeg-turbo.spec +++ b/libjpeg-turbo.spec @@ -13,7 +13,7 @@ Summary: A MMX/SSE2 accelerated library for manipulating JPEG image files Name: libjpeg-turbo Epoch: 1 Version: 2.0.2 -Release: 1 +Release: 2 License: wxWindows Library License Group: System/Libraries Url: http://www.libjpeg-turbo.org From fe3e4ab0bc9ebac814db1af7305660a1ee4d5172 Mon Sep 17 00:00:00 2001 From: Andrey Bondrov Date: Thu, 4 Apr 2019 20:56:28 +1000 Subject: [PATCH 07/32] Bump release --- libjpeg-turbo.spec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libjpeg-turbo.spec b/libjpeg-turbo.spec index 95de94c..d5163de 100644 --- a/libjpeg-turbo.spec +++ b/libjpeg-turbo.spec @@ -13,7 +13,7 @@ Summary: A MMX/SSE2 accelerated library for manipulating JPEG image files Name: libjpeg-turbo Epoch: 1 Version: 2.0.2 -Release: 2 +Release: 3 License: wxWindows Library License Group: System/Libraries Url: http://www.libjpeg-turbo.org From efdc76f37fa713f3a757e163419e2a747a672283 Mon Sep 17 00:00:00 2001 From: Mikhail Novosyolov Date: Sat, 6 Apr 2019 23:53:10 +0300 Subject: [PATCH 08/32] Rebuild with rpm-5.4.10-86 where pkgconfig private dependencies are parsed (RB#9569) (rebuilding dependency chain of qt4) --- libjpeg-turbo.spec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libjpeg-turbo.spec b/libjpeg-turbo.spec index d5163de..90717e9 100644 --- a/libjpeg-turbo.spec +++ b/libjpeg-turbo.spec @@ -13,7 +13,7 @@ Summary: A MMX/SSE2 accelerated library for manipulating JPEG image files Name: libjpeg-turbo Epoch: 1 Version: 2.0.2 -Release: 3 +Release: 4 License: wxWindows Library License Group: System/Libraries Url: http://www.libjpeg-turbo.org From ee29827a05f3429c5e071398fae27458761c6e98 Mon Sep 17 00:00:00 2001 From: Date: Sun, 7 Apr 2019 10:28:09 +0000 Subject: [PATCH 09/32] MassBuild#1666: Increase 
release tag --- libjpeg-turbo.spec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libjpeg-turbo.spec b/libjpeg-turbo.spec index 90717e9..567ff2d 100644 --- a/libjpeg-turbo.spec +++ b/libjpeg-turbo.spec @@ -13,7 +13,7 @@ Summary: A MMX/SSE2 accelerated library for manipulating JPEG image files Name: libjpeg-turbo Epoch: 1 Version: 2.0.2 -Release: 4 +Release: 5 License: wxWindows Library License Group: System/Libraries Url: http://www.libjpeg-turbo.org From daad40f0943c37257eb4b5a47660b815b25ffccf Mon Sep 17 00:00:00 2001 From: Andrey Bondrov Date: Sun, 14 Apr 2019 01:46:46 +0000 Subject: [PATCH 10/32] MassBuild#1671: Increase release tag --- libjpeg-turbo.spec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libjpeg-turbo.spec b/libjpeg-turbo.spec index 567ff2d..1a09bed 100644 --- a/libjpeg-turbo.spec +++ b/libjpeg-turbo.spec @@ -13,7 +13,7 @@ Summary: A MMX/SSE2 accelerated library for manipulating JPEG image files Name: libjpeg-turbo Epoch: 1 Version: 2.0.2 -Release: 5 +Release: 6 License: wxWindows Library License Group: System/Libraries Url: http://www.libjpeg-turbo.org From 06a4c164ab9d6a354d85303b0d9994b0cbd11d46 Mon Sep 17 00:00:00 2001 From: Date: Sat, 6 Jul 2019 18:20:51 +0000 Subject: [PATCH 11/32] MassBuild#1715: Increase release tag --- libjpeg-turbo.spec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libjpeg-turbo.spec b/libjpeg-turbo.spec index 1a09bed..19a5028 100644 --- a/libjpeg-turbo.spec +++ b/libjpeg-turbo.spec @@ -13,7 +13,7 @@ Summary: A MMX/SSE2 accelerated library for manipulating JPEG image files Name: libjpeg-turbo Epoch: 1 Version: 2.0.2 -Release: 6 +Release: 7 License: wxWindows Library License Group: System/Libraries Url: http://www.libjpeg-turbo.org From b5784d7862cb268d45236ba747543283b0cbbeeb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BB=D0=B7=D0=B8=D0=BC?= Date: Mon, 9 Sep 2019 22:32:08 +0300 Subject: [PATCH 12/32] Updated to 2.0.3 --- .abf.yml | 4 +--- libjpeg-turbo.spec | 4 ++-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/.abf.yml b/.abf.yml index 55b3dbf..b87805f 100644 --- a/.abf.yml +++ b/.abf.yml @@ -1,4 +1,2 @@ -removed_sources: - libjpeg-turbo-2.0.1.tar.gz: 7ea4a288bccbb5a2d5bfad5fb328d4a839853f4e sources: - libjpeg-turbo-2.0.2.tar.gz: 1cff52d50b81755d0bdcf9055eb22157f39a1695 + libjpeg-turbo-2.0.3.tar.gz: 539363a444f92421c098a1a3e7cebfda48d4cfb3 diff --git a/libjpeg-turbo.spec b/libjpeg-turbo.spec index 19a5028..a22e3ab 100644 --- a/libjpeg-turbo.spec +++ b/libjpeg-turbo.spec @@ -12,8 +12,8 @@ Summary: A MMX/SSE2 accelerated library for manipulating JPEG image files Name: libjpeg-turbo Epoch: 1 -Version: 2.0.2 -Release: 7 +Version: 2.0.3 +Release: 1 License: wxWindows Library License Group: System/Libraries Url: http://www.libjpeg-turbo.org From 68b40169300fb55b3808714f69926cc10ea02162 Mon Sep 17 00:00:00 2001 From: Andrey Bondrov Date: Thu, 14 Nov 2019 16:18:23 +1000 Subject: [PATCH 13/32] Add patch to fix CVE-2019-2201 --- libjpeg-turbo-2.0.3-CVE-2019-2201.patch | 54 +++++++++++++++++++++++++ libjpeg-turbo.spec | 8 ++-- 2 files changed, 59 insertions(+), 3 deletions(-) create mode 100644 libjpeg-turbo-2.0.3-CVE-2019-2201.patch diff --git a/libjpeg-turbo-2.0.3-CVE-2019-2201.patch b/libjpeg-turbo-2.0.3-CVE-2019-2201.patch new file mode 100644 index 0000000..78483a7 --- /dev/null +++ b/libjpeg-turbo-2.0.3-CVE-2019-2201.patch @@ -0,0 +1,54 @@ +From 6548377e7ed7653e555dac0b9faa4152b00ecbaf Mon Sep 17 00:00:00 2001 +From: DRC +Date: Tue, 12 Nov 2019 12:11:54 -0800 +Subject: 
[PATCH] 64-bit tjbench: Fix signed int overflow/segfault + +... that occurred when attempting to decompress images with more than +715827882 (2048*1024*1024 / 3) pixels. + +Fixes #388 +--- + ChangeLog.md | 4 ++++ + tjbench.c | 4 ++-- + 2 files changed, 6 insertions(+), 2 deletions(-) + +diff --git a/ChangeLog.md b/ChangeLog.md +index f5fe44b..4e501c5 100644 +--- a/ChangeLog.md ++++ b/ChangeLog.md +@@ -41,6 +41,10 @@ end of a single-scan (non-progressive) image, subsequent calls to + JPEG images that were compressed with a sampling factor other than 1 (for + instance, with `cjpeg -grayscale -sample 2x2`). + ++2. Fixed a signed integer overflow and subsequent segfault that occurred when ++attempting to decompress images with more than 715827882 pixels using the ++64-bit C version of TJBench. ++ + + 1.5.2 + ===== +diff --git a/tjbench.c b/tjbench.c +index 76b61cd..393c014 100644 +--- a/tjbench.c ++++ b/tjbench.c +@@ -133,7 +133,7 @@ int decomp(unsigned char *srcbuf, unsigned char **jpegbuf, + } + /* Set the destination buffer to gray so we know whether the decompressor + attempted to write to it */ +- memset(dstbuf, 127, pitch*scaledh); ++ memset(dstbuf, 127, (size_t)pitch * scaledh); + + if(doyuv) + { +@@ -152,7 +152,7 @@ int decomp(unsigned char *srcbuf, unsigned char **jpegbuf, + { + int tile=0; + double start=gettime(); +- for(row=0, dstptr=dstbuf; row= 1.4 %ifarch %{ix86} x86_64 @@ -162,6 +163,7 @@ have orientation markings in the EXIF data. %patch0 -p0 %patch1 -p1 %patch2 -p1 +%patch3 -p1 cp %{SOURCE2} jpegexiforient.c cp %{SOURCE3} exifautotran From a6178c6cc27cdc59b50d784c8693b2cafd775651 Mon Sep 17 00:00:00 2001 From: Andrey Bondrov Date: Thu, 14 Nov 2019 16:30:23 +1000 Subject: [PATCH 14/32] Use fix for CVE-2019-2201 for this version, not for 1.5.3 --- libjpeg-turbo-2.0.3-CVE-2019-2201.patch | 53 ++++++++++++------------- 1 file changed, 25 insertions(+), 28 deletions(-) diff --git a/libjpeg-turbo-2.0.3-CVE-2019-2201.patch b/libjpeg-turbo-2.0.3-CVE-2019-2201.patch index 78483a7..6cc70b7 100644 --- a/libjpeg-turbo-2.0.3-CVE-2019-2201.patch +++ b/libjpeg-turbo-2.0.3-CVE-2019-2201.patch @@ -1,6 +1,6 @@ -From 6548377e7ed7653e555dac0b9faa4152b00ecbaf Mon Sep 17 00:00:00 2001 +From c30b1e72dac76343ef9029833d1561de07d29bad Mon Sep 17 00:00:00 2001 From: DRC -Date: Tue, 12 Nov 2019 12:11:54 -0800 +Date: Tue, 12 Nov 2019 12:27:22 -0600 Subject: [PATCH] 64-bit tjbench: Fix signed int overflow/segfault ... that occurred when attempting to decompress images with more than @@ -13,42 +13,39 @@ Fixes #388 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/ChangeLog.md b/ChangeLog.md -index f5fe44b..4e501c5 100644 +index 4d6960f3d..da160f0f1 100644 --- a/ChangeLog.md +++ b/ChangeLog.md -@@ -41,6 +41,10 @@ end of a single-scan (non-progressive) image, subsequent calls to - JPEG images that were compressed with a sampling factor other than 1 (for - instance, with `cjpeg -grayscale -sample 2x2`). +@@ -8,6 +8,10 @@ + 64-bit libjpeg-turbo SDK for Visual C++ were installed on the same system, only + one of them could be uninstalled. +2. Fixed a signed integer overflow and subsequent segfault that occurred when +attempting to decompress images with more than 715827882 pixels using the +64-bit C version of TJBench. 
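
Annotation, not part of either patch: the 715827882 threshold follows directly from 32-bit arithmetic. With 3-byte RGB pixels, `pitch * scaledh` is 3 * width * height evaluated in a signed int, and INT_MAX / 3 = 715827882. A minimal sketch of the overflow the `(size_t)` casts avoid, with made-up image dimensions:

#include <limits.h>
#include <stddef.h>
#include <stdio.h>

int main(void)
{
  /* 26755 * 26755 = 715830025 pixels, just over INT_MAX / 3 */
  int pitch = 3 * 26755, scaledh = 26755;
  size_t bad  = pitch * scaledh;          /* product computed in 32-bit int:
                                             signed overflow (undefined
                                             behavior; wraps in practice) */
  size_t good = (size_t)pitch * scaledh;  /* operand widened first, so the
                                             multiply happens in 64-bit */
  printf("%zu vs %zu\n", bad, good);
  return 0;
}
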
+ - 1.5.2 + 2.0.3 ===== diff --git a/tjbench.c b/tjbench.c -index 76b61cd..393c014 100644 +index a7d397318..13a5bde62 100644 --- a/tjbench.c +++ b/tjbench.c -@@ -133,7 +133,7 @@ int decomp(unsigned char *srcbuf, unsigned char **jpegbuf, - } - /* Set the destination buffer to gray so we know whether the decompressor - attempted to write to it */ -- memset(dstbuf, 127, pitch*scaledh); -+ memset(dstbuf, 127, (size_t)pitch * scaledh); +@@ -171,7 +171,7 @@ static int decomp(unsigned char *srcBuf, unsigned char **jpegBuf, + } + /* Set the destination buffer to gray so we know whether the decompressor + attempted to write to it */ +- memset(dstBuf, 127, pitch * scaledh); ++ memset(dstBuf, 127, (size_t)pitch * scaledh); - if(doyuv) - { -@@ -152,7 +152,7 @@ int decomp(unsigned char *srcbuf, unsigned char **jpegbuf, - { - int tile=0; - double start=gettime(); -- for(row=0, dstptr=dstbuf; row Date: Sat, 4 Jan 2020 04:18:41 +0300 Subject: [PATCH 15/32] bot: rpm5 -> rpm4 (1) --- libjpeg-turbo.spec | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/libjpeg-turbo.spec b/libjpeg-turbo.spec index 0da4f1d..88dbdee 100644 --- a/libjpeg-turbo.spec +++ b/libjpeg-turbo.spec @@ -179,7 +179,7 @@ pushd jpeg8 CFLAGS="%{optflags} -Ofast -funroll-loops" \ %cmake ../.. -DWITH_JPEG8="True" -%make -s +%make_build -s popd # Build jpeg v6.2 API @@ -189,7 +189,7 @@ CFLAGS="%{optflags} -Ofast -funroll-loops" \ -DWITH_ARITH_DEC="True" \ -DWITH_ARITH_ENC="True" -%make -s +%make_build -s popd # Build jpegexiforient binary @@ -197,8 +197,8 @@ popd %install -%makeinstall_std -C jpeg8/build -%makeinstall_std -C jpeg62/build +%make_install -C jpeg8/build +%make_install -C jpeg62/build install -m755 jpegexiforient -D %{buildroot}%{_bindir}/jpegexiforient install -m755 exifautotran -D %{buildroot}%{_bindir}/exifautotran From 1468df33daddd7b3e93fe656de3a7b86bc79f95a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BB=D0=B7=D0=B8=D0=BC?= Date: Tue, 7 Jan 2020 21:48:40 +0300 Subject: [PATCH 16/32] Updated to 2.0.4 --- .abf.yml | 2 +- libjpeg-turbo-2.0.3-CVE-2019-2201.patch | 51 ------------------------- libjpeg-turbo.spec | 6 +-- 3 files changed, 3 insertions(+), 56 deletions(-) delete mode 100644 libjpeg-turbo-2.0.3-CVE-2019-2201.patch diff --git a/.abf.yml b/.abf.yml index b87805f..67c52a0 100644 --- a/.abf.yml +++ b/.abf.yml @@ -1,2 +1,2 @@ sources: - libjpeg-turbo-2.0.3.tar.gz: 539363a444f92421c098a1a3e7cebfda48d4cfb3 + libjpeg-turbo-2.0.4.tar.gz: 163d8f96d0999526a117de0388624241b54dcd67 diff --git a/libjpeg-turbo-2.0.3-CVE-2019-2201.patch b/libjpeg-turbo-2.0.3-CVE-2019-2201.patch deleted file mode 100644 index 6cc70b7..0000000 --- a/libjpeg-turbo-2.0.3-CVE-2019-2201.patch +++ /dev/null @@ -1,51 +0,0 @@ -From c30b1e72dac76343ef9029833d1561de07d29bad Mon Sep 17 00:00:00 2001 -From: DRC -Date: Tue, 12 Nov 2019 12:27:22 -0600 -Subject: [PATCH] 64-bit tjbench: Fix signed int overflow/segfault - -... that occurred when attempting to decompress images with more than -715827882 (2048*1024*1024 / 3) pixels. - -Fixes #388 ---- - ChangeLog.md | 4 ++++ - tjbench.c | 4 ++-- - 2 files changed, 6 insertions(+), 2 deletions(-) - -diff --git a/ChangeLog.md b/ChangeLog.md -index 4d6960f3d..da160f0f1 100644 ---- a/ChangeLog.md -+++ b/ChangeLog.md -@@ -8,6 +8,10 @@ - 64-bit libjpeg-turbo SDK for Visual C++ were installed on the same system, only - one of them could be uninstalled. - -+2. 
Fixed a signed integer overflow and subsequent segfault that occurred when -+attempting to decompress images with more than 715827882 pixels using the -+64-bit C version of TJBench. -+ - - 2.0.3 - ===== -diff --git a/tjbench.c b/tjbench.c -index a7d397318..13a5bde62 100644 ---- a/tjbench.c -+++ b/tjbench.c -@@ -171,7 +171,7 @@ static int decomp(unsigned char *srcBuf, unsigned char **jpegBuf, - } - /* Set the destination buffer to gray so we know whether the decompressor - attempted to write to it */ -- memset(dstBuf, 127, pitch * scaledh); -+ memset(dstBuf, 127, (size_t)pitch * scaledh); - - if (doYUV) { - int width = doTile ? tilew : scaledw; -@@ -193,7 +193,7 @@ static int decomp(unsigned char *srcBuf, unsigned char **jpegBuf, - double start = getTime(); - - for (row = 0, dstPtr = dstBuf; row < ntilesh; -- row++, dstPtr += pitch * tileh) { -+ row++, dstPtr += (size_t)pitch * tileh) { - for (col = 0, dstPtr2 = dstPtr; col < ntilesw; - col++, tile++, dstPtr2 += ps * tilew) { - int width = doTile ? min(tilew, w - col * tilew) : scaledw; diff --git a/libjpeg-turbo.spec b/libjpeg-turbo.spec index 88dbdee..2818af0 100644 --- a/libjpeg-turbo.spec +++ b/libjpeg-turbo.spec @@ -12,8 +12,8 @@ Summary: A MMX/SSE2 accelerated library for manipulating JPEG image files Name: libjpeg-turbo Epoch: 1 -Version: 2.0.3 -Release: 2 +Version: 2.0.4 +Release: 1 License: wxWindows Library License Group: System/Libraries Url: http://www.libjpeg-turbo.org @@ -27,7 +27,6 @@ Source3: http://jpegclub.org/exifautotran.txt Patch0: jpeg-6b-c++fixes.patch Patch1: libjpeg-turbo-2.0.0-CVE-2017-15232.patch Patch2: libjpeg-turbo-2.0.0-fix-lib-path.patch -Patch3: libjpeg-turbo-2.0.3-CVE-2019-2201.patch BuildRequires: cmake BuildRequires: libtool >= 1.4 %ifarch %{ix86} x86_64 @@ -163,7 +162,6 @@ have orientation markings in the EXIF data. %patch0 -p0 %patch1 -p1 %patch2 -p1 -%patch3 -p1 cp %{SOURCE2} jpegexiforient.c cp %{SOURCE3} exifautotran From 7196447d0787551b13098c5ad51a3c66fc78af6e Mon Sep 17 00:00:00 2001 From: NixTux Commit Bot Date: Mon, 13 Jan 2020 15:46:22 +0300 Subject: [PATCH 17/32] bot: rpm5 -> rpm4 (8) --- libjpeg-turbo.spec | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/libjpeg-turbo.spec b/libjpeg-turbo.spec index 2818af0..872cce0 100644 --- a/libjpeg-turbo.spec +++ b/libjpeg-turbo.spec @@ -177,7 +177,7 @@ pushd jpeg8 CFLAGS="%{optflags} -Ofast -funroll-loops" \ %cmake ../.. 
-DWITH_JPEG8="True" -%make_build -s +%make -s popd # Build jpeg v6.2 API @@ -187,7 +187,7 @@ CFLAGS="%{optflags} -Ofast -funroll-loops" \ -DWITH_ARITH_DEC="True" \ -DWITH_ARITH_ENC="True" -%make_build -s +%make -s popd # Build jpegexiforient binary @@ -195,8 +195,8 @@ popd %install -%make_install -C jpeg8/build -%make_install -C jpeg62/build +%makeinstall_std -C jpeg8/build +%makeinstall_std -C jpeg62/build install -m755 jpegexiforient -D %{buildroot}%{_bindir}/jpegexiforient install -m755 exifautotran -D %{buildroot}%{_bindir}/exifautotran From 19d61b646de522fbf20f74d4ce679fad6b5b4dbe Mon Sep 17 00:00:00 2001 From: Alzim Date: Wed, 24 Jun 2020 10:08:57 +0300 Subject: [PATCH 18/32] Updated to 2.0.5 --- .abf.yml | 2 +- libjpeg-turbo.spec | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.abf.yml b/.abf.yml index 67c52a0..952eb71 100644 --- a/.abf.yml +++ b/.abf.yml @@ -1,2 +1,2 @@ sources: - libjpeg-turbo-2.0.4.tar.gz: 163d8f96d0999526a117de0388624241b54dcd67 + libjpeg-turbo-2.0.5.tar.gz: 9d4c565d402b2f5661be78d76098073ec7e30f10 diff --git a/libjpeg-turbo.spec b/libjpeg-turbo.spec index 872cce0..d01e602 100644 --- a/libjpeg-turbo.spec +++ b/libjpeg-turbo.spec @@ -12,7 +12,7 @@ Summary: A MMX/SSE2 accelerated library for manipulating JPEG image files Name: libjpeg-turbo Epoch: 1 -Version: 2.0.4 +Version: 2.0.5 Release: 1 License: wxWindows Library License Group: System/Libraries From eaf934059314f191eb157e704b512f91f288d5e0 Mon Sep 17 00:00:00 2001 From: Alexander Stefanov Date: Wed, 18 Nov 2020 20:32:40 +0000 Subject: [PATCH 19/32] version autoupdate [2.0.6] --- .abf.yml | 2 +- libjpeg-turbo.spec | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.abf.yml b/.abf.yml index 952eb71..41506b4 100644 --- a/.abf.yml +++ b/.abf.yml @@ -1,2 +1,2 @@ sources: - libjpeg-turbo-2.0.5.tar.gz: 9d4c565d402b2f5661be78d76098073ec7e30f10 + libjpeg-turbo-2.0.6.tar.gz: 5406c7676d7df89fb4da791ad5af51202910fb25 diff --git a/libjpeg-turbo.spec b/libjpeg-turbo.spec index d01e602..9575536 100644 --- a/libjpeg-turbo.spec +++ b/libjpeg-turbo.spec @@ -12,7 +12,7 @@ Summary: A MMX/SSE2 accelerated library for manipulating JPEG image files Name: libjpeg-turbo Epoch: 1 -Version: 2.0.5 +Version: 2.0.6 Release: 1 License: wxWindows Library License Group: System/Libraries From c4ad17884483776110cb6ddfec22dbb48d1096ed Mon Sep 17 00:00:00 2001 From: Alexander Stefanov Date: Sun, 25 Apr 2021 15:15:23 +0000 Subject: [PATCH 20/32] 2.1.0 --- .abf.yml | 2 +- libjpeg-turbo-2.0.0-CVE-2017-15232.patch | 40 ------------------------ libjpeg-turbo-2.0.0-fix-lib-path.patch | 12 ------- libjpeg-turbo.spec | 7 ++--- 4 files changed, 3 insertions(+), 58 deletions(-) delete mode 100644 libjpeg-turbo-2.0.0-CVE-2017-15232.patch delete mode 100644 libjpeg-turbo-2.0.0-fix-lib-path.patch diff --git a/.abf.yml b/.abf.yml index 41506b4..6ecbf0c 100644 --- a/.abf.yml +++ b/.abf.yml @@ -1,2 +1,2 @@ sources: - libjpeg-turbo-2.0.6.tar.gz: 5406c7676d7df89fb4da791ad5af51202910fb25 + libjpeg-turbo-2.1.0.tar.gz: 6bf63c869105d341011cd4915816de888338231a diff --git a/libjpeg-turbo-2.0.0-CVE-2017-15232.patch b/libjpeg-turbo-2.0.0-CVE-2017-15232.patch deleted file mode 100644 index 22cb61b..0000000 --- a/libjpeg-turbo-2.0.0-CVE-2017-15232.patch +++ /dev/null @@ -1,40 +0,0 @@ -diff -rupN libjpeg-turbo-2.0.0.old/jdpostct.c libjpeg-turbo-2.0.0/jdpostct.c ---- libjpeg-turbo-2.0.0.old/jdpostct.c 2018-07-27 18:47:48.000000000 +0200 -+++ libjpeg-turbo-2.0.0/jdpostct.c 2018-08-01 16:10:01.647692751 +0200 -@@ -137,6 +137,11 @@ 
post_process_1pass(j_decompress_ptr cinf - my_post_ptr post = (my_post_ptr)cinfo->post; - JDIMENSION num_rows, max_rows; - -+ /* read_and_discard_scanlines may call it with rows "available", but no buffer */ -+ if (output_buf == NULL) { -+ return; -+ } -+ - /* Fill the buffer, but not more than what we can dump out in one go. */ - /* Note we rely on the upsampler to detect bottom of image. */ - max_rows = out_rows_avail - *out_row_ctr; -diff -rupN libjpeg-turbo-2.0.0.old/jerror.h libjpeg-turbo-2.0.0/jerror.h ---- libjpeg-turbo-2.0.0.old/jerror.h 2018-07-27 18:47:48.000000000 +0200 -+++ libjpeg-turbo-2.0.0/jerror.h 2018-08-01 16:11:31.554054434 +0200 -@@ -207,6 +207,7 @@ JMESSAGE(JWRN_ARITH_BAD_CODE, "Corrupt J - #endif - #endif - JMESSAGE(JWRN_BOGUS_ICC, "Corrupt JPEG data: bad ICC marker") -+JMESSAGE(JERR_BAD_PARAM, "Bogus parameter") - - #ifdef JMAKE_ENUM_LIST - -diff -rupN libjpeg-turbo-2.0.0.old/jquant1.c libjpeg-turbo-2.0.0/jquant1.c ---- libjpeg-turbo-2.0.0.old/jquant1.c 2018-07-27 18:47:48.000000000 +0200 -+++ libjpeg-turbo-2.0.0/jquant1.c 2018-08-01 16:12:44.437913930 +0200 -@@ -532,6 +532,10 @@ quantize_ord_dither(j_decompress_ptr cin - JDIMENSION col; - JDIMENSION width = cinfo->output_width; - -+ if (output_buf == NULL && num_rows) { -+ ERREXIT(cinfo, JERR_BAD_PARAM); -+ } -+ - for (row = 0; row < num_rows; row++) { - /* Initialize output values to 0 so can process components separately */ - jzero_far((void *)output_buf[row], (size_t)(width * sizeof(JSAMPLE))); diff --git a/libjpeg-turbo-2.0.0-fix-lib-path.patch b/libjpeg-turbo-2.0.0-fix-lib-path.patch deleted file mode 100644 index d0ba187..0000000 --- a/libjpeg-turbo-2.0.0-fix-lib-path.patch +++ /dev/null @@ -1,12 +0,0 @@ -diff -rupN libjpeg-turbo-2.0.0.old/CMakeLists.txt libjpeg-turbo-2.0.0/CMakeLists.txt ---- libjpeg-turbo-2.0.0.old/CMakeLists.txt 2018-07-27 18:47:48.000000000 +0200 -+++ libjpeg-turbo-2.0.0/CMakeLists.txt 2018-08-01 16:24:47.625312813 +0200 -@@ -102,7 +102,7 @@ if(CMAKE_INSTALL_PREFIX STREQUAL "${CMAK - if(BITS EQUAL 64) - set(CMAKE_INSTALL_DEFAULT_LIBDIR "lib64") - else() -- set(CMAKE_INSTALL_DEFAULT_LIBDIR "lib32") -+ set(CMAKE_INSTALL_DEFAULT_LIBDIR "lib") - endif() - endif() - endif() diff --git a/libjpeg-turbo.spec b/libjpeg-turbo.spec index 9575536..a4cc0fb 100644 --- a/libjpeg-turbo.spec +++ b/libjpeg-turbo.spec @@ -12,7 +12,7 @@ Summary: A MMX/SSE2 accelerated library for manipulating JPEG image files Name: libjpeg-turbo Epoch: 1 -Version: 2.0.6 +Version: 2.1.0 Release: 1 License: wxWindows Library License Group: System/Libraries @@ -25,8 +25,6 @@ Source0: https://sourceforge.net/projects/libjpeg-turbo/files/%{version}/%{name} Source2: http://jpegclub.org/jpegexiforient.c Source3: http://jpegclub.org/exifautotran.txt Patch0: jpeg-6b-c++fixes.patch -Patch1: libjpeg-turbo-2.0.0-CVE-2017-15232.patch -Patch2: libjpeg-turbo-2.0.0-fix-lib-path.patch BuildRequires: cmake BuildRequires: libtool >= 1.4 %ifarch %{ix86} x86_64 @@ -107,6 +105,7 @@ library. %{_includedir}/*.h %{_libdir}/pkgconfig/libjpeg.pc %{_libdir}/pkgconfig/libturbojpeg.pc +%{_libdir}/cmake/%{name}/*.cmake #---------------------------------------------------------------------------- @@ -160,8 +159,6 @@ have orientation markings in the EXIF data. 
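
Annotation: Patch1 (the CVE-2017-15232 backport) and Patch2 (the libdir fix) are dropped with the move to 2.1.0, presumably because upstream had absorbed equivalent fixes by then. For reference, the code path the CVE patch guarded is reachable as in the hedged sketch below; it assumes `cinfo` already has a source attached and `jpeg_read_header()` has been called, and `skip_top_rows` is an illustrative helper name, not distro or upstream code.

#include <stdio.h>
#include <jpeglib.h>

/* Sketch only: skipping rows with color quantization enabled routes
 * through read_and_discard_scanlines(), which hands the post-processor
 * output_buf == NULL, exactly the case the dropped patch checked for. */
void skip_top_rows(j_decompress_ptr cinfo, JDIMENSION rows)
{
  cinfo->quantize_colors = TRUE;     /* selects a quantizing post-processor */
  jpeg_start_decompress(cinfo);
  jpeg_skip_scanlines(cinfo, rows);  /* discards rows via a NULL buffer */
}
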
%prep %setup -q %patch0 -p0 -%patch1 -p1 -%patch2 -p1 cp %{SOURCE2} jpegexiforient.c cp %{SOURCE3} exifautotran From 894f13f78028318e23868e72182c06174f3543f5 Mon Sep 17 00:00:00 2001 From: Andrey Grigorev Date: Wed, 11 Aug 2021 12:07:42 +0000 Subject: [PATCH 21/32] MassBuild#2340: Increase release tag --- libjpeg-turbo.spec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libjpeg-turbo.spec b/libjpeg-turbo.spec index a4cc0fb..8544378 100644 --- a/libjpeg-turbo.spec +++ b/libjpeg-turbo.spec @@ -13,7 +13,7 @@ Summary: A MMX/SSE2 accelerated library for manipulating JPEG image files Name: libjpeg-turbo Epoch: 1 Version: 2.1.0 -Release: 1 +Release: 2 License: wxWindows Library License Group: System/Libraries Url: http://www.libjpeg-turbo.org From 872d3449185580d7c24c1c286572758bd57520d5 Mon Sep 17 00:00:00 2001 From: Alexander Stefanov Date: Wed, 25 May 2022 20:54:55 +0000 Subject: [PATCH 22/32] version autoupdate [2.1.3] --- .abf.yml | 2 +- libjpeg-turbo.spec | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.abf.yml b/.abf.yml index 6ecbf0c..37b6818 100644 --- a/.abf.yml +++ b/.abf.yml @@ -1,2 +1,2 @@ sources: - libjpeg-turbo-2.1.0.tar.gz: 6bf63c869105d341011cd4915816de888338231a + libjpeg-turbo-2.1.3.tar.gz: 6dec48193bb27e1c07abae8230031ce9ecb1cfec diff --git a/libjpeg-turbo.spec b/libjpeg-turbo.spec index 8544378..092dd05 100644 --- a/libjpeg-turbo.spec +++ b/libjpeg-turbo.spec @@ -12,8 +12,8 @@ Summary: A MMX/SSE2 accelerated library for manipulating JPEG image files Name: libjpeg-turbo Epoch: 1 -Version: 2.1.0 -Release: 2 +Version: 2.1.3 +Release: 1 License: wxWindows Library License Group: System/Libraries Url: http://www.libjpeg-turbo.org From 6f9e8675f02fd694751ec1542daf77851a80ac1e Mon Sep 17 00:00:00 2001 From: Aleksandr Proklov Date: Sat, 21 Jan 2023 04:51:14 +0300 Subject: [PATCH 23/32] Update version to 2.1.4 --- .abf.yml | 2 +- libjpeg-turbo.spec | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.abf.yml b/.abf.yml index 37b6818..471d715 100644 --- a/.abf.yml +++ b/.abf.yml @@ -1,2 +1,2 @@ sources: - libjpeg-turbo-2.1.3.tar.gz: 6dec48193bb27e1c07abae8230031ce9ecb1cfec + libjpeg-turbo-2.1.4.tar.gz: 5a355c08caa326cef7c2a61e062edfe8dd02ac07 diff --git a/libjpeg-turbo.spec b/libjpeg-turbo.spec index 092dd05..67966ab 100644 --- a/libjpeg-turbo.spec +++ b/libjpeg-turbo.spec @@ -12,7 +12,7 @@ Summary: A MMX/SSE2 accelerated library for manipulating JPEG image files Name: libjpeg-turbo Epoch: 1 -Version: 2.1.3 +Version: 2.1.4 Release: 1 License: wxWindows Library License Group: System/Libraries From 63ff62a2ea8c92ff83ebab7b6afe9d9f327f2e71 Mon Sep 17 00:00:00 2001 From: Alexander Stefanov Date: Wed, 15 Feb 2023 23:39:48 +0000 Subject: [PATCH 24/32] version autoupdate [2.1.5.1] --- .abf.yml | 2 +- libjpeg-turbo.spec | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.abf.yml b/.abf.yml index 471d715..b93af24 100644 --- a/.abf.yml +++ b/.abf.yml @@ -1,2 +1,2 @@ sources: - libjpeg-turbo-2.1.4.tar.gz: 5a355c08caa326cef7c2a61e062edfe8dd02ac07 + libjpeg-turbo-2.1.5.1.tar.gz: 3ec9f6a19781a583285d93c2c4653f3dbe845fcc diff --git a/libjpeg-turbo.spec b/libjpeg-turbo.spec index 67966ab..fd24628 100644 --- a/libjpeg-turbo.spec +++ b/libjpeg-turbo.spec @@ -12,7 +12,7 @@ Summary: A MMX/SSE2 accelerated library for manipulating JPEG image files Name: libjpeg-turbo Epoch: 1 -Version: 2.1.4 +Version: 2.1.5.1 Release: 1 License: wxWindows Library License Group: System/Libraries From c0ef24082445f368c4af25db46e84105e4bfdc09 Mon Sep 
17 00:00:00 2001 From: Aleksandr Proklov Date: Wed, 6 Mar 2024 11:07:59 +0900 Subject: [PATCH 25/32] Update version to 3.0.1 --- .abf.yml | 2 +- libjpeg-turbo.spec | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.abf.yml b/.abf.yml index b93af24..e40fea7 100644 --- a/.abf.yml +++ b/.abf.yml @@ -1,2 +1,2 @@ sources: - libjpeg-turbo-2.1.5.1.tar.gz: 3ec9f6a19781a583285d93c2c4653f3dbe845fcc + libjpeg-turbo-3.0.1.tar.gz: 965299edf1bfff8a604f302c7a0d249309339236 diff --git a/libjpeg-turbo.spec b/libjpeg-turbo.spec index fd24628..5f38e75 100644 --- a/libjpeg-turbo.spec +++ b/libjpeg-turbo.spec @@ -12,7 +12,7 @@ Summary: A MMX/SSE2 accelerated library for manipulating JPEG image files Name: libjpeg-turbo Epoch: 1 -Version: 2.1.5.1 +Version: 3.0.1 Release: 1 License: wxWindows Library License Group: System/Libraries From 9cfeeb5e4f9ba850b50073ab468708465b210a21 Mon Sep 17 00:00:00 2001 From: Giovanni Mariani Date: Sun, 10 Mar 2024 16:27:46 +0100 Subject: [PATCH 26/32] Updated to release 3.0.2 --- .abf.yml | 2 +- libjpeg-turbo.spec | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.abf.yml b/.abf.yml index e40fea7..a52b704 100644 --- a/.abf.yml +++ b/.abf.yml @@ -1,2 +1,2 @@ sources: - libjpeg-turbo-3.0.1.tar.gz: 965299edf1bfff8a604f302c7a0d249309339236 + libjpeg-turbo-3.0.2.tar.gz: b6c5d5081ced8502eb1e1e72f1f5cc2856ce90ee diff --git a/libjpeg-turbo.spec b/libjpeg-turbo.spec index 5f38e75..6497ff7 100644 --- a/libjpeg-turbo.spec +++ b/libjpeg-turbo.spec @@ -12,10 +12,10 @@ Summary: A MMX/SSE2 accelerated library for manipulating JPEG image files Name: libjpeg-turbo Epoch: 1 -Version: 3.0.1 +Version: 3.0.2 Release: 1 License: wxWindows Library License -Group: System/Libraries +Group: Graphics Url: http://www.libjpeg-turbo.org Source0: https://sourceforge.net/projects/libjpeg-turbo/files/%{version}/%{name}-%{version}.tar.gz # These two allow automatic lossless rotation of JPEG images from a digital @@ -201,5 +201,5 @@ install -m755 exifautotran -D %{buildroot}%{_bindir}/exifautotran #(neoclust) Provide jpegint.h because it is needed by certain software install -m644 jpegint.h -D %{buildroot}%{_includedir}/jpegint.h -# cleanup +# Cleanup rm -rf %{buildroot}%{_docdir}/* From cdc13abac1cec10e31354a9834bf080b05766996 Mon Sep 17 00:00:00 2001 From: Alexander Stefanov Date: Sun, 31 Mar 2024 04:34:43 +0300 Subject: [PATCH 27/32] add e2k --- libjpeg-turbo-3.0.2-e2k.patch | 4897 +++++++++++++++++++++++++++++++++ libjpeg-turbo.spec | 10 +- 2 files changed, 4903 insertions(+), 4 deletions(-) create mode 100644 libjpeg-turbo-3.0.2-e2k.patch diff --git a/libjpeg-turbo-3.0.2-e2k.patch b/libjpeg-turbo-3.0.2-e2k.patch new file mode 100644 index 0000000..a80f1d3 --- /dev/null +++ b/libjpeg-turbo-3.0.2-e2k.patch @@ -0,0 +1,4897 @@ +From 5c6ff06bc9aec237e1ba222a3dde057cbfa81c9d Mon Sep 17 00:00:00 2001 +From: Ilya Kurdyukov +Date: Fri, 9 Feb 2024 09:57:55 +0700 +Subject: [PATCH] libjpeg-turbo-3.0.2 e2k support + +--- + CMakeLists.txt | 5 + + simd/CMakeLists.txt | 23 ++ + simd/e2k/jccolext-e2k.c | 213 +++++++++++ + simd/e2k/jccolor-e2k.c | 163 +++++++++ + simd/e2k/jchuff-e2k.c | 307 ++++++++++++++++ + simd/e2k/jcphuff-e2k.c | 145 ++++++++ + simd/e2k/jcsample-e2k.c | 203 +++++++++++ + simd/e2k/jcsample.h | 28 ++ + simd/e2k/jdcolext-e2k.c | 258 +++++++++++++ + simd/e2k/jdcolor-e2k.c | 289 +++++++++++++++ + simd/e2k/jdcoltab-e2k.c | 80 ++++ + simd/e2k/jdsample-e2k.c | 389 ++++++++++++++++++++ + simd/e2k/jfdctflt-e2k.c | 127 +++++++ + simd/e2k/jfdctfst-e2k.c | 145 ++++++++ + 
simd/e2k/jfdctint-e2k.c | 255 +++++++++++++ + simd/e2k/jidctflt-e2k.c | 215 +++++++++++ + simd/e2k/jidctfst-e2k.c | 187 ++++++++++ + simd/e2k/jidctint-e2k.c | 294 +++++++++++++++ + simd/e2k/jquantf-e2k.c | 121 +++++++ + simd/e2k/jquanti-e2k.c | 178 +++++++++ + simd/e2k/jsimd.c | 761 +++++++++++++++++++++++++++++++++++++++ + simd/e2k/jsimd_api_e2k.h | 94 +++++ + simd/e2k/jsimd_e2k.h | 207 +++++++++++ + 23 files changed, 4687 insertions(+) + create mode 100644 simd/e2k/jccolext-e2k.c + create mode 100644 simd/e2k/jccolor-e2k.c + create mode 100644 simd/e2k/jchuff-e2k.c + create mode 100644 simd/e2k/jcphuff-e2k.c + create mode 100644 simd/e2k/jcsample-e2k.c + create mode 100644 simd/e2k/jcsample.h + create mode 100644 simd/e2k/jdcolext-e2k.c + create mode 100644 simd/e2k/jdcolor-e2k.c + create mode 100644 simd/e2k/jdcoltab-e2k.c + create mode 100644 simd/e2k/jdsample-e2k.c + create mode 100644 simd/e2k/jfdctflt-e2k.c + create mode 100644 simd/e2k/jfdctfst-e2k.c + create mode 100644 simd/e2k/jfdctint-e2k.c + create mode 100644 simd/e2k/jidctflt-e2k.c + create mode 100644 simd/e2k/jidctfst-e2k.c + create mode 100644 simd/e2k/jidctint-e2k.c + create mode 100644 simd/e2k/jquantf-e2k.c + create mode 100644 simd/e2k/jquanti-e2k.c + create mode 100644 simd/e2k/jsimd.c + create mode 100644 simd/e2k/jsimd_api_e2k.h + create mode 100644 simd/e2k/jsimd_e2k.h + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index adb0ca4..3b445a0 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -125,6 +125,9 @@ elseif(CMAKE_SYSTEM_PROCESSOR_LC STREQUAL "aarch64" OR + elseif(CMAKE_SYSTEM_PROCESSOR_LC MATCHES "^ppc" OR + CMAKE_SYSTEM_PROCESSOR_LC MATCHES "^powerpc") + set(CPU_TYPE powerpc) ++elseif(CMAKE_SYSTEM_PROCESSOR_LC STREQUAL "e2k" OR ++ CMAKE_SYSTEM_PROCESSOR_LC STREQUAL "elbrus") ++ set(CPU_TYPE e2k) + else() + set(CPU_TYPE ${CMAKE_SYSTEM_PROCESSOR_LC}) + endif() +@@ -906,6 +909,8 @@ if(CPU_TYPE STREQUAL "x86_64" OR CPU_TYPE STREQUAL "i386") + elseif(CPU_TYPE STREQUAL "x86_64") + set(DEFAULT_FLOATTEST8 no-fp-contract) + endif() ++elseif(WITH_SIMD AND CPU_TYPE STREQUAL "e2k") ++ set(DEFAULT_FLOATTEST8 sse) + elseif(CPU_TYPE STREQUAL "powerpc" OR CPU_TYPE STREQUAL "arm64") + if(CMAKE_C_COMPILER_ID STREQUAL "Clang") + if(CMAKE_C_COMPILER_VERSION VERSION_EQUAL 14.0.0 OR +diff --git a/simd/CMakeLists.txt b/simd/CMakeLists.txt +index 6024900..789fa3f 100644 +--- a/simd/CMakeLists.txt ++++ b/simd/CMakeLists.txt +@@ -531,6 +531,29 @@ if(CMAKE_POSITION_INDEPENDENT_CODE OR ENABLE_SHARED) + set_target_properties(simd PROPERTIES POSITION_INDEPENDENT_CODE 1) + endif() + ++############################################################################### ++# Elbrus (Intrinsics) ++############################################################################### ++ ++elseif(CPU_TYPE STREQUAL "e2k") ++ ++set(SIMD_SOURCES e2k/jquanti-e2k.c e2k/jquantf-e2k.c ++ e2k/jccolor-e2k.c e2k/jcsample-e2k.c ++ e2k/jdcolor-e2k.c e2k/jdsample-e2k.c ++ e2k/jfdctint-e2k.c e2k/jfdctfst-e2k.c e2k/jfdctflt-e2k.c ++ e2k/jidctint-e2k.c e2k/jidctfst-e2k.c e2k/jidctflt-e2k.c ++ e2k/jchuff-e2k.c e2k/jcphuff-e2k.c) ++ ++set_source_files_properties(${SIMD_SOURCES} PROPERTIES ++ COMPILE_FLAGS -msse4.1) ++ ++set(SIMD_SOURCES ${SIMD_SOURCES} e2k/jsimd.c) ++ ++add_library(simd OBJECT ${SIMD_SOURCES}) ++ ++if(CMAKE_POSITION_INDEPENDENT_CODE OR ENABLE_SHARED) ++ set_target_properties(simd PROPERTIES POSITION_INDEPENDENT_CODE 1) ++endif() + + ############################################################################### + # None +diff --git 
a/simd/e2k/jccolext-e2k.c b/simd/e2k/jccolext-e2k.c +new file mode 100644 +index 0000000..49abdb4 +--- /dev/null ++++ b/simd/e2k/jccolext-e2k.c +@@ -0,0 +1,213 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved. ++ * Copyright (C) 2014, Jay Foad. All Rights Reserved. ++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. ++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. ++ */ ++ ++/* This file is included by jccolor-e2k.c */ ++ ++void rgbn_ycc_convert(JDIMENSION img_width, JSAMPARRAY input_buf, ++ JSAMPIMAGE output_buf, JDIMENSION output_row, ++ int num_rows, int shuf_idx) ++{ ++ JSAMPROW inptr, outptr0, outptr1, outptr2; ++ unsigned char __attribute__((aligned(16))) tmpbuf[PIXELSIZE * 16]; ++ ++ __m128i pb_zero = _mm_setzero_si128(); ++ __m128i pb_shuf0 = VEC_LD(rgb_ycc_shuf_const[shuf_idx]); ++#if PIXELSIZE == 4 ++ __m128i rgb3 = pb_zero; ++#else ++ __m128i pb_shuf4 = VEC_LD(rgb_ycc_shuf_const[shuf_idx] + 16); ++#endif ++ __m128i rgb0, rgb1 = pb_zero, rgb2 = pb_zero, ++ rgbg0, rgbg1, rgbg2, rgbg3, rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3; ++ __m128i y, yl, yh, y0, y1, y2, y3; ++ __m128i cb, cr, crl, crh, cbl, cbh; ++ __m128i cr0, cr1, cr2, cr3, cb0, cb1, cb2, cb3; ++ ++ /* Constants */ ++ __m128i pw_f0299_f0337 = _mm_setr_epi16(__4X2(F_0_299, F_0_337)), ++ pw_f0114_f0250 = _mm_setr_epi16(__4X2(F_0_114, F_0_250)), ++ pw_mf016_mf033 = _mm_setr_epi16(__4X2(-F_0_168, -F_0_331)), ++ pw_mf008_mf041 = _mm_setr_epi16(__4X2(-F_0_081, -F_0_418)), ++ pw_mf050_f000 = _mm_setr_epi16(__4X2(-F_0_500, 0)), ++ pd_onehalf = _mm_set1_epi32(ONE_HALF), ++ pd_onehalfm1_cj = _mm_set1_epi32(ONE_HALF - 1 + (CENTERJSAMPLE << SCALEBITS)); ++#ifdef NEED_ALIGN8 ++ ALIGN8_COMMON ++ ALIGN8_VARS(src) ++#endif ++ ++ if (img_width > 0) ++ while (--num_rows >= 0) { ++ int num_cols; ++ inptr = *input_buf++; ++ outptr0 = output_buf[0][output_row]; ++ outptr1 = output_buf[1][output_row]; ++ outptr2 = output_buf[2][output_row]; ++ output_row++; ++ ++ if (img_width >= 16) { ++#ifdef NEED_ALIGN8 ++ ALIGN8_START(inptr, src) ++ inptr += (img_width & -16) * PIXELSIZE; ++#endif ++ ++ PRAGMA_E2K("ivdep") ++ for (num_cols = img_width; num_cols >= 16; num_cols -= 16, ++ outptr0 += 16, outptr1 += 16, outptr2 += 16) { ++#ifdef NEED_ALIGN8 ++ ALIGN8_READ16(rgb0, src, 0) ++ ALIGN8_READ16(rgb1, src, 1) ++ ALIGN8_READ16(rgb2, src, 2) ++#if PIXELSIZE == 4 ++ ALIGN8_READ16(rgb3, src, 3) ++#endif ++ src_ptr += PIXELSIZE * 2; ++#else ++ rgb0 = VEC_LD(inptr); ++ rgb1 = VEC_LD(inptr + 16); ++ rgb2 = VEC_LD(inptr + 32); ++#if PIXELSIZE == 4 ++ rgb3 = VEC_LD(inptr + 48); ++#endif ++ inptr += PIXELSIZE * 16; ++#endif ++ RGB_SHUFFLE ++ CALC_Y(outptr0) ++ 
CALC_CC(outptr1, outptr2) ++ } ++ } ++ ++ num_cols = img_width & 15; ++ if (num_cols) { ++ int i; ++ memcpy(tmpbuf, inptr, num_cols * PIXELSIZE); ++ rgb0 = VEC_LD(tmpbuf); ++ rgb1 = VEC_LD(tmpbuf + 16); ++ rgb2 = VEC_LD(tmpbuf + 32); ++#if PIXELSIZE == 4 ++ rgb3 = VEC_LD(tmpbuf + 48); ++#endif ++ RGB_SHUFFLE ++ CALC_Y(tmpbuf) ++ CALC_CC(tmpbuf + 16, tmpbuf + 32) ++ ++ for (i = 0; i < num_cols; i++) { ++ outptr0[i] = tmpbuf[i]; ++ outptr1[i] = tmpbuf[i + 16]; ++ outptr2[i] = tmpbuf[i + 32]; ++ } ++ } ++ } ++} ++ ++void rgbn_gray_convert(JDIMENSION img_width, JSAMPARRAY input_buf, ++ JSAMPIMAGE output_buf, JDIMENSION output_row, ++ int num_rows, int shuf_idx) ++{ ++ JSAMPROW inptr, outptr; ++ uint8_t __attribute__((aligned(16))) tmpbuf[PIXELSIZE * 16]; ++ ++ __m128i pb_zero = _mm_setzero_si128(); ++ __m128i pb_shuf0 = VEC_LD(rgb_ycc_shuf_const[shuf_idx]); ++#if PIXELSIZE == 4 ++ __m128i rgb3 = pb_zero; ++#else ++ __m128i pb_shuf4 = VEC_LD(rgb_ycc_shuf_const[shuf_idx] + 16); ++#endif ++ __m128i rgb0, rgb1 = pb_zero, rgb2 = pb_zero, ++ rgbg0, rgbg1, rgbg2, rgbg3, rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3; ++ __m128i y, yl, yh, y0, y1, y2, y3; ++ ++ /* Constants */ ++ __m128i pw_f0299_f0337 = _mm_setr_epi16(__4X2(F_0_299, F_0_337)), ++ pw_f0114_f0250 = _mm_setr_epi16(__4X2(F_0_114, F_0_250)), ++ pd_onehalf = _mm_set1_epi32(ONE_HALF); ++#ifdef NEED_ALIGN8 ++ ALIGN8_COMMON ++ ALIGN8_VARS(src) ++#endif ++ ++ if (img_width > 0) ++ while (--num_rows >= 0) { ++ int num_cols; ++ inptr = *input_buf++; ++ outptr = output_buf[0][output_row]; ++ output_row++; ++ ++ if (img_width >= 16) { ++#ifdef NEED_ALIGN8 ++ ALIGN8_START(inptr, src) ++ inptr += (img_width & -16) * PIXELSIZE; ++#endif ++ ++ PRAGMA_E2K("ivdep") ++ for (num_cols = img_width; num_cols >= 16; num_cols -= 16, ++ outptr += 16) { ++#ifdef NEED_ALIGN8 ++ ALIGN8_READ16(rgb0, src, 0) ++ ALIGN8_READ16(rgb1, src, 1) ++ ALIGN8_READ16(rgb2, src, 2) ++#if PIXELSIZE == 4 ++ ALIGN8_READ16(rgb3, src, 3) ++#endif ++ src_ptr += PIXELSIZE * 2; ++#else ++ rgb0 = VEC_LD(inptr); ++ rgb1 = VEC_LD(inptr + 16); ++ rgb2 = VEC_LD(inptr + 32); ++#if PIXELSIZE == 4 ++ rgb3 = VEC_LD(inptr + 48); ++#endif ++ inptr += PIXELSIZE * 16; ++#endif ++ RGB_SHUFFLE ++ CALC_Y(outptr) ++ } ++ } ++ ++ num_cols = img_width & 15; ++ if (num_cols) { ++ int i; ++ memcpy(tmpbuf, inptr, num_cols * PIXELSIZE); ++ rgb0 = VEC_LD(tmpbuf); ++ rgb1 = VEC_LD(tmpbuf + 16); ++ rgb2 = VEC_LD(tmpbuf + 32); ++#if PIXELSIZE == 4 ++ rgb3 = VEC_LD(tmpbuf + 48); ++#endif ++ RGB_SHUFFLE ++ CALC_Y(tmpbuf) ++ ++ for (i = 0; i < num_cols; i++) { ++ outptr[i] = tmpbuf[i]; ++ } ++ } ++ } ++} ++ ++#undef RGB_SHUFFLE ++#undef PIXELSIZE ++#undef rgbn_ycc_convert ++#undef rgbn_gray_convert ++ +diff --git a/simd/e2k/jccolor-e2k.c b/simd/e2k/jccolor-e2k.c +new file mode 100644 +index 0000000..0af2626 +--- /dev/null ++++ b/simd/e2k/jccolor-e2k.c +@@ -0,0 +1,163 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2014, D. R. Commander. All Rights Reserved. ++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. ++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. 
The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. ++ */ ++ ++/* RGB --> YCC CONVERSION */ ++ ++#include "jsimd_e2k.h" ++ ++#define F_0_081 5329 /* FIX(0.08131) */ ++#define F_0_114 7471 /* FIX(0.11400) */ ++#define F_0_168 11059 /* FIX(0.16874) */ ++#define F_0_250 16384 /* FIX(0.25000) */ ++#define F_0_299 19595 /* FIX(0.29900) */ ++#define F_0_331 21709 /* FIX(0.33126) */ ++#define F_0_418 27439 /* FIX(0.41869) */ ++#define F_0_500 32768 /* FIX(0.50000) */ ++#define F_0_587 38470 /* FIX(0.58700) */ ++#define F_0_337 (F_0_587 - F_0_250) /* FIX(0.58700) - FIX(0.25000) */ ++#define F_0_413 (65536 - F_0_587) /* FIX(1.00000) - FIX(0.58700) */ ++ ++#define SCALEBITS 16 ++#define ONE_HALF (1 << (SCALEBITS - 1)) ++ ++#define RGBG_INDEX_(name, color, i, x) \ ++ name##_##color + i * name##_PIXELSIZE + x, \ ++ name##_GREEN + i * name##_PIXELSIZE + x ++#define RGBG_INDEX(name, x) \ ++ RGBG_INDEX_(name, RED, 0, x), RGBG_INDEX_(name, RED, 1, x), \ ++ RGBG_INDEX_(name, RED, 2, x), RGBG_INDEX_(name, RED, 3, x), \ ++ RGBG_INDEX_(name, BLUE, 0, x), RGBG_INDEX_(name, BLUE, 1, x), \ ++ RGBG_INDEX_(name, BLUE, 2, x), RGBG_INDEX_(name, BLUE, 3, x) ++ ++static const uint8_t __attribute__((aligned(16))) ++rgb_ycc_shuf_const[7][32] = { ++ { RGBG_INDEX(RGB, 0), RGBG_INDEX(RGB, 4) }, ++ { RGBG_INDEX(EXT_RGB, 0), RGBG_INDEX(EXT_RGB, 4) }, ++ { RGBG_INDEX(EXT_RGBX, 0), RGBG_INDEX(EXT_RGBX, 4) }, ++ { RGBG_INDEX(EXT_BGR, 0), RGBG_INDEX(EXT_BGR, 4) }, ++ { RGBG_INDEX(EXT_BGRX, 0), RGBG_INDEX(EXT_BGRX, 4) }, ++ { RGBG_INDEX(EXT_XBGR, 0), RGBG_INDEX(EXT_XBGR, 4) }, ++ { RGBG_INDEX(EXT_XRGB, 0), RGBG_INDEX(EXT_XRGB, 4) } ++}; ++ ++ /* rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3 ++ * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7 ++ * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb ++ * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf ++ * ++ * rg0 = R0 G0 R1 G1 R2 G2 R3 G3 ++ * bg0 = B0 G0 B1 G1 B2 G2 B3 G3 ++ * ... 
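++ * (Annotation, not in the upstream patch: the rg/bg pairs above feed
++ * _mm_madd_epi16 against 16-bit coefficient pairs scaled by 2^16.
++ * G's full weight FIX(0.58700) = 38470 does not fit in a signed
++ * 16-bit lane, so it is split between the two dot products: 0.33700
++ * travels with R in pw_f0299_f0337 and 0.25000 with B in
++ * pw_f0114_f0250, which is why F_0_337 is defined above as
++ * F_0_587 - F_0_250.)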
++ */ ++ ++ /* (Original) ++ * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B ++ * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE ++ * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE ++ * ++ * (This implementation) ++ * Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G ++ * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE ++ * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE ++ */ ++ ++#define CALC_Y(outptr0) \ ++ rg0 = _mm_unpacklo_epi8(rgbg0, pb_zero); \ ++ bg0 = _mm_unpackhi_epi8(rgbg0, pb_zero); \ ++ rg1 = _mm_unpacklo_epi8(rgbg1, pb_zero); \ ++ bg1 = _mm_unpackhi_epi8(rgbg1, pb_zero); \ ++ rg2 = _mm_unpacklo_epi8(rgbg2, pb_zero); \ ++ bg2 = _mm_unpackhi_epi8(rgbg2, pb_zero); \ ++ rg3 = _mm_unpacklo_epi8(rgbg3, pb_zero); \ ++ bg3 = _mm_unpackhi_epi8(rgbg3, pb_zero); \ ++ \ ++ /* Calculate Y values */ \ ++ y0 = _mm_add_epi32(_mm_madd_epi16(rg0, pw_f0299_f0337), pd_onehalf); \ ++ y1 = _mm_add_epi32(_mm_madd_epi16(rg1, pw_f0299_f0337), pd_onehalf); \ ++ y2 = _mm_add_epi32(_mm_madd_epi16(rg2, pw_f0299_f0337), pd_onehalf); \ ++ y3 = _mm_add_epi32(_mm_madd_epi16(rg3, pw_f0299_f0337), pd_onehalf); \ ++ y0 = _mm_add_epi32(_mm_madd_epi16(bg0, pw_f0114_f0250), y0); \ ++ y1 = _mm_add_epi32(_mm_madd_epi16(bg1, pw_f0114_f0250), y1); \ ++ y2 = _mm_add_epi32(_mm_madd_epi16(bg2, pw_f0114_f0250), y2); \ ++ y3 = _mm_add_epi32(_mm_madd_epi16(bg3, pw_f0114_f0250), y3); \ ++ \ ++ yl = _mm_packhi_epi32(y0, y1); \ ++ yh = _mm_packhi_epi32(y2, y3); \ ++ y = _mm_packus_epi16(yl, yh); \ ++ VEC_ST(outptr0, y); ++ ++#define CALC_CC(outptr1, outptr2) \ ++ /* Calculate Cb values */ \ ++ cb0 = _mm_add_epi32(_mm_madd_epi16(rg0, pw_mf016_mf033), pd_onehalfm1_cj); \ ++ cb1 = _mm_add_epi32(_mm_madd_epi16(rg1, pw_mf016_mf033), pd_onehalfm1_cj); \ ++ cb2 = _mm_add_epi32(_mm_madd_epi16(rg2, pw_mf016_mf033), pd_onehalfm1_cj); \ ++ cb3 = _mm_add_epi32(_mm_madd_epi16(rg3, pw_mf016_mf033), pd_onehalfm1_cj); \ ++ cb0 = _mm_sub_epi32(cb0, _mm_madd_epi16(bg0, pw_mf050_f000)); \ ++ cb1 = _mm_sub_epi32(cb1, _mm_madd_epi16(bg1, pw_mf050_f000)); \ ++ cb2 = _mm_sub_epi32(cb2, _mm_madd_epi16(bg2, pw_mf050_f000)); \ ++ cb3 = _mm_sub_epi32(cb3, _mm_madd_epi16(bg3, pw_mf050_f000)); \ ++ \ ++ cbl = _mm_packhi_epi32(cb0, cb1); \ ++ cbh = _mm_packhi_epi32(cb2, cb3); \ ++ cb = _mm_packus_epi16(cbl, cbh); \ ++ VEC_ST(outptr1, cb); \ ++ \ ++ /* Calculate Cr values */ \ ++ cr0 = _mm_add_epi32(_mm_madd_epi16(bg0, pw_mf008_mf041), pd_onehalfm1_cj); \ ++ cr1 = _mm_add_epi32(_mm_madd_epi16(bg1, pw_mf008_mf041), pd_onehalfm1_cj); \ ++ cr2 = _mm_add_epi32(_mm_madd_epi16(bg2, pw_mf008_mf041), pd_onehalfm1_cj); \ ++ cr3 = _mm_add_epi32(_mm_madd_epi16(bg3, pw_mf008_mf041), pd_onehalfm1_cj); \ ++ cr0 = _mm_sub_epi32(cr0, _mm_madd_epi16(rg0, pw_mf050_f000)); \ ++ cr1 = _mm_sub_epi32(cr1, _mm_madd_epi16(rg1, pw_mf050_f000)); \ ++ cr2 = _mm_sub_epi32(cr2, _mm_madd_epi16(rg2, pw_mf050_f000)); \ ++ cr3 = _mm_sub_epi32(cr3, _mm_madd_epi16(rg3, pw_mf050_f000)); \ ++ \ ++ crl = _mm_packhi_epi32(cr0, cr1); \ ++ crh = _mm_packhi_epi32(cr2, cr3); \ ++ cr = _mm_packus_epi16(crl, crh); \ ++ VEC_ST(outptr2, cr); ++ ++ ++#define PIXELSIZE 3 ++#define RGB_SHUFFLE \ ++ rgbg0 = _mm_shuffle_epi8(rgb0, pb_shuf0); \ ++ rgbg1 = _mm_shuffle_epi8(VEC_ALIGNR8(rgb1, rgb0), pb_shuf4); \ ++ rgbg2 = _mm_shuffle_epi8(VEC_ALIGNR8(rgb2, rgb1), pb_shuf0); \ ++ rgbg3 = _mm_shuffle_epi8(rgb2, pb_shuf4); ++ ++#define rgbn_ycc_convert jsimd_rgb3_ycc_convert_e2k ++#define rgbn_gray_convert jsimd_rgb3_gray_convert_e2k ++#include 
"jccolext-e2k.c" ++ ++ ++#define PIXELSIZE 4 ++#define RGB_SHUFFLE \ ++ rgbg0 = _mm_shuffle_epi8(rgb0, pb_shuf0); \ ++ rgbg1 = _mm_shuffle_epi8(rgb1, pb_shuf0); \ ++ rgbg2 = _mm_shuffle_epi8(rgb2, pb_shuf0); \ ++ rgbg3 = _mm_shuffle_epi8(rgb3, pb_shuf0); ++ ++#define rgbn_ycc_convert jsimd_rgb4_ycc_convert_e2k ++#define rgbn_gray_convert jsimd_rgb4_gray_convert_e2k ++#include "jccolext-e2k.c" ++ +diff --git a/simd/e2k/jchuff-e2k.c b/simd/e2k/jchuff-e2k.c +new file mode 100644 +index 0000000..ec4329e +--- /dev/null ++++ b/simd/e2k/jchuff-e2k.c +@@ -0,0 +1,307 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2022, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. ++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. ++ * ++ * NOTE: All referenced figures are from ++ * Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994. ++ */ ++ ++/* Encode a single block's worth of coefficients */ ++ ++#include "jsimd_e2k.h" ++ ++#if __SIZEOF_SIZE_T__ != 8 ++#error ++#endif ++ ++typedef unsigned long long bit_buf_type; ++#define BIT_BUF_SIZE 64 ++ ++typedef struct { ++ bit_buf_type put_buffer; /* current bit accumulation buffer */ ++ int free_bits; /* # of bits available in it */ ++ int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */ ++} savable_state; ++ ++typedef struct { ++ JOCTET *next_output_byte; /* => next byte to write in buffer */ ++ size_t free_in_buffer; /* # of byte spaces remaining in buffer */ ++ savable_state cur; /* Current bit buffer & DC state */ ++ j_compress_ptr cinfo; /* dump_buffer needs access to this */ ++ int simd; ++} working_state; ++ ++#define EMIT_BYTE(b) { \ ++ buffer[0] = (JOCTET)(b); \ ++ buffer[1] = 0; \ ++ buffer -= -2 + ((JOCTET)(b) < 0xFF); \ ++} ++ ++#define FLUSH() { \ ++ if (put_buffer & 0x8080808080808080 & ~(put_buffer + 0x0101010101010101)) { \ ++ EMIT_BYTE(put_buffer >> 56) \ ++ EMIT_BYTE(put_buffer >> 48) \ ++ EMIT_BYTE(put_buffer >> 40) \ ++ EMIT_BYTE(put_buffer >> 32) \ ++ EMIT_BYTE(put_buffer >> 24) \ ++ EMIT_BYTE(put_buffer >> 16) \ ++ EMIT_BYTE(put_buffer >> 8) \ ++ EMIT_BYTE(put_buffer ) \ ++ } else { \ ++ *(uint64_t*)buffer = __builtin_bswap64(put_buffer); \ ++ buffer += 8; \ ++ } \ ++} ++ ++#define PUT_AND_FLUSH(code, size) { \ ++ put_buffer = (put_buffer << (size + free_bits)) | (code >> -free_bits); \ ++ FLUSH() \ ++ free_bits += BIT_BUF_SIZE; \ ++ put_buffer = code; \ ++} ++ ++#define PUT_BITS(code, size) { \ ++ free_bits -= size; \ ++ if (free_bits < 0) \ ++ PUT_AND_FLUSH(code, size) \ ++ else \ ++ put_buffer = (put_buffer << size) | code; \ ++} ++ ++#define PUT_CODE(code, size) { \ ++ /* temp &= (((JLONG)1) << nbits) - 1; */ \ ++ /* temp |= code << nbits; */ \ ++ temp = __builtin_e2k_insfd(code, 
__builtin_e2k_insfd(nbits, 6 * 63 + 64, -nbits), temp); \ ++ nbits += size; \ ++ PUT_BITS(temp, nbits) \ ++} ++ ++#define KLOOP_PREPARE(mask, i) \ ++ t0 = _mm_cmpeq_epi8(_mm_packs_epi16(v0, v1), zero); \ ++ t1 = _mm_cmpeq_epi8(_mm_packs_epi16(v2, v3), zero); \ ++ mask = (uint32_t)(_mm_movemask_epi8(t0) | _mm_movemask_epi8(t1) << 16); \ ++ t0 = _mm_add_epi16(_mm_srai_epi16(v0, 15), v0); \ ++ t1 = _mm_add_epi16(_mm_srai_epi16(v1, 15), v1); \ ++ t2 = _mm_add_epi16(_mm_srai_epi16(v2, 15), v2); \ ++ t3 = _mm_add_epi16(_mm_srai_epi16(v3, 15), v3); \ ++ v0 = _mm_abs_epi16(v0); \ ++ v1 = _mm_abs_epi16(v1); \ ++ v2 = _mm_abs_epi16(v2); \ ++ v3 = _mm_abs_epi16(v3); \ ++ VEC_ST(block_nbits + i, v0); \ ++ VEC_ST(block_nbits + i + 8, v1); \ ++ VEC_ST(block_nbits + i + 16, v2); \ ++ VEC_ST(block_nbits + i + 24, v3); \ ++ VEC_ST(block_diff + i, t0); \ ++ VEC_ST(block_diff + i + 8, t1); \ ++ VEC_ST(block_diff + i + 16, t2); \ ++ VEC_ST(block_diff + i + 24, t3); ++ ++#define SHUF16X4(a, b, c, d) _mm_setr_pi8( \ ++ a * 2, a * 2 + 1, b * 2, b * 2 + 1, c * 2, c * 2 + 1, d * 2, d * 2 + 1) ++#define VEC_COMBINE(h0, h1) _mm_unpacklo_epi64( \ ++ _mm_movpi64_epi64(h0), _mm_movpi64_epi64(h1)) ++#define INSFI_M64(a, b, c, d) _mm_cvtsi64_m64(__builtin_e2k_insfd( \ ++ _mm_cvtm64_si64(a), (b & 63) | (d & 63) << 6, _mm_cvtm64_si64(c))) ++ ++GLOBAL(JOCTET *) ++jsimd_huff_encode_one_block_e2k(void *state, JOCTET *buffer, ++ JCOEFPTR block, int last_dc_val, ++ c_derived_tbl *dctbl, c_derived_tbl *actbl) { ++ uint64_t temp, nbits; ++ uint64_t i, r, code, size; ++ uint64_t code_0xf0 = actbl->ehufco[0xf0]; ++ uint64_t size_0xf0 = actbl->ehufsi[0xf0]; ++ ++ working_state *state_ptr = (working_state*)state; ++ bit_buf_type put_buffer = state_ptr->cur.put_buffer; ++ int64_t free_bits = state_ptr->cur.free_bits; ++ ++ __m128i zero = _mm_setzero_si128(); ++ __m128i v0, v1, v2, v3, t0, t1, t2, t3; ++ int64_t mask, mask1; ++ uint16_t __attribute__((aligned(16))) block_nbits[DCTSIZE2]; ++ int16_t __attribute__((aligned(16))) block_diff[DCTSIZE2]; ++ ++#if 1 /* faster this way */ ++ { ++ __m64 d0l, d0h, d1l, d1h, d2l, d2h, d3l, d3h; ++ __m64 d4l, d4h, d5l, d5h, d6l, d6h, d7l, d7h; ++ __m64 h0, h1, h2, h3, r0, r1, c1256 = SHUF16X4(1, 2, 5, 6); ++ ++ d0l = *(__m64*)(block + 8 * 0); d0h = *(__m64*)(block + 8 * 0 + 4); // 0 4 ++ d1l = *(__m64*)(block + 8 * 1); d1h = *(__m64*)(block + 8 * 1 + 4); // 8 12 ++ d2l = *(__m64*)(block + 8 * 2); d2h = *(__m64*)(block + 8 * 2 + 4); // 16 20 ++ d3l = *(__m64*)(block + 8 * 3); d3h = *(__m64*)(block + 8 * 3 + 4); // 24 28 ++ d4l = *(__m64*)(block + 8 * 4); d4h = *(__m64*)(block + 8 * 4 + 4); // 32 36 ++ d5l = *(__m64*)(block + 8 * 5); d5h = *(__m64*)(block + 8 * 5 + 4); // 40 44 ++ d6l = *(__m64*)(block + 8 * 6); d6h = *(__m64*)(block + 8 * 6 + 4); // 48 52 ++ d7l = *(__m64*)(block + 8 * 7); d7h = *(__m64*)(block + 8 * 7 + 4); // 56 60 ++ ++ // d0l[0] d0l[1] d1l[0] d2l[0] ++ // d1l[1] d0l[2] d0l[3] d1l[2] ++ h0 = _mm_unpacklo_pi16(d1l, d2l); ++ r0 = _mm_unpacklo_pi32(d0l, h0); ++ r1 = _mm_shuffle2_pi8(d1l, d0l, SHUF16X4(1, 6, 7, 2)); ++ r0 = _mm_sub_pi16(r0, _mm_cvtsi64_m64((uint16_t)last_dc_val)); ++ v0 = VEC_COMBINE(r0, r1); ++ ++ // d2l[1] d3l[0] d4l[0] d3l[1] ++ // d2l[2] d1l[3] d0h[0] d0h[1] ++ h0 = _mm_srli_si64(_mm_unpacklo_pi32(d2l, d4l), 16); ++ h2 = INSFI_M64(d1l, 0, d2l, 48); ++ r0 = _mm_unpacklo_pi16(h0, d3l); ++ r1 = _mm_alignr_pi8(d0h, h2, 4); ++ v1 = VEC_COMBINE(r0, r1); ++ ++ // d1h[0] d2l[3] d3l[2] d4l[1] ++ // d5l[0] d6l[0] d5l[1] d4l[2] ++ h0 = INSFI_M64(d2l, 32, d1h, 16); ++ h1 = 
INSFI_M64(d4l, -32, d3l, 48); ++ h2 = INSFI_M64(d4l, 16, d6l, 16); ++ r0 = INSFI_M64(h1, 0, h0, 32); ++ r1 = _mm_unpacklo_pi16(d5l, h2); ++ v2 = VEC_COMBINE(r0, r1); ++ ++ // d3l[3] d2h[0] d1h[1] d0h[2] ++ // d0h[3] d1h[2] d2h[1] d3h[0] ++ h0 = _mm_alignr_pi8(d2h, d3l, 6); ++ h1 = INSFI_M64(d0h, 0, d1h, 32); ++ h2 = _mm_unpackhi_pi32(d0h, d1h); ++ h3 = _mm_unpacklo_pi32(d2h, d3h); ++ r0 = INSFI_M64(h1, -16, h0, 32); ++ r1 = _mm_shuffle2_pi8(h2, h3, c1256); ++ v3 = VEC_COMBINE(r0, r1); ++ ++ KLOOP_PREPARE(mask, 0) ++ ++ // d4l[3] d5l[2] d6l[1] d7l[0] ++ // d7l[1] d6l[2] d5l[3] d4h[0] ++ h0 = _mm_unpackhi_pi32(d4l, d5l); ++ h1 = _mm_unpacklo_pi32(d6l, d7l); ++ h2 = INSFI_M64(d6l, 0, d7l, 32); ++ h2 = INSFI_M64(d5l, 0, h2, 48); ++ r0 = _mm_shuffle2_pi8(h0, h1, c1256); ++ r1 = _mm_alignr_pi8(d4h, h2, 2); ++ v0 = VEC_COMBINE(r0, r1); ++ ++ // d3h[1] d2h[2] d1h[3] d2h[3] ++ // d3h[2] d4h[1] d5h[0] d6l[3] ++ h0 = _mm_slli_si64(INSFI_M64(d1h, 16, d3h, 32), 16); ++ h2 = INSFI_M64(d4h, -32, d3h, 48); ++ h3 = INSFI_M64(d6l, 32, d5h, 16); ++ r0 = _mm_unpackhi_pi16(h0, d2h); ++ r1 = _mm_alignr_pi8(h3, h2, 4); ++ v1 = VEC_COMBINE(r0, r1); ++ ++ // d7l[2] d7l[3] d6h[0] d5h[1] ++ // d4h[2] d3h[3] d4h[3] d5h[2] ++ h0 = INSFI_M64(d5h, 0, d6h, 16); ++ h2 = _mm_slli_si64(_mm_unpackhi_pi32(d3h, d5h), 16); ++ r0 = _mm_alignr_pi8(h0, d7l, 4); ++ r1 = _mm_unpackhi_pi16(d4h, h2); ++ v2 = VEC_COMBINE(r0, r1); ++ ++ // d6h[1] d7h[0] d7h[1] d6h[2] ++ // d5h[3] d6h[3] d7h[2] d7h[3] ++ h0 = INSFI_M64(d6h, -16, d7h, 32); ++ h2 = _mm_unpackhi_pi16(d5h, d6h); ++ r0 = _mm_shuffle_pi16(h0, 0xd2); ++ r1 = _mm_unpackhi_pi32(h2, d7h); ++ v3 = VEC_COMBINE(r0, r1); ++ } ++#else ++ v0 = _mm_setr_epi16( ++ block[0] - last_dc_val, block[1], block[8], block[16], ++ block[9], block[2], block[3], block[10]); ++ v1 = _mm_setr_epi16( ++ block[17], block[24], block[32], block[25], ++ block[18], block[11], block[4], block[5]); ++ v2 = _mm_setr_epi16( ++ block[12], block[19], block[26], block[33], ++ block[40], block[48], block[41], block[34]); ++ v3 = _mm_setr_epi16( ++ block[27], block[20], block[13], block[6], ++ block[7], block[14], block[21], block[28]); ++ ++ KLOOP_PREPARE(mask, 0) ++ ++ v0 = _mm_setr_epi16( ++ block[35], block[42], block[49], block[56], ++ block[57], block[50], block[43], block[36]); ++ v1 = _mm_setr_epi16( ++ block[29], block[22], block[15], block[23], ++ block[30], block[37], block[44], block[51]); ++ v2 = _mm_setr_epi16( ++ block[58], block[59], block[52], block[45], ++ block[38], block[31], block[39], block[46]); ++ v3 = _mm_setr_epi16( ++ block[53], block[60], block[61], block[54], ++ block[47], block[55], block[62], block[63]); ++#endif ++ ++ KLOOP_PREPARE(mask1, 32) ++ mask |= mask1 << 32; ++ mask = ~mask; ++ ++ /* Encode the DC coefficient difference per section F.1.2.1 */ ++ ++ nbits = block_nbits[0]; ++ temp = block_diff[0]; ++ nbits = nbits ? 
32 - __builtin_clz(nbits) : 0; ++ ++ /* Emit the Huffman-coded symbol for the number of bits */ ++ code = dctbl->ehufco[nbits]; ++ size = dctbl->ehufsi[nbits]; ++ PUT_CODE(code, size) ++ ++ /* Encode the AC coefficients per section F.1.2.2 */ ++ ++ /* e2k doesn't have a tzcnt instruction */ ++ mask = __builtin_e2k_bitrevd(mask) << 1; ++ ++ for (i = 1; mask; i++, mask <<= 1) { ++ r = __builtin_clzll(mask); ++ mask <<= r; ++ i += r; ++ /* if run length > 15, must emit special run-length-16 codes (0xF0) */ ++ while (r > 15) { ++ PUT_BITS(code_0xf0, size_0xf0) ++ r -= 16; ++ } ++ nbits = block_nbits[i]; ++ temp = block_diff[i]; ++ nbits = 32 - __builtin_clz(nbits); ++ /* Emit Huffman symbol for run length / number of bits */ ++ /* r = r << 4 | nbits; */ ++ r = __builtin_e2k_insfd(r, 4 * 63 + 64, nbits); ++ code = actbl->ehufco[r]; ++ size = actbl->ehufsi[r]; ++ PUT_CODE(code, size) ++ } ++ ++ if (i != 64) { ++ PUT_BITS(actbl->ehufco[0], actbl->ehufsi[0]) ++ } ++ ++ state_ptr->cur.put_buffer = put_buffer; ++ state_ptr->cur.free_bits = free_bits; ++ return buffer; ++} +diff --git a/simd/e2k/jcphuff-e2k.c b/simd/e2k/jcphuff-e2k.c +new file mode 100644 +index 0000000..f69afeb +--- /dev/null ++++ b/simd/e2k/jcphuff-e2k.c +@@ -0,0 +1,145 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2022, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. ++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. ++ */ ++ ++#include "jsimd_e2k.h" ++ ++#define X(i) coefs[i] = block[jpeg_natural_order_start[i]]; ++#define Y(i) coefs[i] = i < rem ? 
block[jpeg_natural_order_start[i]] : 0; ++ ++#define LOOP \ ++ for (i = 0; i < Sl >> 4; i++) { \ ++ X(0) X(1) X(2) X(3) X(4) X(5) X(6) X(7) \ ++ X(8) X(9) X(10) X(11) X(12) X(13) X(14) X(15) \ ++ BLOCK16 \ ++ jpeg_natural_order_start += 16; \ ++ } \ ++ rem = Sl & 15; \ ++ if (Sl & 8) { \ ++ X(0) X(1) X(2) X(3) X(4) X(5) X(6) X(7) \ ++ Y(8) Y(9) Y(10) Y(11) Y(12) Y(13) Y(14) \ ++ coefs[15] = 0; \ ++ BLOCK16 \ ++ } else if (rem > 0) { \ ++ Y(0) Y(1) Y(2) Y(3) Y(4) Y(5) Y(6) Y(7) \ ++ BLOCK8 \ ++ } ++ ++void jsimd_encode_mcu_AC_first_prepare_e2k ++ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al, ++ JCOEF *values, size_t *zerobits) ++{ ++ JCOEF *diff = values + DCTSIZE2; ++ int16_t __attribute__((aligned(16))) coefs[16]; ++ __m128i v0, v1, v2, v3; ++ __m128i c0 = _mm_setzero_si128(), shr = _mm_cvtsi32_si128(Al); ++ int i, rem; ++ ++#define BLOCK16 \ ++ v0 = _mm_load_si128((__m128i*)coefs); \ ++ v1 = _mm_load_si128((__m128i*)coefs + 1); \ ++ v2 = _mm_srai_epi16(v0, 15); \ ++ v3 = _mm_srai_epi16(v1, 15); \ ++ v0 = _mm_sra_epi16(_mm_abs_epi16(v0), shr); \ ++ v1 = _mm_sra_epi16(_mm_abs_epi16(v1), shr); \ ++ v2 = _mm_xor_si128(v0, v2); \ ++ v3 = _mm_xor_si128(v1, v3); \ ++ _mm_store_si128((__m128i*)values, v0); \ ++ _mm_store_si128((__m128i*)values + 1, v1); \ ++ _mm_store_si128((__m128i*)diff, v2); \ ++ _mm_store_si128((__m128i*)diff + 1, v3); \ ++ v2 = _mm_cmpeq_epi8(_mm_packus_epi16(v0, v1), c0); \ ++ ((uint16_t*)zerobits)[i] = ~_mm_movemask_epi8(v2); \ ++ values += 16; diff += 16; ++ ++#define BLOCK8 \ ++ v0 = _mm_load_si128((__m128i*)coefs); \ ++ v2 = _mm_srai_epi16(v0, 15); \ ++ v0 = _mm_sra_epi16(_mm_abs_epi16(v0), shr); \ ++ v2 = _mm_xor_si128(v0, v2); \ ++ _mm_store_si128((__m128i*)values, v0); \ ++ _mm_store_si128((__m128i*)diff, v2); \ ++ v2 = _mm_cmpeq_epi8(_mm_packus_epi16(v0, c0), c0); \ ++ ((uint16_t*)zerobits)[i] = ~_mm_movemask_epi8(v2); \ ++ values += 8; diff += 8; ++ ++ ((uint64_t*)zerobits)[0] = 0; ++ LOOP ++#undef BLOCK16 ++#undef BLOCK8 ++ ++ for (i = (64 - Sl) >> 3; i; i--) { ++ _mm_store_si128((__m128i*)values, c0); ++ _mm_store_si128((__m128i*)diff, c0); ++ values += 8; diff += 8; ++ } ++} ++ ++int jsimd_encode_mcu_AC_refine_prepare_e2k ++ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al, ++ JCOEF *absvalues, size_t *bits) ++{ ++ union { uint64_t q; uint16_t w[4]; } mask1 = { 0 }; ++ int16_t __attribute__((aligned(16))) coefs[16]; ++ __m128i v0, v1, v2, c1 = _mm_set1_epi8(1); ++ __m128i c0 = _mm_setzero_si128(), shr = _mm_cvtsi32_si128(Al); ++ int i, rem; ++ ++#define BLOCK16 \ ++ v0 = _mm_load_si128((__m128i*)coefs); \ ++ v1 = _mm_load_si128((__m128i*)coefs + 1); \ ++ v2 = _mm_packs_epi16(v0, v1); \ ++ ((uint16_t*)bits)[4 + i] = ~_mm_movemask_epi8(v2); \ ++ v0 = _mm_sra_epi16(_mm_abs_epi16(v0), shr); \ ++ v1 = _mm_sra_epi16(_mm_abs_epi16(v1), shr); \ ++ v2 = _mm_cmpeq_epi8(_mm_packus_epi16(v0, v1), c0); \ ++ ((uint16_t*)bits)[i] = ~_mm_movemask_epi8(v2); \ ++ _mm_store_si128((__m128i*)absvalues, v0); \ ++ _mm_store_si128((__m128i*)absvalues + 1, v1); \ ++ v2 = _mm_cmpeq_epi8(_mm_packus_epi16(v0, v1), c1); \ ++ mask1.w[i] = _mm_movemask_epi8(v2); \ ++ absvalues += 16; ++ ++#define BLOCK8 \ ++ v0 = _mm_load_si128((__m128i*)coefs); \ ++ v2 = _mm_packs_epi16(v0, c0); \ ++ ((uint16_t*)bits)[4 + i] = ~_mm_movemask_epi8(v2); \ ++ v0 = _mm_sra_epi16(_mm_abs_epi16(v0), shr); \ ++ v2 = _mm_cmpeq_epi8(_mm_packus_epi16(v0, c0), c0); \ ++ ((uint16_t*)bits)[i] = ~_mm_movemask_epi8(v2); \ ++ _mm_store_si128((__m128i*)absvalues, v0); \ ++ 
v2 = _mm_cmpeq_epi8(_mm_packus_epi16(v0, c0), c1); \ ++ mask1.w[i] = _mm_movemask_epi8(v2); \ ++ absvalues += 8; ++ ++ ((uint64_t*)bits)[0] = 0; /* zero */ ++ ((uint64_t*)bits)[1] = 0; /* sign */ ++ LOOP ++#undef BLOCK16 ++#undef BLOCK8 ++ ++ for (i = (64 - Sl) >> 3; i; i--) { ++ _mm_store_si128((__m128i*)absvalues, c0); ++ absvalues += 8; ++ } ++ ++ return 63 - __builtin_clzll(mask1.q | 1); ++} +diff --git a/simd/e2k/jcsample-e2k.c b/simd/e2k/jcsample-e2k.c +new file mode 100644 +index 0000000..cac8897 +--- /dev/null ++++ b/simd/e2k/jcsample-e2k.c +@@ -0,0 +1,203 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2015, D. R. Commander. All Rights Reserved. ++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. ++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. ++ */ ++ ++/* CHROMA DOWNSAMPLING */ ++ ++#include "jsimd_e2k.h" ++#include "jcsample.h" ++ ++void jsimd_h2v1_downsample_e2k(JDIMENSION image_width, ++ int max_v_samp_factor, ++ JDIMENSION v_samp_factor, ++ JDIMENSION width_in_blocks, ++ JSAMPARRAY input_data, ++ JSAMPARRAY output_data) ++{ ++ int outcol; ++ JDIMENSION output_cols = width_in_blocks * DCTSIZE, outrow; ++ JSAMPROW inptr, outptr; ++ ++ __m128i this0, next0, out; ++ __m128i this0e, this0o, next0e, next0o, outl, outh; ++ ++ /* Constants */ ++ __m128i pw_bias = _mm_set1_epi32(1 << 16), ++ even_mask = _mm_set1_epi16(255); ++#ifdef NEED_ALIGN8 ++ ALIGN8_COMMON ++ ALIGN8_VARS(src) ++#endif ++ ++ expand_right_edge(input_data, max_v_samp_factor, ++ image_width, output_cols * 2); ++ ++ if (output_cols > 0) ++ for (outrow = 0; outrow < v_samp_factor; outrow++) { ++ outptr = output_data[outrow]; ++ inptr = input_data[outrow]; ++ ++#ifdef NEED_ALIGN8 ++ ALIGN8_START(inptr, src) ++#endif ++ PRAGMA_E2K("ivdep") ++ for (outcol = output_cols; outcol > 8; ++ outcol -= 16, outptr += 16) { ++ ++#ifdef NEED_ALIGN8 ++ ALIGN8_READ16(this0, src, 0) ++ ALIGN8_READ16(next0, src, 1) ++ src_ptr += 4; ++#else ++ this0 = VEC_LD(inptr); ++ next0 = VEC_LD(inptr + 16); ++ inptr += 32; ++#endif ++ this0e = _mm_and_si128(this0, even_mask); ++ this0o = _mm_srli_epi16(this0, 8); ++ outl = _mm_add_epi16(this0e, this0o); ++ outl = _mm_srli_epi16(_mm_add_epi16(outl, pw_bias), 1); ++ next0e = _mm_and_si128(next0, even_mask); ++ next0o = _mm_srli_epi16(next0, 8); ++ outh = _mm_add_epi16(next0e, next0o); ++ outh = _mm_srli_epi16(_mm_add_epi16(outh, pw_bias), 1); ++ ++ out = _mm_packus_epi16(outl, outh); ++ VEC_ST(outptr, out); ++ } ++ if (outcol > 0) { ++#ifdef NEED_ALIGN8 ++ ALIGN8_READ16(this0, src, 0) ++#else ++ this0 = VEC_LD(inptr); ++#endif ++ this0e = _mm_and_si128(this0, even_mask); ++ this0o = _mm_srli_epi16(this0, 8); ++ outl = _mm_add_epi16(this0e, 
this0o); ++ outl = _mm_srli_epi16(_mm_add_epi16(outl, pw_bias), 1); ++ ++ out = _mm_packus_epi16(outl, outl); ++ VEC_STL(outptr, out); ++ } ++ } ++} ++ ++ ++void jsimd_h2v2_downsample_e2k(JDIMENSION image_width, int max_v_samp_factor, ++ JDIMENSION v_samp_factor, ++ JDIMENSION width_in_blocks, ++ JSAMPARRAY input_data, JSAMPARRAY output_data) ++{ ++ int outcol; ++ JDIMENSION output_cols = width_in_blocks * DCTSIZE, outrow; ++ JSAMPROW inptr0, inptr1, outptr; ++ ++ __m128i this0, next0, this1, next1, out; ++ __m128i this0e, this0o, next0e, next0o, this1e, this1o, ++ next1e, next1o, out0l, out0h, out1l, out1h, outl, outh; ++ ++ /* Constants */ ++ __m128i pw_bias = _mm_set1_epi32(1 | 2 << 16), ++ even_mask = _mm_set1_epi16(255); ++#ifdef NEED_ALIGN8 ++ ALIGN8_COMMON ++ ALIGN8_VARS(src0) ++ ALIGN8_VARS(src1) ++#endif ++ ++ expand_right_edge(input_data, max_v_samp_factor, ++ image_width, output_cols * 2); ++ ++ if (output_cols > 0) ++ for (outrow = 0; outrow < v_samp_factor; outrow++) { ++ inptr0 = input_data[outrow * 2]; ++ inptr1 = input_data[outrow * 2 + 1]; ++ outptr = output_data[outrow]; ++ ++#ifdef NEED_ALIGN8 ++ ALIGN8_START(inptr0, src0) ++ ALIGN8_START(inptr1, src1) ++#endif ++ PRAGMA_E2K("ivdep") ++ for (outcol = output_cols; outcol > 8; ++ outcol -= 16, outptr += 16) { ++ ++#ifdef NEED_ALIGN8 ++ ALIGN8_READ16(this0, src0, 0) src0_ptr += 2; ++ ALIGN8_READ16(this1, src1, 0) src1_ptr += 2; ++#else ++ this0 = VEC_LD(inptr0); inptr0 += 16; ++ this1 = VEC_LD(inptr1); inptr1 += 16; ++#endif ++ this0e = _mm_and_si128(this0, even_mask); ++ this1e = _mm_and_si128(this1, even_mask); ++ this0o = _mm_srli_epi16(this0, 8); ++ this1o = _mm_srli_epi16(this1, 8); ++ out0l = _mm_add_epi16(this0e, this0o); ++ out1l = _mm_add_epi16(this1e, this1o); ++ ++ outl = _mm_add_epi16(out0l, out1l); ++ outl = _mm_srli_epi16(_mm_add_epi16(outl, pw_bias), 2); ++ ++#ifdef NEED_ALIGN8 ++ ALIGN8_READ16(next0, src0, 0) src0_ptr += 2; ++ ALIGN8_READ16(next1, src1, 0) src1_ptr += 2; ++#else ++ next0 = VEC_LD(inptr0); inptr0 += 16; ++ next1 = VEC_LD(inptr1); inptr1 += 16; ++#endif ++ next0e = _mm_and_si128(next0, even_mask); ++ next1e = _mm_and_si128(next1, even_mask); ++ next0o = _mm_srli_epi16(next0, 8); ++ next1o = _mm_srli_epi16(next1, 8); ++ out0h = _mm_add_epi16(next0e, next0o); ++ out1h = _mm_add_epi16(next1e, next1o); ++ ++ outh = _mm_add_epi16(out0h, out1h); ++ outh = _mm_srli_epi16(_mm_add_epi16(outh, pw_bias), 2); ++ ++ out = _mm_packus_epi16(outl, outh); ++ VEC_ST(outptr, out); ++ } ++ if (outcol > 0) { ++#ifdef NEED_ALIGN8 ++ ALIGN8_READ16(this0, src0, 0) ++ ALIGN8_READ16(this1, src1, 0) ++#else ++ this0 = VEC_LD(inptr0); ++ this1 = VEC_LD(inptr1); ++#endif ++ this0e = _mm_and_si128(this0, even_mask); ++ this1e = _mm_and_si128(this1, even_mask); ++ this0o = _mm_srli_epi16(this0, 8); ++ this1o = _mm_srli_epi16(this1, 8); ++ out0l = _mm_add_epi16(this0e, this0o); ++ out1l = _mm_add_epi16(this1e, this1o); ++ ++ outl = _mm_add_epi16(out0l, out1l); ++ outl = _mm_srli_epi16(_mm_add_epi16(outl, pw_bias), 2); ++ ++ out = _mm_packus_epi16(outl, outl); ++ VEC_STL(outptr, out); ++ } ++ } ++} +diff --git a/simd/e2k/jcsample.h b/simd/e2k/jcsample.h +new file mode 100644 +index 0000000..2ac4816 +--- /dev/null ++++ b/simd/e2k/jcsample.h +@@ -0,0 +1,28 @@ ++/* ++ * jcsample.h ++ * ++ * This file was part of the Independent JPEG Group's software: ++ * Copyright (C) 1991-1996, Thomas G. Lane. ++ * For conditions of distribution and use, see the accompanying README.ijg ++ * file. 
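++ *
++ * expand_right_edge() pads each row by replicating its last real
++ * column, so the SIMD downsampling loops can always consume full
++ * 16-byte groups without special-casing the right edge.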
++ */ ++ ++LOCAL(void) ++expand_right_edge(JSAMPARRAY image_data, int num_rows, JDIMENSION input_cols, ++ JDIMENSION output_cols) ++{ ++ register JSAMPROW ptr; ++ register JSAMPLE pixval; ++ register int count; ++ int row; ++ int numcols = (int)(output_cols - input_cols); ++ ++ if (numcols > 0) { ++ for (row = 0; row < num_rows; row++) { ++ ptr = image_data[row] + input_cols; ++ pixval = ptr[-1]; /* don't need GETJSAMPLE() here */ ++ for (count = numcols; count > 0; count--) ++ *ptr++ = pixval; ++ } ++ } ++} +diff --git a/simd/e2k/jdcolext-e2k.c b/simd/e2k/jdcolext-e2k.c +new file mode 100644 +index 0000000..4f12aef +--- /dev/null ++++ b/simd/e2k/jdcolext-e2k.c +@@ -0,0 +1,258 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2015, D. R. Commander. All Rights Reserved. ++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. ++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. ++ */ ++ ++/* This file is included by jdcolor-e2k.c */ ++ ++void ycc_rgbn_convert(JDIMENSION out_width, JSAMPIMAGE input_buf, ++ JDIMENSION input_row, JSAMPARRAY output_buf, ++ int num_rows, int shuf_idx) ++{ ++ JSAMPROW outptr, inptr0, inptr1, inptr2; ++ uint8_t __attribute__((aligned(16))) tmpbuf[PIXELSIZE * 16]; ++ ++ __m128i rgb0, rgb1, rgb2, rgb3, y, cb, cr; ++ __m128i rg0, rg1, bx0, bx1, yl, yh, cbl, cbh, ++ crl, crh, rl, rh, gl, gh, bl, bh, g0w, g1w, g2w, g3w; ++ __m128i g0, g1, g2, g3; ++ ++ /* Constants ++ * NOTE: The >> 1 is to compensate for the fact that vec_madds() returns 17 ++ * high-order bits, not 16. 
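++ * On e2k the multiply is _mm_mulhrs_epi16(), which computes
++ * round((x * c) / 32768), so the halved FIX() constants yield the
++ * intended fractional product; e.g. _mm_mulhrs_epi16(100, F_0_402 >> 1)
++ * = (100 * 13172 + 16384) >> 15 = 40 ~= 0.402 * 100.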
++ */ ++ __m128i pw_f0402 = _mm_set1_epi16(F_0_402 >> 1), ++ pw_mf0228 = _mm_set1_epi16(-F_0_228 >> 1), ++ pw_mf0344_f0285 = _mm_setr_epi16(__4X2(-F_0_344, F_0_285)), ++ pb_255 = _mm_set1_epi8(-1), ++ pw_cj = _mm_set1_epi16(CENTERJSAMPLE), ++ pd_onehalf = _mm_set1_epi32(ONE_HALF), ++ pb_zero = _mm_setzero_si128(); ++ RGB_SHUFFLE_INIT ++#ifdef NEED_ALIGN8 ++ ALIGN8_COMMON ++ ALIGN8_VARS(src0) ++ ALIGN8_VARS(src1) ++ ALIGN8_VARS(src2) ++#endif ++ ++ if (out_width > 0) ++ while (--num_rows >= 0) { ++ int num_cols; ++ inptr0 = input_buf[0][input_row]; ++ inptr1 = input_buf[1][input_row]; ++ inptr2 = input_buf[2][input_row]; ++ input_row++; ++ outptr = *output_buf++; ++ ++ if (out_width >= 16) { ++#ifdef NEED_ALIGN8 ++ ALIGN8_START(inptr0, src0) ++ ALIGN8_START(inptr1, src1) ++ ALIGN8_START(inptr2, src2) ++ inptr0 += out_width & -16; ++ inptr1 += out_width & -16; ++ inptr2 += out_width & -16; ++#endif ++ PRAGMA_E2K("ivdep") ++ for (num_cols = out_width; num_cols >= 16; ++ num_cols -= 16, outptr += PIXELSIZE * 16) { ++#ifdef NEED_ALIGN8 ++ ALIGN8_READ16(y, src0, 0) src0_ptr += 2; ++ ALIGN8_READ16(cb, src1, 0) src1_ptr += 2; ++ ALIGN8_READ16(cr, src2, 0) src2_ptr += 2; ++#else ++ y = VEC_LD(inptr0); inptr0 += 16; ++ cb = VEC_LD(inptr1); inptr1 += 16; ++ cr = VEC_LD(inptr2); inptr2 += 16; ++#endif ++ CALC_RGB ++ RGB_SHUFFLE ++ VEC_ST(outptr, rgb0); ++ VEC_ST(outptr + 16, rgb1); ++ VEC_ST(outptr + 32, rgb2); ++#if PIXELSIZE == 4 ++ VEC_ST(outptr + 48, rgb3); ++#endif ++ } ++ } ++ ++ num_cols = out_width & 15; ++ if (num_cols) { ++ int i; ++ for (i = 0; i < num_cols; i++) { ++ tmpbuf[i] = inptr0[i]; ++ tmpbuf[i + 16] = inptr1[i]; ++ tmpbuf[i + 32] = inptr2[i]; ++ } ++ y = VEC_LD(tmpbuf); ++ cb = VEC_LD(tmpbuf + 16); ++ cr = VEC_LD(tmpbuf + 32); ++ CALC_RGB ++ RGB_SHUFFLE ++ VEC_ST(tmpbuf, rgb0); ++ VEC_ST(tmpbuf + 16, rgb1); ++ VEC_ST(tmpbuf + 32, rgb2); ++#if PIXELSIZE == 4 ++ VEC_ST(tmpbuf + 48, rgb3); ++#endif ++ memcpy(outptr, tmpbuf, num_cols * PIXELSIZE); ++ } ++ } ++} ++ ++void ycc_rgbn_merged(JDIMENSION out_width, JSAMPIMAGE input_buf, ++ JDIMENSION in_row_group_ctr, ++ JDIMENSION in_row_group_ctr_y, ++ JSAMPARRAY output_buf, int shuf_idx) ++{ ++ JSAMPROW outptr, inptr0, inptr1, inptr2; ++ int num_cols; ++ uint8_t __attribute__((aligned(16))) tmpbuf[4 * 16]; ++ ++ __m128i rgb0, rgb1, rgb2, rgb3, y, cb, cr; ++ __m128i rg0, rg1, bx0, bx1, yl, yh, cbl, cbh, ++ crl, crh, r_yl, r_yh, g_yl, g_yh, b_yl, b_yh, g_y0w, g_y1w, g_y2w, g_y3w, ++ rl, rh, gl, gh, bl, bh; ++ __m128i g_y0, g_y1, g_y2, g_y3; ++ ++ /* Constants ++ * NOTE: The >> 1 is to compensate for the fact that vec_madds() returns 17 ++ * high-order bits, not 16. 
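++ * In this merged-upsample variant, CALC_MERGED1 computes the Cb/Cr
++ * contributions once per chroma sample and CALC_MERGED2 then adds
++ * each contribution to two horizontally adjacent Y samples.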
++ */ ++ __m128i pw_f0402 = _mm_set1_epi16(F_0_402 >> 1), ++ pw_mf0228 = _mm_set1_epi16(-F_0_228 >> 1), ++ pw_mf0344_f0285 = _mm_setr_epi16(__4X2(-F_0_344, F_0_285)), ++ pb_255 = _mm_set1_epi8(-1), ++ pw_cj = _mm_set1_epi16(CENTERJSAMPLE), ++ pd_onehalf = _mm_set1_epi32(ONE_HALF), ++ pb_zero = _mm_setzero_si128(); ++ RGB_SHUFFLE_INIT ++#ifdef NEED_ALIGN8 ++ ALIGN8_COMMON ++ ALIGN8_VARS(src0) ++ ALIGN8_VARS(src1) ++ ALIGN8_VARS(src2) ++#endif ++ ++ inptr0 = input_buf[0][in_row_group_ctr_y]; ++ inptr1 = input_buf[1][in_row_group_ctr]; ++ inptr2 = input_buf[2][in_row_group_ctr]; ++ outptr = output_buf[0]; ++ ++ if (out_width >= 32) { ++#ifdef NEED_ALIGN8 ++ ALIGN8_START(inptr0, src0) ++ ALIGN8_START(inptr1, src1) ++ ALIGN8_START(inptr2, src2) ++ inptr0 += out_width & -32; ++ inptr1 += (out_width & -32) >> 1; ++ inptr2 += (out_width & -32) >> 1; ++#endif ++ PRAGMA_E2K("ivdep") ++ for (num_cols = out_width; num_cols >= 32; num_cols -= 32) { ++ ++#ifdef NEED_ALIGN8 ++ ALIGN8_READ16(cb, src1, 0) src1_ptr += 2; ++ ALIGN8_READ16(cr, src2, 0) src2_ptr += 2; ++#else ++ cb = VEC_LD(inptr1); inptr1 += 16; ++ cr = VEC_LD(inptr2); inptr2 += 16; ++#endif ++ CALC_MERGED1 ++ ++#ifdef NEED_ALIGN8 ++ ALIGN8_READ16(y, src0, 0) src0_ptr += 2; ++#else ++ y = VEC_LD(inptr0); inptr0 += 16; ++#endif ++ CALC_MERGED2(r_yl, g_yl, b_yl) ++ RGB_SHUFFLE ++ VEC_ST(outptr, rgb0); ++ VEC_ST(outptr + 16, rgb1); ++ VEC_ST(outptr + 32, rgb2); ++#if PIXELSIZE == 4 ++ VEC_ST(outptr + 48, rgb3); ++#endif ++ outptr += PIXELSIZE * 16; ++ ++#ifdef NEED_ALIGN8 ++ ALIGN8_READ16(y, src0, 0) src0_ptr += 2; ++#else ++ y = VEC_LD(inptr0); inptr0 += 16; ++#endif ++ CALC_MERGED2(r_yh, g_yh, b_yh) ++ RGB_SHUFFLE ++ VEC_ST(outptr, rgb0); ++ VEC_ST(outptr + 16, rgb1); ++ VEC_ST(outptr + 32, rgb2); ++#if PIXELSIZE == 4 ++ VEC_ST(outptr + 48, rgb3); ++#endif ++ outptr += PIXELSIZE * 16; ++ } ++ } ++ ++ num_cols = out_width & 31; ++ if (num_cols) { ++ int i; ++ for (i = 0; i < (num_cols + 1) >> 1; i++) { ++ tmpbuf[i] = inptr1[i]; ++ tmpbuf[i + 16] = inptr2[i]; ++ tmpbuf[i * 2 + 32] = inptr0[i * 2]; ++ tmpbuf[i * 2 + 32 + 1] = inptr0[i * 2 + 1]; ++ } ++ cb = VEC_LD(tmpbuf); ++ cr = VEC_LD(tmpbuf + 16); ++ CALC_MERGED1 ++ ++ y = VEC_LD(tmpbuf + 32); ++ CALC_MERGED2(r_yl, g_yl, b_yl) ++ RGB_SHUFFLE ++ if (num_cols >= 16) { ++ VEC_ST(outptr, rgb0); ++ VEC_ST(outptr + 16, rgb1); ++ VEC_ST(outptr + 32, rgb2); ++#if PIXELSIZE == 4 ++ VEC_ST(outptr + 48, rgb3); ++#endif ++ outptr += PIXELSIZE * 16; ++ ++ y = VEC_LD(tmpbuf + 48); ++ CALC_MERGED2(r_yh, g_yh, b_yh) ++ RGB_SHUFFLE ++ } ++ VEC_ST(tmpbuf, rgb0); ++ VEC_ST(tmpbuf + 16, rgb1); ++ VEC_ST(tmpbuf + 32, rgb2); ++#if PIXELSIZE == 4 ++ VEC_ST(tmpbuf + 48, rgb3); ++#endif ++ memcpy(outptr, tmpbuf, (out_width & 15) * PIXELSIZE); ++ } ++} ++ ++#undef RGB_SHUFFLE_INIT ++#undef RGB_SHUFFLE ++#undef PIXELSIZE ++#undef ycc_rgbn_convert ++#undef ycc_rgbn_merged ++ +diff --git a/simd/e2k/jdcolor-e2k.c b/simd/e2k/jdcolor-e2k.c +new file mode 100644 +index 0000000..94c80e9 +--- /dev/null ++++ b/simd/e2k/jdcolor-e2k.c +@@ -0,0 +1,289 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2015, D. R. Commander. All Rights Reserved. ++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. 
++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. ++ */ ++ ++/* YCC --> RGB CONVERSION */ ++ ++#include "jsimd_e2k.h" ++ ++#define F_0_344 22554 /* FIX(0.34414) */ ++#define F_0_714 46802 /* FIX(0.71414) */ ++#define F_1_402 91881 /* FIX(1.40200) */ ++#define F_1_772 116130 /* FIX(1.77200) */ ++#define F_0_402 (F_1_402 - 65536) /* FIX(1.40200) - FIX(1) */ ++#define F_0_285 (65536 - F_0_714) /* FIX(1) - FIX(0.71414) */ ++#define F_0_228 (131072 - F_1_772) /* FIX(2) - FIX(1.77200) */ ++ ++#define SCALEBITS 16 ++#define ONE_HALF (1 << (SCALEBITS - 1)) ++ ++static const uint8_t __attribute__((aligned(16))) ++#if defined(__iset__) && __iset__ >= 5 ++ycc_rgb_shuf_const[7][48] = { ++#define SHUF_CONST3 \ ++ C0, C1, C2, \ ++ C0 + 4, C1 + 4, C2 + 4, \ ++ C0 + 8, C1 + 8, C2 + 8, \ ++ C0 + 12, C1 + 12, C2 + 12, \ ++ C0 + 16, C1 + 16, C2 + 16, \ ++ C0 + 20, C1 + 4, C2 + 4, \ ++ C0 + 8, C1 + 8, C2 + 8, \ ++ C0 + 12, C1 + 12, C2 + 12, \ ++ C0 + 16, C1 + 16, C2 + 16, \ ++ C0 + 20, C1 + 20, C2 + 20, \ ++ C0 + 24, C1 + 24, C2 + 8, \ ++ C0 + 12, C1 + 12, C2 + 12, \ ++ C0 + 16, C1 + 16, C2 + 16, \ ++ C0 + 20, C1 + 20, C2 + 20, \ ++ C0 + 24, C1 + 24, C2 + 24, \ ++ C0 + 28, C1 + 28, C2 + 28 ++#else ++ycc_rgb_shuf_const[7][24] = { ++#define SHUF_CONST3 \ ++ C0, C1, C2, \ ++ C0 + 4, C1 + 4, C2 + 4, \ ++ C0 + 8, C1 + 8, C2, \ ++ C0 + 4, C1 + 4, C2 + 4, \ ++ C0 + 8, C1 + 8, C2 + 8, \ ++ C0 + 12, C1 + 4, C2 + 4, \ ++ C0 + 8, C1 + 8, C2 + 8, \ ++ C0 + 12, C1 + 12, C2 + 12 ++#endif ++ ++#define SHUF_CONST4 C0, C1, C2, C3, C0 + 4, C1 + 4, C2 + 4, C3 + 4, \ ++ C0 + 8, C1 + 8, C2 + 8, C3 + 8, C0 + 12, C1 + 12, C2 + 12, C3 + 12 ++ ++#define TMP_RED RGB_RED ++#define TMP_GREEN RGB_GREEN ++#define TMP_BLUE RGB_BLUE ++#define PIXELSIZE RGB_PIXELSIZE ++#include "jdcoltab-e2k.c" ++ , ++#define TMP_RED EXT_RGB_RED ++#define TMP_GREEN EXT_RGB_GREEN ++#define TMP_BLUE EXT_RGB_BLUE ++#define PIXELSIZE EXT_RGB_PIXELSIZE ++#include "jdcoltab-e2k.c" ++ , ++#define TMP_RED EXT_RGBX_RED ++#define TMP_GREEN EXT_RGBX_GREEN ++#define TMP_BLUE EXT_RGBX_BLUE ++#define PIXELSIZE EXT_RGBX_PIXELSIZE ++#include "jdcoltab-e2k.c" ++ , ++#define TMP_RED EXT_BGR_RED ++#define TMP_GREEN EXT_BGR_GREEN ++#define TMP_BLUE EXT_BGR_BLUE ++#define PIXELSIZE EXT_BGR_PIXELSIZE ++#include "jdcoltab-e2k.c" ++ , ++#define TMP_RED EXT_BGRX_RED ++#define TMP_GREEN EXT_BGRX_GREEN ++#define TMP_BLUE EXT_BGRX_BLUE ++#define PIXELSIZE EXT_BGRX_PIXELSIZE ++#include "jdcoltab-e2k.c" ++ , ++#define TMP_RED EXT_XBGR_RED ++#define TMP_GREEN EXT_XBGR_GREEN ++#define TMP_BLUE EXT_XBGR_BLUE ++#define PIXELSIZE EXT_XBGR_PIXELSIZE ++#include "jdcoltab-e2k.c" ++ , ++#define TMP_RED EXT_XRGB_RED ++#define TMP_GREEN EXT_XRGB_GREEN ++#define TMP_BLUE EXT_XRGB_BLUE ++#define PIXELSIZE EXT_XRGB_PIXELSIZE ++#include "jdcoltab-e2k.c" ++}; ++ ++ /* (Original) ++ * R = Y + 1.40200 * Cr ++ * G = Y - 0.34414 * Cb - 0.71414 * Cr ++ * B = Y + 1.77200 * Cb ++ * 
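++ * Multipliers of magnitude >= 1 do not fit the signed 1.15 operands
++ * used with _mm_mulhrs_epi16(), so they are split into whole additions
++ * or subtractions of Cb/Cr plus small fractional multiplies
++ * (F_0_402, F_0_285, F_0_228 above):
++ *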
++ * (This implementation) ++ * R = Y + 0.40200 * Cr + Cr ++ * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr ++ * B = Y - 0.22800 * Cb + Cb + Cb ++ */ ++ ++#define CALC_RGB \ ++ yl = _mm_unpacklo_epi8(y, pb_zero); \ ++ yh = _mm_unpackhi_epi8(y, pb_zero); \ ++ \ ++ cbl = _mm_unpacklo_epi8(cb, pb_zero); \ ++ cbh = _mm_unpackhi_epi8(cb, pb_zero); \ ++ cbl = _mm_sub_epi16(cbl, pw_cj); \ ++ cbh = _mm_sub_epi16(cbh, pw_cj); \ ++ \ ++ crl = _mm_unpacklo_epi8(cr, pb_zero); \ ++ crh = _mm_unpackhi_epi8(cr, pb_zero); \ ++ crl = _mm_sub_epi16(crl, pw_cj); \ ++ crh = _mm_sub_epi16(crh, pw_cj); \ ++ \ ++ bl = _mm_mulhrs_epi16(cbl, pw_mf0228); \ ++ bh = _mm_mulhrs_epi16(cbh, pw_mf0228); \ ++ bl = _mm_add_epi16(bl, _mm_add_epi16(cbl, cbl)); \ ++ bh = _mm_add_epi16(bh, _mm_add_epi16(cbh, cbh)); \ ++ bl = _mm_add_epi16(bl, yl); \ ++ bh = _mm_add_epi16(bh, yh); \ ++ \ ++ rl = _mm_mulhrs_epi16(crl, pw_f0402); \ ++ rh = _mm_mulhrs_epi16(crh, pw_f0402); \ ++ rl = _mm_add_epi16(rl, crl); \ ++ rh = _mm_add_epi16(rh, crh); \ ++ rl = _mm_add_epi16(rl, yl); \ ++ rh = _mm_add_epi16(rh, yh); \ ++ \ ++ g0w = _mm_unpacklo_epi16(cbl, crl); \ ++ g1w = _mm_unpackhi_epi16(cbl, crl); \ ++ g0 = _mm_add_epi32(_mm_madd_epi16(g0w, pw_mf0344_f0285), pd_onehalf); \ ++ g1 = _mm_add_epi32(_mm_madd_epi16(g1w, pw_mf0344_f0285), pd_onehalf); \ ++ g2w = _mm_unpacklo_epi16(cbh, crh); \ ++ g3w = _mm_unpackhi_epi16(cbh, crh); \ ++ g2 = _mm_add_epi32(_mm_madd_epi16(g2w, pw_mf0344_f0285), pd_onehalf); \ ++ g3 = _mm_add_epi32(_mm_madd_epi16(g3w, pw_mf0344_f0285), pd_onehalf); \ ++ \ ++ gl = _mm_packhi_epi32(g0, g1); \ ++ gh = _mm_packhi_epi32(g2, g3); \ ++ gl = _mm_sub_epi16(gl, crl); \ ++ gh = _mm_sub_epi16(gh, crh); \ ++ gl = _mm_add_epi16(gl, yl); \ ++ gh = _mm_add_epi16(gh, yh); \ ++ \ ++ rl = _mm_packus_epi16(rl, rh); \ ++ gl = _mm_packus_epi16(gl, gh); \ ++ bl = _mm_packus_epi16(bl, bh); \ ++ \ ++ rg0 = _mm_unpacklo_epi8(rl, gl); \ ++ rg1 = _mm_unpackhi_epi8(rl, gl); \ ++ bx0 = _mm_unpacklo_epi8(bl, pb_255); \ ++ bx1 = _mm_unpackhi_epi8(bl, pb_255); \ ++ \ ++ rgb0 = _mm_unpacklo_epi16(rg0, bx0); \ ++ rgb1 = _mm_unpackhi_epi16(rg0, bx0); \ ++ rgb2 = _mm_unpacklo_epi16(rg1, bx1); \ ++ rgb3 = _mm_unpackhi_epi16(rg1, bx1); ++ ++#define CALC_MERGED1 \ ++ cbl = _mm_unpacklo_epi8(cb, pb_zero); \ ++ cbh = _mm_unpackhi_epi8(cb, pb_zero); \ ++ cbl = _mm_sub_epi16(cbl, pw_cj); \ ++ cbh = _mm_sub_epi16(cbh, pw_cj); \ ++ \ ++ crl = _mm_unpacklo_epi8(cr, pb_zero); \ ++ crh = _mm_unpackhi_epi8(cr, pb_zero); \ ++ crl = _mm_sub_epi16(crl, pw_cj); \ ++ crh = _mm_sub_epi16(crh, pw_cj); \ ++ \ ++ b_yl = _mm_mulhrs_epi16(cbl, pw_mf0228); \ ++ b_yh = _mm_mulhrs_epi16(cbh, pw_mf0228); \ ++ b_yl = _mm_add_epi16(b_yl, _mm_add_epi16(cbl, cbl)); \ ++ b_yh = _mm_add_epi16(b_yh, _mm_add_epi16(cbh, cbh)); \ ++ \ ++ r_yl = _mm_mulhrs_epi16(crl, pw_f0402); \ ++ r_yh = _mm_mulhrs_epi16(crh, pw_f0402); \ ++ r_yl = _mm_add_epi16(r_yl, crl); \ ++ r_yh = _mm_add_epi16(r_yh, crh); \ ++ \ ++ g_y0w = _mm_unpacklo_epi16(cbl, crl); \ ++ g_y1w = _mm_unpackhi_epi16(cbl, crl); \ ++ g_y0 = _mm_add_epi32(_mm_madd_epi16(g_y0w, pw_mf0344_f0285), pd_onehalf); \ ++ g_y1 = _mm_add_epi32(_mm_madd_epi16(g_y1w, pw_mf0344_f0285), pd_onehalf); \ ++ g_y2w = _mm_unpacklo_epi16(cbh, crh); \ ++ g_y3w = _mm_unpackhi_epi16(cbh, crh); \ ++ g_y2 = _mm_add_epi32(_mm_madd_epi16(g_y2w, pw_mf0344_f0285), pd_onehalf); \ ++ g_y3 = _mm_add_epi32(_mm_madd_epi16(g_y3w, pw_mf0344_f0285), pd_onehalf); \ ++ \ ++ g_yl = _mm_packhi_epi32(g_y0, g_y1); \ ++ g_yh = _mm_packhi_epi32(g_y2, g_y3); \ ++ g_yl = 
_mm_sub_epi16(g_yl, crl); \ ++ g_yh = _mm_sub_epi16(g_yh, crh); ++ ++#define CALC_MERGED2(r_yl, g_yl, b_yl) \ ++ yl = _mm_unpacklo_epi8(y, pb_zero); \ ++ yh = _mm_unpackhi_epi8(y, pb_zero); \ ++ bl = _mm_add_epi16(_mm_unpacklo_epi16(b_yl, b_yl), yl); \ ++ bh = _mm_add_epi16(_mm_unpackhi_epi16(b_yl, b_yl), yh); \ ++ rl = _mm_add_epi16(_mm_unpacklo_epi16(r_yl, r_yl), yl); \ ++ rh = _mm_add_epi16(_mm_unpackhi_epi16(r_yl, r_yl), yh); \ ++ gl = _mm_add_epi16(_mm_unpacklo_epi16(g_yl, g_yl), yl); \ ++ gh = _mm_add_epi16(_mm_unpackhi_epi16(g_yl, g_yl), yh); \ ++ rl = _mm_packus_epi16(rl, rh); \ ++ gl = _mm_packus_epi16(gl, gh); \ ++ bl = _mm_packus_epi16(bl, bh); \ ++ \ ++ rg0 = _mm_unpacklo_epi8(rl, gl); \ ++ rg1 = _mm_unpackhi_epi8(rl, gl); \ ++ bx0 = _mm_unpacklo_epi8(bl, pb_255); \ ++ bx1 = _mm_unpackhi_epi8(bl, pb_255); \ ++ \ ++ rgb0 = _mm_unpacklo_epi16(rg0, bx0); \ ++ rgb1 = _mm_unpackhi_epi16(rg0, bx0); \ ++ rgb2 = _mm_unpacklo_epi16(rg1, bx1); \ ++ rgb3 = _mm_unpackhi_epi16(rg1, bx1); ++ ++#define PIXELSIZE 3 ++#if defined(__iset__) && __iset__ >= 5 ++#define RGB_SHUFFLE_INIT __m128i \ ++ rgb_index0 = VEC_LD(ycc_rgb_shuf_const[shuf_idx]), \ ++ rgb_index1 = VEC_LD(ycc_rgb_shuf_const[shuf_idx] + 16), \ ++ rgb_index2 = VEC_LD(ycc_rgb_shuf_const[shuf_idx] + 32); ++#define RGB_SHUFFLE \ ++ rgb0 = _mm_shuffle2_epi8(rgb0, rgb1, rgb_index0); \ ++ rgb1 = _mm_shuffle2_epi8(rgb1, rgb2, rgb_index1); \ ++ rgb2 = _mm_shuffle2_epi8(rgb2, rgb3, rgb_index2); ++#else ++#define RGB_SHUFFLE_INIT __m64 \ ++ rgb_index0 = *(__m64*)ycc_rgb_shuf_const[shuf_idx], \ ++ rgb_index1 = *(__m64*)(ycc_rgb_shuf_const[shuf_idx] + 8), \ ++ rgb_index2 = *(__m64*)(ycc_rgb_shuf_const[shuf_idx] + 16); ++#define RGB_SHUFFLE { \ ++ union { __m128i v; __m64 d[2]; } a = { rgb0 }, \ ++ b = { rgb1 }, c = { rgb2 }, d = { rgb3 }; \ ++ a.d[0] = _mm_shuffle2_pi8(a.d[0], a.d[1], rgb_index0); \ ++ a.d[1] = _mm_shuffle2_pi8(a.d[1], b.d[0], rgb_index1); \ ++ b.d[0] = _mm_shuffle2_pi8(b.d[0], b.d[1], rgb_index2); \ ++ b.d[1] = _mm_shuffle2_pi8(c.d[0], c.d[1], rgb_index0); \ ++ c.d[0] = _mm_shuffle2_pi8(c.d[1], d.d[0], rgb_index1); \ ++ c.d[1] = _mm_shuffle2_pi8(d.d[0], d.d[1], rgb_index2); \ ++ rgb0 = a.v; rgb1 = b.v; rgb2 = c.v; \ ++} ++#endif ++ ++#define ycc_rgbn_convert jsimd_ycc_rgb3_convert_e2k ++#define ycc_rgbn_merged jsimd_ycc_rgb3_merged_upsample_e2k ++#include "jdcolext-e2k.c" ++ ++#define PIXELSIZE 4 ++#define RGB_SHUFFLE_INIT __m128i \ ++ rgb_index0 = VEC_LD(ycc_rgb_shuf_const[shuf_idx]); ++#define RGB_SHUFFLE \ ++ rgb0 = _mm_shuffle_epi8(rgb0, rgb_index0); \ ++ rgb1 = _mm_shuffle_epi8(rgb1, rgb_index0); \ ++ rgb2 = _mm_shuffle_epi8(rgb2, rgb_index0); \ ++ rgb3 = _mm_shuffle_epi8(rgb3, rgb_index0); ++ ++#define ycc_rgbn_convert jsimd_ycc_rgb4_convert_e2k ++#define ycc_rgbn_merged jsimd_ycc_rgb4_merged_upsample_e2k ++#include "jdcolext-e2k.c" ++ +diff --git a/simd/e2k/jdcoltab-e2k.c b/simd/e2k/jdcoltab-e2k.c +new file mode 100644 +index 0000000..e19666d +--- /dev/null ++++ b/simd/e2k/jdcoltab-e2k.c +@@ -0,0 +1,80 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. 
++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. ++ */ ++ ++/* This file is included by jdcolor-e2k.c */ ++ ++#if TMP_RED == 0 ++#define C0 0 ++#elif TMP_GREEN == 0 ++#define C0 1 ++#elif TMP_BLUE == 0 ++#define C0 2 ++#else ++#define C0 3 ++#endif ++ ++#if TMP_RED == 1 ++#define C1 0 ++#elif TMP_GREEN == 1 ++#define C1 1 ++#elif TMP_BLUE == 1 ++#define C1 2 ++#else ++#define C1 3 ++#endif ++ ++#if TMP_RED == 2 ++#define C2 0 ++#elif TMP_GREEN == 2 ++#define C2 1 ++#elif TMP_BLUE == 2 ++#define C2 2 ++#else ++#define C2 3 ++#endif ++ ++#if TMP_RED == 3 ++#define C3 0 ++#elif TMP_GREEN == 3 ++#define C3 1 ++#elif TMP_BLUE == 3 ++#define C3 2 ++#else ++#define C3 3 ++#endif ++ ++#if PIXELSIZE == 3 ++{ SHUF_CONST3 } ++#else ++{ SHUF_CONST4 } ++#endif ++ ++#undef C0 ++#undef C1 ++#undef C2 ++#undef C3 ++ ++#undef TMP_RED ++#undef TMP_GREEN ++#undef TMP_BLUE ++#undef PIXELSIZE ++ +diff --git a/simd/e2k/jdsample-e2k.c b/simd/e2k/jdsample-e2k.c +new file mode 100644 +index 0000000..572b3af +--- /dev/null ++++ b/simd/e2k/jdsample-e2k.c +@@ -0,0 +1,389 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2015, D. R. Commander. All Rights Reserved. ++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. ++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. 
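++ *
++ * NOTE: The "fancy" upsamplers below implement the usual triangle
++ * filter: the nearer input sample is weighted 3 and the further one 1,
++ * with rounding offsets added before the final shift (>> 2 for h2v1,
++ * >> 4 for h2v2).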
++ */ ++ ++/* CHROMA UPSAMPLING */ ++ ++#include "jsimd_e2k.h" ++ ++ ++void jsimd_h2v1_fancy_upsample_e2k(int max_v_samp_factor, ++ JDIMENSION downsampled_width, ++ JSAMPARRAY input_data, ++ JSAMPARRAY *output_data_ptr) ++{ ++ JSAMPARRAY output_data = *output_data_ptr; ++ JSAMPROW inptr, outptr; ++ int inrow, incol; ++ ++ __m128i pb_zero = _mm_setzero_si128(); ++ __m128i this0, last0, p_last0, next0 = pb_zero, p_next0, out; ++ __m128i this0l, this0h, last0l, last0h, ++ next0l, next0h, outle, outhe, outlo, outho; ++#ifdef NEED_ALIGN8 ++ ALIGN8_COMMON ++ ALIGN8_VARS(src) ++#endif ++ ++ /* Constants */ ++ __m128i pw_three = _mm_set1_epi16(3), ++ next_index_lastcol = _mm_setr_epi8( ++ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15), ++ pw_one = _mm_set1_epi16(1), pw_two = _mm_set1_epi16(2); ++ ++ if (downsampled_width > 0) ++ for (inrow = 0; inrow < max_v_samp_factor; inrow++) { ++ inptr = input_data[inrow]; ++ outptr = output_data[inrow]; ++ ++ if (downsampled_width & 15) ++ inptr[downsampled_width] = inptr[downsampled_width - 1]; ++ ++#ifdef NEED_ALIGN8 ++ ALIGN8_START(inptr, src) ++ ALIGN8_READ16(this0, src, 0) ++#else ++ this0 = VEC_LD(inptr); ++#endif ++ last0 = _mm_bslli_si128(this0, 15); ++ ++ PRAGMA_E2K("ivdep") ++ for (incol = downsampled_width; incol > 0; ++ incol -= 16, outptr += 32) { ++ ++ p_last0 = _mm_alignr_epi8(this0, last0, 15); ++ last0 = this0; ++ ++ if (__builtin_expect(incol <= 16, 0)) ++ p_next0 = _mm_shuffle_epi8(this0, next_index_lastcol); ++ else { ++#ifdef NEED_ALIGN8 ++ ALIGN8_READ16(next0, src, 1) src_ptr += 2; ++#else ++ next0 = VEC_LD(inptr + 16); inptr += 16; ++#endif ++ p_next0 = _mm_alignr_epi8(next0, this0, 1); ++ } ++ ++ this0l = _mm_mullo_epi16(_mm_unpacklo_epi8(this0, pb_zero), pw_three); ++ last0l = _mm_unpacklo_epi8(p_last0, pb_zero); ++ next0l = _mm_unpacklo_epi8(p_next0, pb_zero); ++ last0l = _mm_add_epi16(last0l, pw_one); ++ next0l = _mm_add_epi16(next0l, pw_two); ++ ++ outle = _mm_add_epi16(this0l, last0l); ++ outlo = _mm_add_epi16(this0l, next0l); ++ outle = _mm_srli_epi16(outle, 2); ++ outlo = _mm_srli_epi16(outlo, 2); ++ ++ out = _mm_or_si128(outle, _mm_slli_epi16(outlo, 8)); ++ VEC_ST(outptr, out); ++ ++ if (__builtin_expect(incol <= 8, 0)) break; ++ ++ this0h = _mm_mullo_epi16(_mm_unpackhi_epi8(this0, pb_zero), pw_three); ++ last0h = _mm_unpackhi_epi8(p_last0, pb_zero); ++ next0h = _mm_unpackhi_epi8(p_next0, pb_zero); ++ last0h = _mm_add_epi16(last0h, pw_one); ++ next0h = _mm_add_epi16(next0h, pw_two); ++ ++ outhe = _mm_add_epi16(this0h, last0h); ++ outho = _mm_add_epi16(this0h, next0h); ++ outhe = _mm_srli_epi16(outhe, 2); ++ outho = _mm_srli_epi16(outho, 2); ++ ++ out = _mm_or_si128(outhe, _mm_slli_epi16(outho, 8)); ++ VEC_ST(outptr + 16, out); ++ ++ this0 = next0; ++ } ++ } ++} ++ ++ ++void jsimd_h2v2_fancy_upsample_e2k(int max_v_samp_factor, ++ JDIMENSION downsampled_width, ++ JSAMPARRAY input_data, ++ JSAMPARRAY *output_data_ptr) ++{ ++ JSAMPARRAY output_data = *output_data_ptr; ++ JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1; ++ int inrow, outrow, incol; ++#ifdef NEED_ALIGN8 ++ ALIGN8_COMMON ++ ALIGN8_VARS(src_1) ++ ALIGN8_VARS(src0) ++ ALIGN8_VARS(src1) ++#endif ++ ++ __m128i pb_zero = _mm_setzero_si128(); ++ __m128i this_1, this0, this1, out; ++ __m128i this_1l, this_1h, this0l, this0h, this1l, this1h, ++ lastcolsum_1h, lastcolsum1h, ++ p_lastcolsum_1l, p_lastcolsum_1h, p_lastcolsum1l, p_lastcolsum1h, ++ thiscolsum_1l, thiscolsum_1h, thiscolsum1l, thiscolsum1h, ++ nextcolsum_1l = pb_zero, nextcolsum_1h = pb_zero, ++ 
nextcolsum1l = pb_zero, nextcolsum1h = pb_zero,
++    p_nextcolsum_1l, p_nextcolsum_1h, p_nextcolsum1l, p_nextcolsum1h,
++    tmpl, tmph, outle, outhe, outlo, outho;
++
++  /* Constants */
++  __m128i pw_three = _mm_set1_epi16(3),
++    pw_seven = _mm_set1_epi16(7), pw_eight = _mm_set1_epi16(8),
++    next_index_lastcol = _mm_setr_epi8(
++      2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14, 15);
++
++  if (downsampled_width > 0)
++  for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
++
++    inptr_1 = input_data[inrow - 1];
++    inptr0 = input_data[inrow];
++    inptr1 = input_data[inrow + 1];
++    outptr0 = output_data[outrow++];
++    outptr1 = output_data[outrow++];
++
++    if (downsampled_width & 15) {
++      inptr_1[downsampled_width] = inptr_1[downsampled_width - 1];
++      inptr0[downsampled_width] = inptr0[downsampled_width - 1];
++      inptr1[downsampled_width] = inptr1[downsampled_width - 1];
++    }
++
++#ifdef NEED_ALIGN8
++    ALIGN8_START(inptr0, src0)
++    ALIGN8_START(inptr_1, src_1)
++    ALIGN8_START(inptr1, src1)
++    ALIGN8_READ16(this0, src0, 0)
++    ALIGN8_READ16(this_1, src_1, 0)
++    ALIGN8_READ16(this1, src1, 0)
++#else
++    this0 = VEC_LD(inptr0);
++    this_1 = VEC_LD(inptr_1);
++    this1 = VEC_LD(inptr1);
++#endif
++
++    this0l = _mm_unpacklo_epi8(this0, pb_zero);
++    this0h = _mm_unpackhi_epi8(this0, pb_zero);
++    this0l = _mm_mullo_epi16(this0l, pw_three);
++    this0h = _mm_mullo_epi16(this0h, pw_three);
++
++    this_1l = _mm_unpacklo_epi8(this_1, pb_zero);
++    this_1h = _mm_unpackhi_epi8(this_1, pb_zero);
++    thiscolsum_1l = _mm_add_epi16(this0l, this_1l);
++    thiscolsum_1h = _mm_add_epi16(this0h, this_1h);
++    lastcolsum_1h = _mm_bslli_si128(thiscolsum_1l, 14);
++
++    this1l = _mm_unpacklo_epi8(this1, pb_zero);
++    this1h = _mm_unpackhi_epi8(this1, pb_zero);
++    thiscolsum1l = _mm_add_epi16(this0l, this1l);
++    thiscolsum1h = _mm_add_epi16(this0h, this1h);
++    lastcolsum1h = _mm_bslli_si128(thiscolsum1l, 14);
++
++    PRAGMA_E2K("ivdep")
++    for (incol = downsampled_width; incol > 0;
++        incol -= 16, outptr0 += 32, outptr1 += 32) {
++
++      p_lastcolsum_1l = _mm_alignr_epi8(thiscolsum_1l, lastcolsum_1h, 14);
++      p_lastcolsum_1h = _mm_alignr_epi8(thiscolsum_1h, thiscolsum_1l, 14);
++      p_lastcolsum1l = _mm_alignr_epi8(thiscolsum1l, lastcolsum1h, 14);
++      p_lastcolsum1h = _mm_alignr_epi8(thiscolsum1h, thiscolsum1l, 14);
++      lastcolsum_1h = thiscolsum_1h;
++      lastcolsum1h = thiscolsum1h;
++
++      if (__builtin_expect(incol <= 16, 0)) {
++        p_nextcolsum_1l = _mm_alignr_epi8(thiscolsum_1h, thiscolsum_1l, 2);
++        p_nextcolsum_1h = _mm_shuffle_epi8(thiscolsum_1h, next_index_lastcol);
++        p_nextcolsum1l = _mm_alignr_epi8(thiscolsum1h, thiscolsum1l, 2);
++        p_nextcolsum1h = _mm_shuffle_epi8(thiscolsum1h, next_index_lastcol);
++      } else {
++#ifdef NEED_ALIGN8
++        ALIGN8_READ16(this0, src0, 1) src0_ptr += 2;
++        ALIGN8_READ16(this_1, src_1, 1) src_1_ptr += 2;
++        ALIGN8_READ16(this1, src1, 1) src1_ptr += 2;
++#else
++        this0 = VEC_LD(inptr0 + 16); inptr0 += 16;
++        this_1 = VEC_LD(inptr_1 + 16); inptr_1 += 16;
++        this1 = VEC_LD(inptr1 + 16); inptr1 += 16;
++#endif
++        this0l = _mm_unpacklo_epi8(this0, pb_zero);
++        this0h = _mm_unpackhi_epi8(this0, pb_zero);
++        this0l = _mm_mullo_epi16(this0l, pw_three);
++        this0h = _mm_mullo_epi16(this0h, pw_three);
++
++        this_1l = _mm_unpacklo_epi8(this_1, pb_zero);
++        this_1h = _mm_unpackhi_epi8(this_1, pb_zero);
++        nextcolsum_1l = _mm_add_epi16(this0l, this_1l);
++        nextcolsum_1h = _mm_add_epi16(this0h, this_1h);
++        p_nextcolsum_1l = _mm_alignr_epi8(thiscolsum_1h, thiscolsum_1l, 2);
++        p_nextcolsum_1h =
_mm_alignr_epi8(nextcolsum_1l, thiscolsum_1h, 2); ++ ++ this1l = _mm_unpacklo_epi8(this1, pb_zero); ++ this1h = _mm_unpackhi_epi8(this1, pb_zero); ++ nextcolsum1l = _mm_add_epi16(this0l, this1l); ++ nextcolsum1h = _mm_add_epi16(this0h, this1h); ++ p_nextcolsum1l = _mm_alignr_epi8(thiscolsum1h, thiscolsum1l, 2); ++ p_nextcolsum1h = _mm_alignr_epi8(nextcolsum1l, thiscolsum1h, 2); ++ } ++ ++ /* Process the upper row */ ++ ++ tmpl = _mm_mullo_epi16(thiscolsum_1l, pw_three); ++ outle = _mm_add_epi16(tmpl, p_lastcolsum_1l); ++ outle = _mm_add_epi16(outle, pw_eight); ++ outle = _mm_srli_epi16(outle, 4); ++ ++ outlo = _mm_add_epi16(tmpl, p_nextcolsum_1l); ++ outlo = _mm_add_epi16(outlo, pw_seven); ++ outlo = _mm_srli_epi16(outlo, 4); ++ ++ out = _mm_or_si128(outle, _mm_slli_epi16(outlo, 8)); ++ VEC_ST(outptr0, out); ++ ++ /* Process the lower row */ ++ ++ tmpl = _mm_mullo_epi16(thiscolsum1l, pw_three); ++ outle = _mm_add_epi16(tmpl, p_lastcolsum1l); ++ outle = _mm_add_epi16(outle, pw_eight); ++ outle = _mm_srli_epi16(outle, 4); ++ ++ outlo = _mm_add_epi16(tmpl, p_nextcolsum1l); ++ outlo = _mm_add_epi16(outlo, pw_seven); ++ outlo = _mm_srli_epi16(outlo, 4); ++ ++ out = _mm_or_si128(outle, _mm_slli_epi16(outlo, 8)); ++ VEC_ST(outptr1, out); ++ ++ if (__builtin_expect(incol <= 8, 0)) break; ++ ++ tmph = _mm_mullo_epi16(thiscolsum_1h, pw_three); ++ outhe = _mm_add_epi16(tmph, p_lastcolsum_1h); ++ outhe = _mm_add_epi16(outhe, pw_eight); ++ outhe = _mm_srli_epi16(outhe, 4); ++ ++ outho = _mm_add_epi16(tmph, p_nextcolsum_1h); ++ outho = _mm_add_epi16(outho, pw_seven); ++ outho = _mm_srli_epi16(outho, 4); ++ ++ out = _mm_or_si128(outhe, _mm_slli_epi16(outho, 8)); ++ VEC_ST(outptr0 + 16, out); ++ ++ tmph = _mm_mullo_epi16(thiscolsum1h, pw_three); ++ outhe = _mm_add_epi16(tmph, p_lastcolsum1h); ++ outhe = _mm_add_epi16(outhe, pw_eight); ++ outhe = _mm_srli_epi16(outhe, 4); ++ ++ outho = _mm_add_epi16(tmph, p_nextcolsum1h); ++ outho = _mm_add_epi16(outho, pw_seven); ++ outho = _mm_srli_epi16(outho, 4); ++ ++ out = _mm_or_si128(outhe, _mm_slli_epi16(outho, 8)); ++ VEC_ST(outptr1 + 16, out); ++ ++ thiscolsum_1l = nextcolsum_1l; thiscolsum_1h = nextcolsum_1h; ++ thiscolsum1l = nextcolsum1l; thiscolsum1h = nextcolsum1h; ++ } ++ } ++} ++ ++ ++/* These are rarely used (mainly just for decompressing YCCK images) */ ++ ++void jsimd_h2v1_upsample_e2k(int max_v_samp_factor, ++ JDIMENSION out_width, ++ JSAMPARRAY input_data, ++ JSAMPARRAY *output_data_ptr) ++{ ++ JSAMPARRAY output_data = *output_data_ptr; ++ JSAMPROW inptr, outptr; ++ int inrow, incol; ++ ++ __m128i in, inl, inh; ++#ifdef NEED_ALIGN8 ++ ALIGN8_COMMON ++ ALIGN8_VARS(src) ++#endif ++ ++ if (out_width > 0) ++ for (inrow = 0; inrow < max_v_samp_factor; inrow++) { ++ inptr = input_data[inrow]; ++ outptr = output_data[inrow]; ++#ifdef NEED_ALIGN8 ++ ALIGN8_START(inptr, src) ++#endif ++ PRAGMA_E2K("ivdep") ++ for (incol = out_width; incol > 0; ++ incol -= 32, outptr += 32) { ++#ifdef NEED_ALIGN8 ++ ALIGN8_READ16(in, src, 0) src_ptr += 2; ++#else ++ in = VEC_LD(inptr); inptr += 16; ++#endif ++ inl = _mm_unpacklo_epi8(in, in); ++ inh = _mm_unpackhi_epi8(in, in); ++ ++ VEC_ST(outptr, inl); ++ VEC_ST(outptr + 16, inh); ++ } ++ } ++} ++ ++ ++void jsimd_h2v2_upsample_e2k(int max_v_samp_factor, ++ JDIMENSION out_width, ++ JSAMPARRAY input_data, ++ JSAMPARRAY *output_data_ptr) ++{ ++ JSAMPARRAY output_data = *output_data_ptr; ++ JSAMPROW inptr, outptr0, outptr1; ++ int inrow, outrow, incol; ++ ++ __m128i in, inl, inh; ++#ifdef NEED_ALIGN8 ++ ALIGN8_COMMON ++ 
ALIGN8_VARS(src) ++#endif ++ ++ if (out_width > 0) ++ for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) { ++ inptr = input_data[inrow]; ++ outptr0 = output_data[outrow++]; ++ outptr1 = output_data[outrow++]; ++#ifdef NEED_ALIGN8 ++ ALIGN8_START(inptr, src) ++#endif ++ PRAGMA_E2K("ivdep") ++ for (incol = out_width; incol > 0; ++ incol -= 32, outptr0 += 32, outptr1 += 32) { ++#ifdef NEED_ALIGN8 ++ ALIGN8_READ16(in, src, 0) src_ptr += 2; ++#else ++ in = VEC_LD(inptr); inptr += 16; ++#endif ++ inl = _mm_unpacklo_epi8(in, in); ++ inh = _mm_unpackhi_epi8(in, in); ++ ++ VEC_ST(outptr0, inl); ++ VEC_ST(outptr1, inl); ++ VEC_ST(outptr0 + 16, inh); ++ VEC_ST(outptr1 + 16, inh); ++ } ++ } ++} +diff --git a/simd/e2k/jfdctflt-e2k.c b/simd/e2k/jfdctflt-e2k.c +new file mode 100644 +index 0000000..e3c4d94 +--- /dev/null ++++ b/simd/e2k/jfdctflt-e2k.c +@@ -0,0 +1,127 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2014, D. R. Commander. All Rights Reserved. ++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. ++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. 
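
For reference, jsimd_h2v2_fancy_upsample_e2k above is libjpeg's triangle filter: each 16-bit column sum is 3 * (nearest row) + (other row), and every output sample blends 3 * (its own column sum) with a neighboring column sum, rounded by 8 (even columns) or 7 (odd columns) before the final >> 4. A scalar sketch of the upper output row, assuming edge columns are replicated as the padding code above arranges; the names here are illustrative, not from the patch:

    /* Scalar model of one h2v2 fancy-upsampled row (illustration only). */
    static void h2v2_fancy_upper_row(const unsigned char *prev,
                                     const unsigned char *cur,
                                     unsigned char *out, int w)
    {
      int col;
      for (col = 0; col < w; col++) {
        int c0 = col > 0 ? col - 1 : 0;           /* left edge replicated */
        int c1 = col < w - 1 ? col + 1 : col;     /* right edge replicated */
        int this_sum = 3 * cur[col] + prev[col];  /* column sums, scaled x4 */
        int last_sum = 3 * cur[c0] + prev[c0];
        int next_sum = 3 * cur[c1] + prev[c1];
        out[2 * col]     = (unsigned char)((3 * this_sum + last_sum + 8) >> 4);
        out[2 * col + 1] = (unsigned char)((3 * this_sum + next_sum + 7) >> 4);
      }
    }
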
++ */ ++ ++/* FLOAT FORWARD DCT */ ++ ++#include "jsimd_e2k.h" ++ ++#define DO_FDCT(in, out) { \ ++ tmp0 = _mm_add_ps(in##0, in##7); \ ++ tmp7 = _mm_sub_ps(in##0, in##7); \ ++ tmp1 = _mm_add_ps(in##1, in##6); \ ++ tmp6 = _mm_sub_ps(in##1, in##6); \ ++ tmp2 = _mm_add_ps(in##2, in##5); \ ++ tmp5 = _mm_sub_ps(in##2, in##5); \ ++ tmp3 = _mm_add_ps(in##3, in##4); \ ++ tmp4 = _mm_sub_ps(in##3, in##4); \ ++ \ ++ /* Even part */ \ ++ \ ++ tmp10 = _mm_add_ps(tmp0, tmp3); \ ++ tmp13 = _mm_sub_ps(tmp0, tmp3); \ ++ tmp11 = _mm_add_ps(tmp1, tmp2); \ ++ tmp12 = _mm_sub_ps(tmp1, tmp2); \ ++ \ ++ out##0 = _mm_add_ps(tmp10, tmp11); \ ++ out##4 = _mm_sub_ps(tmp10, tmp11); \ ++ \ ++ z1 = _mm_mul_ps(_mm_add_ps(tmp12, tmp13), pd_f0707); \ ++ out##2 = _mm_add_ps(tmp13, z1); \ ++ out##6 = _mm_sub_ps(tmp13, z1); \ ++ \ ++ /* Odd part */ \ ++ \ ++ tmp10 = _mm_add_ps(tmp4, tmp5); \ ++ tmp11 = _mm_add_ps(tmp5, tmp6); \ ++ tmp12 = _mm_add_ps(tmp6, tmp7); \ ++ \ ++ z5 = _mm_mul_ps(_mm_sub_ps(tmp10, tmp12), pd_f0382); \ ++ z2 = _mm_add_ps(_mm_mul_ps(tmp10, pd_f0541), z5); \ ++ z4 = _mm_add_ps(_mm_mul_ps(tmp12, pd_f1306), z5); \ ++ z3 = _mm_mul_ps(tmp11, pd_f0707); \ ++ \ ++ z11 = _mm_add_ps(tmp7, z3); \ ++ z13 = _mm_sub_ps(tmp7, z3); \ ++ \ ++ out##5 = _mm_add_ps(z13, z2); \ ++ out##3 = _mm_sub_ps(z13, z2); \ ++ out##1 = _mm_add_ps(z11, z4); \ ++ out##7 = _mm_sub_ps(z11, z4); \ ++} ++ ++#define LOAD_DATA(a, b, c, d, l, i) \ ++ l##a = _mm_loadu_ps(data + a * 8 + i); \ ++ l##b = _mm_loadu_ps(data + b * 8 + i); \ ++ l##c = _mm_loadu_ps(data + c * 8 + i); \ ++ l##d = _mm_loadu_ps(data + d * 8 + i); ++ ++#define STORE_DATA(a, b, c, d, l, i) \ ++ _mm_storeu_ps(data + a * 8 + i, l##a); \ ++ _mm_storeu_ps(data + b * 8 + i, l##b); \ ++ _mm_storeu_ps(data + c * 8 + i, l##c); \ ++ _mm_storeu_ps(data + d * 8 + i, l##d); ++ ++ ++void jsimd_fdct_float_e2k(FAST_FLOAT *data) ++{ ++ __m128 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, ++ tmp10, tmp11, tmp12, tmp13, z1, z2, z3, z4, z5, z11, z13; ++ __m128 l0, l1, l2, l3, l4, l5, l6, l7; ++ __m128 h0, h1, h2, h3, h4, h5, h6, h7; ++ __m128 x0, x1, x2, x3, x4, x5, x6, x7; ++ __m128 y0, y1, y2, y3, y4, y5, y6, y7; ++ ++ /* Constants */ ++ __m128 pd_f0382 = _mm_set1_ps(0.382683433f), ++ pd_f0541 = _mm_set1_ps(0.541196100f), ++ pd_f0707 = _mm_set1_ps(0.707106781f), ++ pd_f1306 = _mm_set1_ps(1.306562965f); ++ ++ /* Pass 1: process columns */ ++ ++ LOAD_DATA(0, 1, 2, 3, x, 0) ++ LOAD_DATA(0, 1, 2, 3, y, 4) ++ TRANSPOSE_FLOAT(x0, x1, x2, x3, l0, l1, l2, l3) ++ TRANSPOSE_FLOAT(y0, y1, y2, y3, l4, l5, l6, l7) ++ DO_FDCT(l, l); ++ ++ LOAD_DATA(4, 5, 6, 7, x, 0) ++ LOAD_DATA(4, 5, 6, 7, y, 4) ++ TRANSPOSE_FLOAT(x4, x5, x6, x7, h0, h1, h2, h3) ++ TRANSPOSE_FLOAT(y4, y5, y6, y7, h4, h5, h6, h7) ++ DO_FDCT(h, h); ++ ++ /* Pass 2: process rows */ ++ ++ TRANSPOSE_FLOAT(l0, l1, l2, l3, x0, x1, x2, x3) ++ TRANSPOSE_FLOAT(h0, h1, h2, h3, x4, x5, x6, x7) ++ DO_FDCT(x, x); ++ STORE_DATA(0, 1, 2, 3, x, 0) ++ STORE_DATA(4, 5, 6, 7, x, 0) ++ ++ TRANSPOSE_FLOAT(l4, l5, l6, l7, y0, y1, y2, y3) ++ TRANSPOSE_FLOAT(h4, h5, h6, h7, y4, y5, y6, y7) ++ DO_FDCT(y, y); ++ STORE_DATA(0, 1, 2, 3, y, 4) ++ STORE_DATA(4, 5, 6, 7, y, 4) ++} +diff --git a/simd/e2k/jfdctfst-e2k.c b/simd/e2k/jfdctfst-e2k.c +new file mode 100644 +index 0000000..9e58f05 +--- /dev/null ++++ b/simd/e2k/jfdctfst-e2k.c +@@ -0,0 +1,145 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2014, D. R. Commander. All Rights Reserved. 
++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. ++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. ++ */ ++ ++/* FAST INTEGER FORWARD DCT */ ++ ++#include "jsimd_e2k.h" ++ ++ ++#define F_0_382 98 /* FIX(0.382683433) */ ++#define F_0_541 139 /* FIX(0.541196100) */ ++#define F_0_707 181 /* FIX(0.707106781) */ ++#define F_1_306 334 /* FIX(1.306562965) */ ++ ++#define CONST_BITS 8 ++#define PRE_MULTIPLY_SCALE_BITS 2 ++#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) ++ ++ ++#define DO_FDCT() { \ ++ /* Even part */ \ ++ \ ++ tmp10 = _mm_add_epi16(tmp0, tmp3); \ ++ tmp13 = _mm_sub_epi16(tmp0, tmp3); \ ++ tmp11 = _mm_add_epi16(tmp1, tmp2); \ ++ tmp12 = _mm_sub_epi16(tmp1, tmp2); \ ++ \ ++ out0 = _mm_add_epi16(tmp10, tmp11); \ ++ out4 = _mm_sub_epi16(tmp10, tmp11); \ ++ \ ++ z1 = _mm_add_epi16(tmp12, tmp13); \ ++ z1 = _mm_slli_epi16(z1, PRE_MULTIPLY_SCALE_BITS); \ ++ z1 = _mm_mulhi_epi16(z1, pw_0707); \ ++ \ ++ out2 = _mm_add_epi16(tmp13, z1); \ ++ out6 = _mm_sub_epi16(tmp13, z1); \ ++ \ ++ /* Odd part */ \ ++ \ ++ tmp10 = _mm_add_epi16(tmp4, tmp5); \ ++ tmp11 = _mm_add_epi16(tmp5, tmp6); \ ++ tmp12 = _mm_add_epi16(tmp6, tmp7); \ ++ \ ++ tmp10 = _mm_slli_epi16(tmp10, PRE_MULTIPLY_SCALE_BITS); \ ++ tmp12 = _mm_slli_epi16(tmp12, PRE_MULTIPLY_SCALE_BITS); \ ++ z5 = _mm_sub_epi16(tmp10, tmp12); \ ++ z5 = _mm_mulhi_epi16(z5, pw_0382); \ ++ \ ++ z2 = _mm_add_epi16(_mm_mulhi_epi16(tmp10, pw_0541), z5); \ ++ z4 = _mm_add_epi16(_mm_mulhi_epi16(tmp12, pw_1306), z5); \ ++ \ ++ tmp11 = _mm_slli_epi16(tmp11, PRE_MULTIPLY_SCALE_BITS); \ ++ z3 = _mm_mulhi_epi16(tmp11, pw_0707); \ ++ \ ++ z11 = _mm_add_epi16(tmp7, z3); \ ++ z13 = _mm_sub_epi16(tmp7, z3); \ ++ \ ++ out5 = _mm_add_epi16(z13, z2); \ ++ out3 = _mm_sub_epi16(z13, z2); \ ++ out1 = _mm_add_epi16(z11, z4); \ ++ out7 = _mm_sub_epi16(z11, z4); \ ++} ++ ++ ++void jsimd_fdct_ifast_e2k(DCTELEM *data) ++{ ++ __m128i row0, row1, row2, row3, row4, row5, row6, row7, ++ col0, col1, col2, col3, col4, col5, col6, col7, ++ tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13, ++ z1, z2, z3, z4, z5, z11, z13, ++ out0, out1, out2, out3, out4, out5, out6, out7; ++ ++ /* Constants */ ++ __m128i pw_0382 = _mm_set1_epi16(F_0_382 << CONST_SHIFT), ++ pw_0541 = _mm_set1_epi16(F_0_541 << CONST_SHIFT), ++ pw_0707 = _mm_set1_epi16(F_0_707 << CONST_SHIFT), ++ pw_1306 = _mm_set1_epi16(F_1_306 << CONST_SHIFT); ++ ++ /* Pass 1: process rows */ ++ ++ row0 = VEC_LD(data + 0 * 8); ++ row1 = VEC_LD(data + 1 * 8); ++ row2 = VEC_LD(data + 2 * 8); ++ row3 = VEC_LD(data + 3 * 8); ++ row4 = VEC_LD(data + 4 * 8); ++ row5 = VEC_LD(data + 5 * 8); ++ row6 = VEC_LD(data + 6 * 8); ++ row7 = VEC_LD(data + 7 * 8); ++ ++ TRANSPOSE(row, 
col); ++ ++ tmp0 = _mm_add_epi16(col0, col7); ++ tmp7 = _mm_sub_epi16(col0, col7); ++ tmp1 = _mm_add_epi16(col1, col6); ++ tmp6 = _mm_sub_epi16(col1, col6); ++ tmp2 = _mm_add_epi16(col2, col5); ++ tmp5 = _mm_sub_epi16(col2, col5); ++ tmp3 = _mm_add_epi16(col3, col4); ++ tmp4 = _mm_sub_epi16(col3, col4); ++ ++ DO_FDCT(); ++ ++ /* Pass 2: process columns */ ++ ++ TRANSPOSE(out, row); ++ ++ tmp0 = _mm_add_epi16(row0, row7); ++ tmp7 = _mm_sub_epi16(row0, row7); ++ tmp1 = _mm_add_epi16(row1, row6); ++ tmp6 = _mm_sub_epi16(row1, row6); ++ tmp2 = _mm_add_epi16(row2, row5); ++ tmp5 = _mm_sub_epi16(row2, row5); ++ tmp3 = _mm_add_epi16(row3, row4); ++ tmp4 = _mm_sub_epi16(row3, row4); ++ ++ DO_FDCT(); ++ ++ VEC_ST(data + 0 * 8, out0); ++ VEC_ST(data + 1 * 8, out1); ++ VEC_ST(data + 2 * 8, out2); ++ VEC_ST(data + 3 * 8, out3); ++ VEC_ST(data + 4 * 8, out4); ++ VEC_ST(data + 5 * 8, out5); ++ VEC_ST(data + 6 * 8, out6); ++ VEC_ST(data + 7 * 8, out7); ++} +diff --git a/simd/e2k/jfdctint-e2k.c b/simd/e2k/jfdctint-e2k.c +new file mode 100644 +index 0000000..2200852 +--- /dev/null ++++ b/simd/e2k/jfdctint-e2k.c +@@ -0,0 +1,255 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2014, 2020, D. R. Commander. All Rights Reserved. ++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. ++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. 
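
The fixed-point scheme above works because _mm_mulhi_epi16 keeps only the high 16 bits of each signed 16x16-bit product: operands are pre-shifted left by PRE_MULTIPLY_SCALE_BITS (2) and constants by CONST_SHIFT (6), so discarding the low half performs the >> CONST_BITS (8) descale. A minimal scalar check of the identity, using F_0_541 = 139 = round(0.541196100 * 256):

    #include <stdio.h>

    int main(void)
    {
      int x = 1000, F = 139;                  /* any 16-bit DCT term, F_0_541 */
      int lhs = ((x << 2) * (F << 6)) >> 16;  /* what one mulhi lane computes */
      int rhs = (x * F) >> 8;                 /* plain Q8 fixed-point product */
      printf("%d %d\n", lhs, rhs);            /* both print 542 */
      return 0;
    }
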
++ */ ++ ++/* ACCURATE INTEGER FORWARD DCT */ ++ ++#include "jsimd_e2k.h" ++ ++ ++#define F_0_298 2446 /* FIX(0.298631336) */ ++#define F_0_390 3196 /* FIX(0.390180644) */ ++#define F_0_541 4433 /* FIX(0.541196100) */ ++#define F_0_765 6270 /* FIX(0.765366865) */ ++#define F_0_899 7373 /* FIX(0.899976223) */ ++#define F_1_175 9633 /* FIX(1.175875602) */ ++#define F_1_501 12299 /* FIX(1.501321110) */ ++#define F_1_847 15137 /* FIX(1.847759065) */ ++#define F_1_961 16069 /* FIX(1.961570560) */ ++#define F_2_053 16819 /* FIX(2.053119869) */ ++#define F_2_562 20995 /* FIX(2.562915447) */ ++#define F_3_072 25172 /* FIX(3.072711026) */ ++ ++#define CONST_BITS 13 ++#define PASS1_BITS 2 ++#define DESCALE_P1 (CONST_BITS - PASS1_BITS) ++#define DESCALE_P2 (CONST_BITS + PASS1_BITS) ++ ++ ++#define DO_FDCT_COMMON(PASS) { \ ++ /* (Original) \ ++ * z1 = (tmp12 + tmp13) * 0.541196100; \ ++ * data2 = z1 + tmp13 * 0.765366865; \ ++ * data6 = z1 + tmp12 * -1.847759065; \ ++ * \ ++ * (This implementation) \ ++ * data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; \ ++ * data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); \ ++ */ \ ++ \ ++ tmp1312l = _mm_unpacklo_epi16(tmp13, tmp12); \ ++ tmp1312h = _mm_unpackhi_epi16(tmp13, tmp12); \ ++ \ ++ out2l = _mm_add_epi32(_mm_madd_epi16(tmp1312l, pw_f130_f054), pd_descale_p##PASS); \ ++ out2h = _mm_add_epi32(_mm_madd_epi16(tmp1312h, pw_f130_f054), pd_descale_p##PASS); \ ++ out6l = _mm_add_epi32(_mm_madd_epi16(tmp1312l, pw_f054_mf130), pd_descale_p##PASS); \ ++ out6h = _mm_add_epi32(_mm_madd_epi16(tmp1312h, pw_f054_mf130), pd_descale_p##PASS); \ ++ \ ++ out2l = _mm_srai_epi32(out2l, DESCALE_P##PASS); \ ++ out2h = _mm_srai_epi32(out2h, DESCALE_P##PASS); \ ++ out6l = _mm_srai_epi32(out6l, DESCALE_P##PASS); \ ++ out6h = _mm_srai_epi32(out6h, DESCALE_P##PASS); \ ++ \ ++ out2 = _mm_packs_epi32(out2l, out2h); \ ++ out6 = _mm_packs_epi32(out6l, out6h); \ ++ \ ++ /* Odd part */ \ ++ \ ++ z3 = _mm_add_epi16(tmp4, tmp6); \ ++ z4 = _mm_add_epi16(tmp5, tmp7); \ ++ \ ++ /* (Original) \ ++ * z5 = (z3 + z4) * 1.175875602; \ ++ * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; \ ++ * z3 += z5; z4 += z5; \ ++ * \ ++ * (This implementation) \ ++ * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \ ++ * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \ ++ */ \ ++ \ ++ z34l = _mm_unpacklo_epi16(z3, z4); \ ++ z34h = _mm_unpackhi_epi16(z3, z4); \ ++ \ ++ z3l = _mm_add_epi32(_mm_madd_epi16(z34l, pw_mf078_f117), pd_descale_p##PASS); \ ++ z3h = _mm_add_epi32(_mm_madd_epi16(z34h, pw_mf078_f117), pd_descale_p##PASS); \ ++ z4l = _mm_add_epi32(_mm_madd_epi16(z34l, pw_f117_f078), pd_descale_p##PASS); \ ++ z4h = _mm_add_epi32(_mm_madd_epi16(z34h, pw_f117_f078), pd_descale_p##PASS); \ ++ \ ++ /* (Original) \ ++ * z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; \ ++ * tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; \ ++ * tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; \ ++ * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; \ ++ * data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; \ ++ * data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; \ ++ * \ ++ * (This implementation) \ ++ * tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; \ ++ * tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; \ ++ * tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); \ ++ * tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); \ ++ * data7 = tmp4 + z3; data5 = tmp5 + z4; \ ++ * data3 = tmp6 + z3; data1 = tmp7 + z4; \ ++ */ \ ++ \ 
++ tmp47l = _mm_unpacklo_epi16(tmp4, tmp7); \ ++ tmp47h = _mm_unpackhi_epi16(tmp4, tmp7); \ ++ \ ++ out7l = _mm_add_epi32(_mm_madd_epi16(tmp47l, pw_mf060_mf089), z3l); \ ++ out7h = _mm_add_epi32(_mm_madd_epi16(tmp47h, pw_mf060_mf089), z3h); \ ++ out1l = _mm_add_epi32(_mm_madd_epi16(tmp47l, pw_mf089_f060), z4l); \ ++ out1h = _mm_add_epi32(_mm_madd_epi16(tmp47h, pw_mf089_f060), z4h); \ ++ \ ++ out7l = _mm_srai_epi32(out7l, DESCALE_P##PASS); \ ++ out7h = _mm_srai_epi32(out7h, DESCALE_P##PASS); \ ++ out1l = _mm_srai_epi32(out1l, DESCALE_P##PASS); \ ++ out1h = _mm_srai_epi32(out1h, DESCALE_P##PASS); \ ++ \ ++ out7 = _mm_packs_epi32(out7l, out7h); \ ++ out1 = _mm_packs_epi32(out1l, out1h); \ ++ \ ++ tmp56l = _mm_unpacklo_epi16(tmp5, tmp6); \ ++ tmp56h = _mm_unpackhi_epi16(tmp5, tmp6); \ ++ \ ++ out5l = _mm_add_epi32(_mm_madd_epi16(tmp56l, pw_mf050_mf256), z4l); \ ++ out5h = _mm_add_epi32(_mm_madd_epi16(tmp56h, pw_mf050_mf256), z4h); \ ++ out3l = _mm_add_epi32(_mm_madd_epi16(tmp56l, pw_mf256_f050), z3l); \ ++ out3h = _mm_add_epi32(_mm_madd_epi16(tmp56h, pw_mf256_f050), z3h); \ ++ \ ++ out5l = _mm_srai_epi32(out5l, DESCALE_P##PASS); \ ++ out5h = _mm_srai_epi32(out5h, DESCALE_P##PASS); \ ++ out3l = _mm_srai_epi32(out3l, DESCALE_P##PASS); \ ++ out3h = _mm_srai_epi32(out3h, DESCALE_P##PASS); \ ++ \ ++ out5 = _mm_packs_epi32(out5l, out5h); \ ++ out3 = _mm_packs_epi32(out3l, out3h); \ ++} ++ ++#define DO_FDCT_PASS1() { \ ++ /* Even part */ \ ++ \ ++ tmp10 = _mm_add_epi16(tmp0, tmp3); \ ++ tmp13 = _mm_sub_epi16(tmp0, tmp3); \ ++ tmp11 = _mm_add_epi16(tmp1, tmp2); \ ++ tmp12 = _mm_sub_epi16(tmp1, tmp2); \ ++ \ ++ out0 = _mm_add_epi16(tmp10, tmp11); \ ++ out0 = _mm_slli_epi16(out0, PASS1_BITS); \ ++ out4 = _mm_sub_epi16(tmp10, tmp11); \ ++ out4 = _mm_slli_epi16(out4, PASS1_BITS); \ ++ \ ++ DO_FDCT_COMMON(1); \ ++} ++ ++#define DO_FDCT_PASS2() { \ ++ /* Even part */ \ ++ \ ++ tmp10 = _mm_add_epi16(tmp0, tmp3); \ ++ tmp13 = _mm_sub_epi16(tmp0, tmp3); \ ++ tmp11 = _mm_add_epi16(tmp1, tmp2); \ ++ tmp12 = _mm_sub_epi16(tmp1, tmp2); \ ++ \ ++ out0 = _mm_add_epi16(tmp10, tmp11); \ ++ out0 = _mm_add_epi16(out0, pw_descale_p2x); \ ++ out0 = _mm_srai_epi16(out0, PASS1_BITS); \ ++ out4 = _mm_sub_epi16(tmp10, tmp11); \ ++ out4 = _mm_add_epi16(out4, pw_descale_p2x); \ ++ out4 = _mm_srai_epi16(out4, PASS1_BITS); \ ++ \ ++ DO_FDCT_COMMON(2); \ ++} ++ ++ ++void jsimd_fdct_islow_e2k(DCTELEM *data) ++{ ++ __m128i row0, row1, row2, row3, row4, row5, row6, row7, ++ col0, col1, col2, col3, col4, col5, col6, col7, ++ tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13, ++ tmp47l, tmp47h, tmp56l, tmp56h, tmp1312l, tmp1312h, ++ z3, z4, z34l, z34h, ++ out0, out1, out2, out3, out4, out5, out6, out7; ++ __m128i z3l, z3h, z4l, z4h, ++ out1l, out1h, out2l, out2h, out3l, out3h, out5l, out5h, out6l, out6h, ++ out7l, out7h; ++ ++ /* Constants */ ++ __m128i pw_f130_f054 = _mm_setr_epi16(__4X2(F_0_541 + F_0_765, F_0_541)), ++ pw_f054_mf130 = _mm_setr_epi16(__4X2(F_0_541, F_0_541 - F_1_847)), ++ pw_mf078_f117 = _mm_setr_epi16(__4X2(F_1_175 - F_1_961, F_1_175)), ++ pw_f117_f078 = _mm_setr_epi16(__4X2(F_1_175, F_1_175 - F_0_390)), ++ pw_mf060_mf089 = _mm_setr_epi16(__4X2(F_0_298 - F_0_899, -F_0_899)), ++ pw_mf089_f060 = _mm_setr_epi16(__4X2(-F_0_899, F_1_501 - F_0_899)), ++ pw_mf050_mf256 = _mm_setr_epi16(__4X2(F_2_053 - F_2_562, -F_2_562)), ++ pw_mf256_f050 = _mm_setr_epi16(__4X2(-F_2_562, F_3_072 - F_2_562)), ++ pw_descale_p2x = _mm_set1_epi16(1 << (PASS1_BITS - 1)), ++ pd_descale_p1 = _mm_set1_epi32(1 << (DESCALE_P1 
- 1)), ++ pd_descale_p2 = _mm_set1_epi32(1 << (DESCALE_P2 - 1)); ++ ++ /* Pass 1: process rows */ ++ ++ row0 = VEC_LD(data + 0 * 8); ++ row1 = VEC_LD(data + 1 * 8); ++ row2 = VEC_LD(data + 2 * 8); ++ row3 = VEC_LD(data + 3 * 8); ++ row4 = VEC_LD(data + 4 * 8); ++ row5 = VEC_LD(data + 5 * 8); ++ row6 = VEC_LD(data + 6 * 8); ++ row7 = VEC_LD(data + 7 * 8); ++ ++ TRANSPOSE(row, col); ++ ++ tmp0 = _mm_add_epi16(col0, col7); ++ tmp7 = _mm_sub_epi16(col0, col7); ++ tmp1 = _mm_add_epi16(col1, col6); ++ tmp6 = _mm_sub_epi16(col1, col6); ++ tmp2 = _mm_add_epi16(col2, col5); ++ tmp5 = _mm_sub_epi16(col2, col5); ++ tmp3 = _mm_add_epi16(col3, col4); ++ tmp4 = _mm_sub_epi16(col3, col4); ++ ++ DO_FDCT_PASS1(); ++ ++ /* Pass 2: process columns */ ++ ++ TRANSPOSE(out, row); ++ ++ tmp0 = _mm_add_epi16(row0, row7); ++ tmp7 = _mm_sub_epi16(row0, row7); ++ tmp1 = _mm_add_epi16(row1, row6); ++ tmp6 = _mm_sub_epi16(row1, row6); ++ tmp2 = _mm_add_epi16(row2, row5); ++ tmp5 = _mm_sub_epi16(row2, row5); ++ tmp3 = _mm_add_epi16(row3, row4); ++ tmp4 = _mm_sub_epi16(row3, row4); ++ ++ DO_FDCT_PASS2(); ++ ++ VEC_ST(data + 0 * 8, out0); ++ VEC_ST(data + 1 * 8, out1); ++ VEC_ST(data + 2 * 8, out2); ++ VEC_ST(data + 3 * 8, out3); ++ VEC_ST(data + 4 * 8, out4); ++ VEC_ST(data + 5 * 8, out5); ++ VEC_ST(data + 6 * 8, out6); ++ VEC_ST(data + 7 * 8, out7); ++} +diff --git a/simd/e2k/jidctflt-e2k.c b/simd/e2k/jidctflt-e2k.c +new file mode 100644 +index 0000000..7682965 +--- /dev/null ++++ b/simd/e2k/jidctflt-e2k.c +@@ -0,0 +1,215 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved. ++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. ++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. 
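
In DO_FDCT_COMMON above, _mm_madd_epi16 multiplies interleaved (tmp13, tmp12) pairs by constant pairs and sums each pair of products into 32 bits, which is why the classic z1-based rotation can be refactored into two independent dot products. A scalar check that the factored form matches the original, with the Q13 constants defined above (4433 = F_0_541, 6270 = F_0_765):

    #include <stdio.h>

    int main(void)
    {
      long tmp12 = 300, tmp13 = -200;                          /* sample values */
      long z1 = (tmp12 + tmp13) * 4433;                        /* original form */
      long data2_orig = z1 + tmp13 * 6270;
      long data2_madd = tmp13 * (4433 + 6270) + tmp12 * 4433;  /* one madd lane */
      printf("%ld %ld\n", data2_orig, data2_madd);             /* both -810700 */
      return 0;
    }
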
++ */ ++ ++/* FLOAT INVERSE DCT */ ++ ++#include "jsimd_e2k.h" ++ ++#define DO_IDCT(in, out) { \ ++ /* Even part */ \ ++ \ ++ tmp10 = _mm_add_ps(in##0, in##4); \ ++ tmp11 = _mm_sub_ps(in##0, in##4); \ ++ \ ++ tmp13 = _mm_add_ps(in##2, in##6); \ ++ tmp12 = _mm_sub_ps(in##2, in##6); \ ++ tmp12 = _mm_sub_ps(_mm_mul_ps(tmp12, pd_f1414), tmp13); \ ++ \ ++ tmp0 = _mm_add_ps(tmp10, tmp13); \ ++ tmp3 = _mm_sub_ps(tmp10, tmp13); \ ++ tmp1 = _mm_add_ps(tmp11, tmp12); \ ++ tmp2 = _mm_sub_ps(tmp11, tmp12); \ ++ \ ++ /* Odd part */ \ ++ \ ++ z13 = _mm_add_ps(in##5, in##3); \ ++ z10 = _mm_sub_ps(in##5, in##3); \ ++ z11 = _mm_add_ps(in##1, in##7); \ ++ z12 = _mm_sub_ps(in##1, in##7); \ ++ \ ++ tmp7 = _mm_add_ps(z11, z13); \ ++ tmp11 = _mm_sub_ps(z11, z13); \ ++ tmp11 = _mm_mul_ps(tmp11, pd_f1414); \ ++ \ ++ z5 = _mm_mul_ps(_mm_add_ps(z10, z12), pd_f1847); \ ++ tmp10 = _mm_sub_ps(z5, _mm_mul_ps(z12, pd_f1082)); \ ++ tmp12 = _mm_sub_ps(z5, _mm_mul_ps(z10, pd_f2613)); \ ++ \ ++ tmp6 = _mm_sub_ps(tmp12, tmp7); \ ++ tmp5 = _mm_sub_ps(tmp11, tmp6); \ ++ tmp4 = _mm_sub_ps(tmp10, tmp5); \ ++ \ ++ out##0 = _mm_add_ps(tmp0, tmp7); \ ++ out##7 = _mm_sub_ps(tmp0, tmp7); \ ++ out##1 = _mm_add_ps(tmp1, tmp6); \ ++ out##6 = _mm_sub_ps(tmp1, tmp6); \ ++ out##2 = _mm_add_ps(tmp2, tmp5); \ ++ out##5 = _mm_sub_ps(tmp2, tmp5); \ ++ out##3 = _mm_add_ps(tmp3, tmp4); \ ++ out##4 = _mm_sub_ps(tmp3, tmp4); \ ++} ++ ++#define QUANT_MUL(a, b, c, d, l, lo, i) \ ++ out0 = _mm_srai_epi32(_mm_unpack##lo##_epi16(col##a, col##a), 16); \ ++ out1 = _mm_srai_epi32(_mm_unpack##lo##_epi16(col##b, col##b), 16); \ ++ out2 = _mm_srai_epi32(_mm_unpack##lo##_epi16(col##c, col##c), 16); \ ++ out3 = _mm_srai_epi32(_mm_unpack##lo##_epi16(col##d, col##d), 16); \ ++ l##a = _mm_cvtepi32_ps(out0); \ ++ l##b = _mm_cvtepi32_ps(out1); \ ++ l##c = _mm_cvtepi32_ps(out2); \ ++ l##d = _mm_cvtepi32_ps(out3); \ ++ l##a = _mm_mul_ps(l##a, _mm_load_ps(dct_table + a * 8 + i)); \ ++ l##b = _mm_mul_ps(l##b, _mm_load_ps(dct_table + b * 8 + i)); \ ++ l##c = _mm_mul_ps(l##c, _mm_load_ps(dct_table + c * 8 + i)); \ ++ l##d = _mm_mul_ps(l##d, _mm_load_ps(dct_table + d * 8 + i)); ++ ++ ++void jsimd_idct_float_e2k(void *dct_table_, JCOEFPTR coef_block, ++ JSAMPARRAY output_buf, JDIMENSION output_col) ++{ ++ float *dct_table = (float *)dct_table_; ++ ++ __m128i col0, col1, col2, col3, col4, col5, col6, col7, ++ out0, out1, out2, out3, out4, out5, out6, out7, row0, row1, row2, row3; ++ __m128 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, ++ tmp10, tmp11, tmp12, tmp13, z5, z10, z11, z12, z13; ++ __m128 l0, l1, l2, l3, l4, l5, l6, l7; ++ __m128 h0, h1, h2, h3, h4, h5, h6, h7; ++ __m128 x0, x1, x2, x3, x4, x5, x6, x7; ++ __m128 y0, y1, y2, y3, y4, y5, y6, y7; ++ ++ /* Constants */ ++ __m128 pd_f1414 = _mm_set1_ps(1.414213562f), ++ pd_f1847 = _mm_set1_ps(1.847759065f), ++ pd_f1082 = _mm_set1_ps(1.082392200f), ++ pd_f2613 = _mm_set1_ps(2.613125930f); ++ ++ /* Pass 1: process columns */ ++ ++ col0 = VEC_LD(coef_block + 0 * 8); ++ col1 = VEC_LD(coef_block + 1 * 8); ++ col2 = VEC_LD(coef_block + 2 * 8); ++ col3 = VEC_LD(coef_block + 3 * 8); ++ col4 = VEC_LD(coef_block + 4 * 8); ++ col5 = VEC_LD(coef_block + 5 * 8); ++ col6 = VEC_LD(coef_block + 6 * 8); ++ col7 = VEC_LD(coef_block + 7 * 8); ++ ++ out1 = _mm_or_si128(col1, col2); ++ out2 = _mm_or_si128(col3, col4); ++ out1 = _mm_or_si128(out1, out2); ++ out3 = _mm_or_si128(col5, col6); ++ out3 = _mm_or_si128(out3, col7); ++ out1 = _mm_or_si128(out1, out3); ++ ++ if (VEC_ISZERO(out1)) { ++ /* AC terms all zero */ ++ ++ out0 = 
_mm_srai_epi32(_mm_unpacklo_epi16(col0, col0), 16);
++    out1 = _mm_srai_epi32(_mm_unpackhi_epi16(col0, col0), 16);
++    tmp0 = _mm_cvtepi32_ps(out0);
++    tmp1 = _mm_cvtepi32_ps(out1);
++    tmp0 = _mm_mul_ps(tmp0, _mm_load_ps(dct_table));
++    tmp1 = _mm_mul_ps(tmp1, _mm_load_ps(dct_table + 4));
++
++    l0 = h0 = _mm_shuffle_ps(tmp0, tmp0, 0x00);
++    l1 = h1 = _mm_shuffle_ps(tmp0, tmp0, 0x55);
++    l2 = h2 = _mm_shuffle_ps(tmp0, tmp0, 0xaa);
++    l3 = h3 = _mm_shuffle_ps(tmp0, tmp0, 0xff);
++    l4 = h4 = _mm_shuffle_ps(tmp1, tmp1, 0x00);
++    l5 = h5 = _mm_shuffle_ps(tmp1, tmp1, 0x55);
++    l6 = h6 = _mm_shuffle_ps(tmp1, tmp1, 0xaa);
++    l7 = h7 = _mm_shuffle_ps(tmp1, tmp1, 0xff);
++
++  } else {
++
++    QUANT_MUL(0, 2, 4, 6, l, lo, 0)
++    QUANT_MUL(1, 3, 5, 7, l, lo, 0)
++    DO_IDCT(l, x);
++
++    QUANT_MUL(0, 2, 4, 6, h, hi, 4)
++    QUANT_MUL(1, 3, 5, 7, h, hi, 4)
++    DO_IDCT(h, y);
++
++    TRANSPOSE_FLOAT(x0, x1, x2, x3, l0, l1, l2, l3)
++    TRANSPOSE_FLOAT(x4, x5, x6, x7, h0, h1, h2, h3)
++    TRANSPOSE_FLOAT(y0, y1, y2, y3, l4, l5, l6, l7)
++    TRANSPOSE_FLOAT(y4, y5, y6, y7, h4, h5, h6, h7)
++  }
++
++  /* Pass 2: process rows */
++
++  DO_IDCT(l, x);
++  DO_IDCT(h, y);
++
++#ifdef JSIMD_SAME_ROUNDING
++#define OUT_ROUND(i) \
++  tmp0 = _mm_add_ps(_mm_mul_ps(x##i, pd_f0125), pd_cj_rnd); \
++  tmp1 = _mm_add_ps(_mm_mul_ps(y##i, pd_f0125), pd_cj_rnd); \
++  out##i = _mm_packs_epi32(_mm_cvttps_epi32(tmp0), _mm_cvttps_epi32(tmp1));
++
++  {
++    __m128 pd_cj_rnd = _mm_set1_ps(0.5f + CENTERJSAMPLE),
++      pd_f0125 = _mm_set1_ps(0.125f);
++
++    OUT_ROUND(0) OUT_ROUND(1)
++    OUT_ROUND(2) OUT_ROUND(3)
++    OUT_ROUND(4) OUT_ROUND(5)
++    OUT_ROUND(6) OUT_ROUND(7)
++  }
++  row0 = _mm_packus_epi16(out0, out1);
++  row1 = _mm_packus_epi16(out2, out3);
++  row2 = _mm_packus_epi16(out4, out5);
++  row3 = _mm_packus_epi16(out6, out7);
++
++  TRANSPOSE8(row, col) TRANSPOSE8(col, row) TRANSPOSE8(row, col)
++#else /* faster, slightly different rounding */
++#define OUT_ROUND(i, a, b) out##i = _mm_blendv_epi8( \
++  _mm_slli_epi32(_mm_castps_si128(_mm_add_ps(b, pd_round)), 16), \
++  _mm_castps_si128(_mm_add_ps(a, pd_round)), pd_mask);
++
++  {
++    __m128i pd_mask = _mm_set1_epi32(0xffff);
++    __m128 pd_round = _mm_set1_ps((3 << 22 | CENTERJSAMPLE) * 8);
++
++    OUT_ROUND(0, x0, x4) OUT_ROUND(1, y0, y4)
++    OUT_ROUND(2, x1, x5) OUT_ROUND(3, y1, y5)
++    OUT_ROUND(4, x2, x6) OUT_ROUND(5, y2, y6)
++    OUT_ROUND(6, x3, x7) OUT_ROUND(7, y3, y7)
++  }
++  row0 = _mm_packus_epi16(out0, out1);
++  row1 = _mm_packus_epi16(out2, out3);
++  row2 = _mm_packus_epi16(out4, out5);
++  row3 = _mm_packus_epi16(out6, out7);
++
++  TRANSPOSE8(row, out) TRANSPOSE8(out, col)
++#endif
++  VEC_STL(output_buf[0] + output_col, col0);
++  VEC_STH(output_buf[1] + output_col, col0);
++  VEC_STL(output_buf[2] + output_col, col1);
++  VEC_STH(output_buf[3] + output_col, col1);
++  VEC_STL(output_buf[4] + output_col, col2);
++  VEC_STH(output_buf[5] + output_col, col2);
++  VEC_STL(output_buf[6] + output_col, col3);
++  VEC_STH(output_buf[7] + output_col, col3);
++}
+diff --git a/simd/e2k/jidctfst-e2k.c b/simd/e2k/jidctfst-e2k.c
+new file mode 100644
+index 0000000..18bc425
+--- /dev/null
++++ b/simd/e2k/jidctfst-e2k.c
+@@ -0,0 +1,187 @@
++/*
++ * Elbrus optimizations for libjpeg-turbo
++ *
++ * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved.
++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd
++ *
++ * This software is provided 'as-is', without any express or implied
++ * warranty. In no event will the authors be held liable for any damages
++ * arising from the use of this software.
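
The second OUT_ROUND variant above is the magic-number float-to-int trick: pd_round = (3 << 22 | CENTERJSAMPLE) * 8 equals 1.5 * 2^26 plus the centering bias times 8, so after the addition the float's unit in the last place is 8, the pending divide-by-8 descale happens during rounding, and the low 16 bits of the float's bit pattern hold the centered sample that the pd_mask/blendv step extracts. A standalone model, assuming an 8-bit build (CENTERJSAMPLE = 128) and an in-range input:

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
      float magic = (float)((3 << 22 | 128) * 8);  /* pd_round, CENTERJSAMPLE = 128 */
      float x = 333.0f;                            /* IDCT output, still scaled by 8 */
      float sum = x + magic;                       /* rounds x to a multiple of 8 */
      unsigned bits;
      memcpy(&bits, &sum, sizeof bits);
      printf("%u\n", bits & 0xffff);               /* 333/8 -> 42, +128 -> 170 */
      return 0;
    }
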
++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. ++ */ ++ ++/* FAST INTEGER INVERSE DCT */ ++ ++#include "jsimd_e2k.h" ++ ++ ++#define F_1_082 277 /* FIX(1.082392200) */ ++#define F_1_414 362 /* FIX(1.414213562) */ ++#define F_1_847 473 /* FIX(1.847759065) */ ++#define F_2_613 669 /* FIX(2.613125930) */ ++#define F_1_613 (F_2_613 - 256) /* FIX(2.613125930) - FIX(1) */ ++ ++#define CONST_BITS 8 ++#define PASS1_BITS 2 ++#define PRE_MULTIPLY_SCALE_BITS 2 ++#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) ++ ++ ++#define DO_IDCT(in) { \ ++ /* Even part */ \ ++ \ ++ tmp10 = _mm_add_epi16(in##0, in##4); \ ++ tmp11 = _mm_sub_epi16(in##0, in##4); \ ++ tmp13 = _mm_add_epi16(in##2, in##6); \ ++ \ ++ tmp12 = _mm_sub_epi16(in##2, in##6); \ ++ tmp12 = _mm_slli_epi16(tmp12, PRE_MULTIPLY_SCALE_BITS); \ ++ tmp12 = _mm_mulhi_epi16(tmp12, pw_F1414); \ ++ tmp12 = _mm_sub_epi16(tmp12, tmp13); \ ++ \ ++ tmp0 = _mm_add_epi16(tmp10, tmp13); \ ++ tmp3 = _mm_sub_epi16(tmp10, tmp13); \ ++ tmp1 = _mm_add_epi16(tmp11, tmp12); \ ++ tmp2 = _mm_sub_epi16(tmp11, tmp12); \ ++ \ ++ /* Odd part */ \ ++ \ ++ z13 = _mm_add_epi16(in##5, in##3); \ ++ z10 = _mm_sub_epi16(in##5, in##3); \ ++ z10s = _mm_slli_epi16(z10, PRE_MULTIPLY_SCALE_BITS); \ ++ z11 = _mm_add_epi16(in##1, in##7); \ ++ z12s = _mm_sub_epi16(in##1, in##7); \ ++ z12s = _mm_slli_epi16(z12s, PRE_MULTIPLY_SCALE_BITS); \ ++ \ ++ tmp11 = _mm_sub_epi16(z11, z13); \ ++ tmp11 = _mm_slli_epi16(tmp11, PRE_MULTIPLY_SCALE_BITS); \ ++ tmp11 = _mm_mulhi_epi16(tmp11, pw_F1414); \ ++ \ ++ tmp7 = _mm_add_epi16(z11, z13); \ ++ \ ++ /* To avoid overflow... 
\ ++ * \ ++ * (Original) \ ++ * tmp12 = -2.613125930 * z10 + z5; \ ++ * \ ++ * (This implementation) \ ++ * tmp12 = (-1.613125930 - 1) * z10 + z5; \ ++ * = -1.613125930 * z10 - z10 + z5; \ ++ */ \ ++ \ ++ z5 = _mm_add_epi16(z10s, z12s); \ ++ z5 = _mm_mulhi_epi16(z5, pw_F1847); \ ++ \ ++ tmp10 = _mm_mulhi_epi16(z12s, pw_F1082); \ ++ tmp10 = _mm_sub_epi16(tmp10, z5); \ ++ tmp12 = _mm_add_epi16(_mm_mulhi_epi16(z10s, pw_MF1613), z5); \ ++ tmp12 = _mm_sub_epi16(tmp12, z10); \ ++ \ ++ tmp6 = _mm_sub_epi16(tmp12, tmp7); \ ++ tmp5 = _mm_sub_epi16(tmp11, tmp6); \ ++ tmp4 = _mm_add_epi16(tmp10, tmp5); \ ++ \ ++ out0 = _mm_add_epi16(tmp0, tmp7); \ ++ out1 = _mm_add_epi16(tmp1, tmp6); \ ++ out2 = _mm_add_epi16(tmp2, tmp5); \ ++ out3 = _mm_sub_epi16(tmp3, tmp4); \ ++ out4 = _mm_add_epi16(tmp3, tmp4); \ ++ out5 = _mm_sub_epi16(tmp2, tmp5); \ ++ out6 = _mm_sub_epi16(tmp1, tmp6); \ ++ out7 = _mm_sub_epi16(tmp0, tmp7); \ ++} ++ ++ ++void jsimd_idct_ifast_e2k(void *dct_table_, JCOEFPTR coef_block, ++ JSAMPARRAY output_buf, JDIMENSION output_col) ++{ ++ short *dct_table = (short *)dct_table_; ++ ++ __m128i row0, row1, row2, row3, row4, row5, row6, row7, ++ col0, col1, col2, col3, col4, col5, col6, col7, ++ quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7, ++ tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13, ++ z5, z10, z10s, z11, z12s, z13, ++ out0, out1, out2, out3, out4, out5, out6, out7; ++ ++ /* Constants */ ++ __m128i pw_F1414 = _mm_set1_epi16(F_1_414 << CONST_SHIFT), ++ pw_F1847 = _mm_set1_epi16(F_1_847 << CONST_SHIFT), ++ pw_MF1613 = _mm_set1_epi16(-F_1_613 << CONST_SHIFT), ++ pw_F1082 = _mm_set1_epi16(F_1_082 << CONST_SHIFT); ++ ++ /* Pass 1: process columns */ ++ ++ col0 = VEC_LD(coef_block + 0 * 8); ++ col1 = VEC_LD(coef_block + 1 * 8); ++ col2 = VEC_LD(coef_block + 2 * 8); ++ col3 = VEC_LD(coef_block + 3 * 8); ++ col4 = VEC_LD(coef_block + 4 * 8); ++ col5 = VEC_LD(coef_block + 5 * 8); ++ col6 = VEC_LD(coef_block + 6 * 8); ++ col7 = VEC_LD(coef_block + 7 * 8); ++ ++ tmp1 = _mm_or_si128(col1, col2); ++ tmp2 = _mm_or_si128(col3, col4); ++ tmp1 = _mm_or_si128(tmp1, tmp2); ++ tmp3 = _mm_or_si128(col5, col6); ++ tmp3 = _mm_or_si128(tmp3, col7); ++ tmp1 = _mm_or_si128(tmp1, tmp3); ++ ++ quant0 = VEC_LD(dct_table); ++ col0 = _mm_mullo_epi16(col0, quant0); ++ ++ if (VEC_ISZERO(tmp1)) { ++ /* AC terms all zero */ ++ ++ IDCT_SPLAT8(col0); ++ ++ } else { ++ ++ quant1 = VEC_LD(dct_table + 1 * 8); ++ quant2 = VEC_LD(dct_table + 2 * 8); ++ quant3 = VEC_LD(dct_table + 3 * 8); ++ quant4 = VEC_LD(dct_table + 4 * 8); ++ quant5 = VEC_LD(dct_table + 5 * 8); ++ quant6 = VEC_LD(dct_table + 6 * 8); ++ quant7 = VEC_LD(dct_table + 7 * 8); ++ ++ col1 = _mm_mullo_epi16(col1, quant1); ++ col2 = _mm_mullo_epi16(col2, quant2); ++ col3 = _mm_mullo_epi16(col3, quant3); ++ col4 = _mm_mullo_epi16(col4, quant4); ++ col5 = _mm_mullo_epi16(col5, quant5); ++ col6 = _mm_mullo_epi16(col6, quant6); ++ col7 = _mm_mullo_epi16(col7, quant7); ++ ++ DO_IDCT(col); ++ ++ TRANSPOSE(out, row); ++ } ++ ++ /* Pass 2: process rows */ ++ ++ DO_IDCT(row); ++ ++ out0 = _mm_srai_epi16(out0, PASS1_BITS + 3); ++ out1 = _mm_srai_epi16(out1, PASS1_BITS + 3); ++ out2 = _mm_srai_epi16(out2, PASS1_BITS + 3); ++ out3 = _mm_srai_epi16(out3, PASS1_BITS + 3); ++ out4 = _mm_srai_epi16(out4, PASS1_BITS + 3); ++ out5 = _mm_srai_epi16(out5, PASS1_BITS + 3); ++ out6 = _mm_srai_epi16(out6, PASS1_BITS + 3); ++ out7 = _mm_srai_epi16(out7, PASS1_BITS + 3); ++ ++ IDCT_SAVE(); ++} +diff --git a/simd/e2k/jidctint-e2k.c 
b/simd/e2k/jidctint-e2k.c +new file mode 100644 +index 0000000..7bb79c0 +--- /dev/null ++++ b/simd/e2k/jidctint-e2k.c +@@ -0,0 +1,294 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2014-2015, 2020, D. R. Commander. All Rights Reserved. ++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. ++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. ++ */ ++ ++/* ACCURATE INTEGER INVERSE DCT */ ++ ++#include "jsimd_e2k.h" ++ ++ ++#define F_0_298 2446 /* FIX(0.298631336) */ ++#define F_0_390 3196 /* FIX(0.390180644) */ ++#define F_0_541 4433 /* FIX(0.541196100) */ ++#define F_0_765 6270 /* FIX(0.765366865) */ ++#define F_0_899 7373 /* FIX(0.899976223) */ ++#define F_1_175 9633 /* FIX(1.175875602) */ ++#define F_1_501 12299 /* FIX(1.501321110) */ ++#define F_1_847 15137 /* FIX(1.847759065) */ ++#define F_1_961 16069 /* FIX(1.961570560) */ ++#define F_2_053 16819 /* FIX(2.053119869) */ ++#define F_2_562 20995 /* FIX(2.562915447) */ ++#define F_3_072 25172 /* FIX(3.072711026) */ ++ ++#define CONST_BITS 13 ++#define PASS1_BITS 2 ++#define DESCALE_P1 (CONST_BITS - PASS1_BITS) ++#define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3) ++ ++ ++#define DO_IDCT(in, PASS) { \ ++ /* Even part \ ++ * \ ++ * (Original) \ ++ * z1 = (z2 + z3) * 0.541196100; \ ++ * tmp2 = z1 + z3 * -1.847759065; \ ++ * tmp3 = z1 + z2 * 0.765366865; \ ++ * \ ++ * (This implementation) \ ++ * tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); \ ++ * tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; \ ++ */ \ ++ \ ++ in##26l = _mm_unpacklo_epi16(in##2, in##6); \ ++ in##26h = _mm_unpackhi_epi16(in##2, in##6); \ ++ \ ++ tmp3l = _mm_madd_epi16(in##26l, pw_f130_f054); \ ++ tmp3h = _mm_madd_epi16(in##26h, pw_f130_f054); \ ++ tmp2l = _mm_madd_epi16(in##26l, pw_f054_mf130); \ ++ tmp2h = _mm_madd_epi16(in##26h, pw_f054_mf130); \ ++ \ ++ tmp0 = _mm_add_epi16(in##0, in##4); \ ++ tmp1 = _mm_sub_epi16(in##0, in##4); \ ++ \ ++ tmp0l = _mm_unpacklo_epi16(pw_zero, tmp0); \ ++ tmp0h = _mm_unpackhi_epi16(pw_zero, tmp0); \ ++ tmp0l = _mm_srai_epi32(tmp0l, 16 - CONST_BITS); \ ++ tmp0h = _mm_srai_epi32(tmp0h, 16 - CONST_BITS); \ ++ tmp0l = _mm_add_epi32(tmp0l, pd_descale_p##PASS); \ ++ tmp0h = _mm_add_epi32(tmp0h, pd_descale_p##PASS); \ ++ \ ++ tmp10l = _mm_add_epi32(tmp0l, tmp3l); \ ++ tmp10h = _mm_add_epi32(tmp0h, tmp3h); \ ++ tmp13l = _mm_sub_epi32(tmp0l, tmp3l); \ ++ tmp13h = _mm_sub_epi32(tmp0h, tmp3h); \ ++ \ ++ tmp1l = _mm_unpacklo_epi16(pw_zero, tmp1); \ ++ tmp1h = _mm_unpackhi_epi16(pw_zero, tmp1); \ ++ tmp1l = _mm_srai_epi32(tmp1l, 16 - CONST_BITS); \ ++ tmp1h = _mm_srai_epi32(tmp1h, 16 - CONST_BITS); \ ++ tmp1l = _mm_add_epi32(tmp1l, pd_descale_p##PASS); \ ++ tmp1h = 
_mm_add_epi32(tmp1h, pd_descale_p##PASS); \ ++ \ ++ tmp11l = _mm_add_epi32(tmp1l, tmp2l); \ ++ tmp11h = _mm_add_epi32(tmp1h, tmp2h); \ ++ tmp12l = _mm_sub_epi32(tmp1l, tmp2l); \ ++ tmp12h = _mm_sub_epi32(tmp1h, tmp2h); \ ++ \ ++ /* Odd part */ \ ++ \ ++ z3 = _mm_add_epi16(in##3, in##7); \ ++ z4 = _mm_add_epi16(in##1, in##5); \ ++ \ ++ /* (Original) \ ++ * z5 = (z3 + z4) * 1.175875602; \ ++ * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; \ ++ * z3 += z5; z4 += z5; \ ++ * \ ++ * (This implementation) \ ++ * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \ ++ * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \ ++ */ \ ++ \ ++ z34l = _mm_unpacklo_epi16(z3, z4); \ ++ z34h = _mm_unpackhi_epi16(z3, z4); \ ++ \ ++ z3l = _mm_madd_epi16(z34l, pw_mf078_f117); \ ++ z3h = _mm_madd_epi16(z34h, pw_mf078_f117); \ ++ z4l = _mm_madd_epi16(z34l, pw_f117_f078); \ ++ z4h = _mm_madd_epi16(z34h, pw_f117_f078); \ ++ \ ++ /* (Original) \ ++ * z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; \ ++ * tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; \ ++ * tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; \ ++ * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; \ ++ * tmp0 += z1 + z3; tmp1 += z2 + z4; \ ++ * tmp2 += z2 + z3; tmp3 += z1 + z4; \ ++ * \ ++ * (This implementation) \ ++ * tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; \ ++ * tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; \ ++ * tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); \ ++ * tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); \ ++ * tmp0 += z3; tmp1 += z4; \ ++ * tmp2 += z3; tmp3 += z4; \ ++ */ \ ++ \ ++ in##71l = _mm_unpacklo_epi16(in##7, in##1); \ ++ in##71h = _mm_unpackhi_epi16(in##7, in##1); \ ++ \ ++ tmp0l = _mm_add_epi32(_mm_madd_epi16(in##71l, pw_mf060_mf089), z3l); \ ++ tmp0h = _mm_add_epi32(_mm_madd_epi16(in##71h, pw_mf060_mf089), z3h); \ ++ tmp3l = _mm_add_epi32(_mm_madd_epi16(in##71l, pw_mf089_f060), z4l); \ ++ tmp3h = _mm_add_epi32(_mm_madd_epi16(in##71h, pw_mf089_f060), z4h); \ ++ \ ++ in##53l = _mm_unpacklo_epi16(in##5, in##3); \ ++ in##53h = _mm_unpackhi_epi16(in##5, in##3); \ ++ \ ++ tmp1l = _mm_add_epi32(_mm_madd_epi16(in##53l, pw_mf050_mf256), z4l); \ ++ tmp1h = _mm_add_epi32(_mm_madd_epi16(in##53h, pw_mf050_mf256), z4h); \ ++ tmp2l = _mm_add_epi32(_mm_madd_epi16(in##53l, pw_mf256_f050), z3l); \ ++ tmp2h = _mm_add_epi32(_mm_madd_epi16(in##53h, pw_mf256_f050), z3h); \ ++ \ ++ /* Final output stage */ \ ++ \ ++ out0l = _mm_add_epi32(tmp10l, tmp3l); \ ++ out0h = _mm_add_epi32(tmp10h, tmp3h); \ ++ out7l = _mm_sub_epi32(tmp10l, tmp3l); \ ++ out7h = _mm_sub_epi32(tmp10h, tmp3h); \ ++ \ ++ out0l = _mm_srai_epi32(out0l, DESCALE_P##PASS); \ ++ out0h = _mm_srai_epi32(out0h, DESCALE_P##PASS); \ ++ out7l = _mm_srai_epi32(out7l, DESCALE_P##PASS); \ ++ out7h = _mm_srai_epi32(out7h, DESCALE_P##PASS); \ ++ \ ++ out0 = _mm_packs_epi32(out0l, out0h); \ ++ out7 = _mm_packs_epi32(out7l, out7h); \ ++ \ ++ out1l = _mm_add_epi32(tmp11l, tmp2l); \ ++ out1h = _mm_add_epi32(tmp11h, tmp2h); \ ++ out6l = _mm_sub_epi32(tmp11l, tmp2l); \ ++ out6h = _mm_sub_epi32(tmp11h, tmp2h); \ ++ \ ++ out1l = _mm_srai_epi32(out1l, DESCALE_P##PASS); \ ++ out1h = _mm_srai_epi32(out1h, DESCALE_P##PASS); \ ++ out6l = _mm_srai_epi32(out6l, DESCALE_P##PASS); \ ++ out6h = _mm_srai_epi32(out6h, DESCALE_P##PASS); \ ++ \ ++ out1 = _mm_packs_epi32(out1l, out1h); \ ++ out6 = _mm_packs_epi32(out6l, out6h); \ ++ \ ++ out2l = _mm_add_epi32(tmp12l, tmp1l); \ ++ out2h = _mm_add_epi32(tmp12h, tmp1h); \ ++ out5l 
= _mm_sub_epi32(tmp12l, tmp1l); \ ++ out5h = _mm_sub_epi32(tmp12h, tmp1h); \ ++ \ ++ out2l = _mm_srai_epi32(out2l, DESCALE_P##PASS); \ ++ out2h = _mm_srai_epi32(out2h, DESCALE_P##PASS); \ ++ out5l = _mm_srai_epi32(out5l, DESCALE_P##PASS); \ ++ out5h = _mm_srai_epi32(out5h, DESCALE_P##PASS); \ ++ \ ++ out2 = _mm_packs_epi32(out2l, out2h); \ ++ out5 = _mm_packs_epi32(out5l, out5h); \ ++ \ ++ out3l = _mm_add_epi32(tmp13l, tmp0l); \ ++ out3h = _mm_add_epi32(tmp13h, tmp0h); \ ++ out4l = _mm_sub_epi32(tmp13l, tmp0l); \ ++ out4h = _mm_sub_epi32(tmp13h, tmp0h); \ ++ \ ++ out3l = _mm_srai_epi32(out3l, DESCALE_P##PASS); \ ++ out3h = _mm_srai_epi32(out3h, DESCALE_P##PASS); \ ++ out4l = _mm_srai_epi32(out4l, DESCALE_P##PASS); \ ++ out4h = _mm_srai_epi32(out4h, DESCALE_P##PASS); \ ++ \ ++ out3 = _mm_packs_epi32(out3l, out3h); \ ++ out4 = _mm_packs_epi32(out4l, out4h); \ ++} ++ ++ ++void jsimd_idct_islow_e2k(void *dct_table_, JCOEFPTR coef_block, ++ JSAMPARRAY output_buf, JDIMENSION output_col) ++{ ++ short *dct_table = (short *)dct_table_; ++ ++ __m128i row0, row1, row2, row3, row4, row5, row6, row7, ++ col0, col1, col2, col3, col4, col5, col6, col7, ++ quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7, ++ tmp0, tmp1, tmp2, tmp3, z3, z4, ++ z34l, z34h, col71l, col71h, col26l, col26h, col53l, col53h, ++ row71l, row71h, row26l, row26h, row53l, row53h, ++ out0, out1, out2, out3, out4, out5, out6, out7; ++ __m128i tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h, ++ tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h, ++ z3l, z3h, z4l, z4h, ++ out0l, out0h, out1l, out1h, out2l, out2h, out3l, out3h, out4l, out4h, ++ out5l, out5h, out6l, out6h, out7l, out7h; ++ ++ /* Constants */ ++ __m128i pw_zero = _mm_setzero_si128(), ++ pw_f130_f054 = _mm_setr_epi16(__4X2(F_0_541 + F_0_765, F_0_541)), ++ pw_f054_mf130 = _mm_setr_epi16(__4X2(F_0_541, F_0_541 - F_1_847)), ++ pw_mf078_f117 = _mm_setr_epi16(__4X2(F_1_175 - F_1_961, F_1_175)), ++ pw_f117_f078 = _mm_setr_epi16(__4X2(F_1_175, F_1_175 - F_0_390)), ++ pw_mf060_mf089 = _mm_setr_epi16(__4X2(F_0_298 - F_0_899, -F_0_899)), ++ pw_mf089_f060 = _mm_setr_epi16(__4X2(-F_0_899, F_1_501 - F_0_899)), ++ pw_mf050_mf256 = _mm_setr_epi16(__4X2(F_2_053 - F_2_562, -F_2_562)), ++ pw_mf256_f050 = _mm_setr_epi16(__4X2(-F_2_562, F_3_072 - F_2_562)), ++ pd_descale_p1 = _mm_set1_epi32(1 << (DESCALE_P1 - 1)), ++ pd_descale_p2 = _mm_set1_epi32(1 << (DESCALE_P2 - 1)); ++ ++ /* Pass 1: process columns */ ++ ++ col0 = VEC_LD(coef_block + 0 * 8); ++ col1 = VEC_LD(coef_block + 1 * 8); ++ col2 = VEC_LD(coef_block + 2 * 8); ++ col3 = VEC_LD(coef_block + 3 * 8); ++ col4 = VEC_LD(coef_block + 4 * 8); ++ col5 = VEC_LD(coef_block + 5 * 8); ++ col6 = VEC_LD(coef_block + 6 * 8); ++ col7 = VEC_LD(coef_block + 7 * 8); ++ ++ tmp1 = _mm_or_si128(col1, col2); ++ tmp2 = _mm_or_si128(col3, col4); ++ tmp1 = _mm_or_si128(tmp1, tmp2); ++ tmp3 = _mm_or_si128(col5, col6); ++ tmp3 = _mm_or_si128(tmp3, col7); ++ tmp1 = _mm_or_si128(tmp1, tmp3); ++ ++ quant0 = VEC_LD(dct_table); ++ col0 = _mm_mullo_epi16(col0, quant0); ++ ++ if (VEC_ISZERO(tmp1)) { ++ /* AC terms all zero */ ++ ++ col0 = _mm_slli_epi16(col0, PASS1_BITS); ++ IDCT_SPLAT8(col0); ++ ++ } else { ++ ++ quant1 = VEC_LD(dct_table + 1 * 8); ++ quant2 = VEC_LD(dct_table + 2 * 8); ++ quant3 = VEC_LD(dct_table + 3 * 8); ++ quant4 = VEC_LD(dct_table + 4 * 8); ++ quant5 = VEC_LD(dct_table + 5 * 8); ++ quant6 = VEC_LD(dct_table + 6 * 8); ++ quant7 = VEC_LD(dct_table + 7 * 8); ++ ++ col1 = _mm_mullo_epi16(col1, quant1); ++ col2 = 
_mm_mullo_epi16(col2, quant2); ++ col3 = _mm_mullo_epi16(col3, quant3); ++ col4 = _mm_mullo_epi16(col4, quant4); ++ col5 = _mm_mullo_epi16(col5, quant5); ++ col6 = _mm_mullo_epi16(col6, quant6); ++ col7 = _mm_mullo_epi16(col7, quant7); ++ ++ DO_IDCT(col, 1); ++ ++ TRANSPOSE(out, row); ++ } ++ ++ /* Pass 2: process rows */ ++ ++ DO_IDCT(row, 2); ++ ++ IDCT_SAVE(); ++} +diff --git a/simd/e2k/jquantf-e2k.c b/simd/e2k/jquantf-e2k.c +new file mode 100644 +index 0000000..106e99a +--- /dev/null ++++ b/simd/e2k/jquantf-e2k.c +@@ -0,0 +1,121 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved. ++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. ++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. ++ */ ++ ++/* FLOAT QUANTIZATION AND SAMPLE CONVERSION */ ++ ++#include "jsimd_e2k.h" ++ ++#define LOAD_ROW(row) in##row = VEC_LD8(sample_data[row] + start_col) ++#define STORE_ROW(i) \ ++ in0 = _mm_unpacklo_epi16(out##i, pb_zero); \ ++ in1 = _mm_unpackhi_epi16(out##i, pb_zero); \ ++ in0 = _mm_sub_epi32(in0, pd_cj); \ ++ in1 = _mm_sub_epi32(in1, pd_cj); \ ++ _mm_storeu_ps(workspace + i * 8, _mm_cvtepi32_ps(in0)); \ ++ _mm_storeu_ps(workspace + i * 8 + 4, _mm_cvtepi32_ps(in1)); ++ ++void jsimd_convsamp_float_e2k(JSAMPARRAY sample_data, JDIMENSION start_col, ++ FAST_FLOAT *workspace) ++{ ++ __m128i in0, in1, in2, in3, in4, in5, in6, in7; ++ __m128i out0, out1, out2, out3, out4, out5, out6, out7; ++ ++ /* Constants */ ++ __m128i pd_cj = _mm_set1_epi32(CENTERJSAMPLE), ++ pb_zero = _mm_setzero_si128(); ++ ++ LOAD_ROW(0); ++ LOAD_ROW(1); ++ LOAD_ROW(2); ++ LOAD_ROW(3); ++ LOAD_ROW(4); ++ LOAD_ROW(5); ++ LOAD_ROW(6); ++ LOAD_ROW(7); ++ ++ out0 = _mm_unpacklo_epi8(in0, pb_zero); ++ out1 = _mm_unpacklo_epi8(in1, pb_zero); ++ out2 = _mm_unpacklo_epi8(in2, pb_zero); ++ out3 = _mm_unpacklo_epi8(in3, pb_zero); ++ out4 = _mm_unpacklo_epi8(in4, pb_zero); ++ out5 = _mm_unpacklo_epi8(in5, pb_zero); ++ out6 = _mm_unpacklo_epi8(in6, pb_zero); ++ out7 = _mm_unpacklo_epi8(in7, pb_zero); ++ ++ STORE_ROW(0) ++ STORE_ROW(1) ++ STORE_ROW(2) ++ STORE_ROW(3) ++ STORE_ROW(4) ++ STORE_ROW(5) ++ STORE_ROW(6) ++ STORE_ROW(7) ++} ++ ++void jsimd_quantize_float_e2k(JCOEFPTR coef_block, FAST_FLOAT *divisors, ++ FAST_FLOAT *workspace) ++{ ++ int i = 0; ++ __m128 row0, row1, row2, row3, recip0, recip1, recip2, recip3; ++ __m128i out0, out1; ++#ifdef JSIMD_SAME_ROUNDING ++ __m128 pd_f16k5 = _mm_set1_ps(16384.5f); ++ __m128i pw_m16k = _mm_set1_epi16(-16384); ++#endif ++ ++ PRAGMA_E2K("ivdep") ++ for (; i < 4; i++, workspace += 16, divisors += 16, coef_block += 16) { ++ row0 = _mm_loadu_ps(workspace + 0 * 4); ++ row1 = _mm_loadu_ps(workspace + 1 * 4); ++ 
row2 = _mm_loadu_ps(workspace + 2 * 4); ++ row3 = _mm_loadu_ps(workspace + 3 * 4); ++ ++ recip0 = _mm_loadu_ps(divisors + 0 * 4); ++ recip1 = _mm_loadu_ps(divisors + 1 * 4); ++ recip2 = _mm_loadu_ps(divisors + 2 * 4); ++ recip3 = _mm_loadu_ps(divisors + 3 * 4); ++ ++ row0 = _mm_mul_ps(row0, recip0); ++ row1 = _mm_mul_ps(row1, recip1); ++ row2 = _mm_mul_ps(row2, recip2); ++ row3 = _mm_mul_ps(row3, recip3); ++ ++#ifdef JSIMD_SAME_ROUNDING ++ row0 = _mm_add_ps(row0, pd_f16k5); ++ row1 = _mm_add_ps(row1, pd_f16k5); ++ row2 = _mm_add_ps(row2, pd_f16k5); ++ row3 = _mm_add_ps(row3, pd_f16k5); ++ ++ out0 = _mm_packs_epi32(_mm_cvttps_epi32(row0), _mm_cvttps_epi32(row1)); ++ out1 = _mm_packs_epi32(_mm_cvttps_epi32(row2), _mm_cvttps_epi32(row3)); ++ ++ out0 = _mm_add_epi16(out0, pw_m16k); ++ out1 = _mm_add_epi16(out1, pw_m16k); ++#else ++ out0 = _mm_packs_epi32(_mm_cvtps_epi32(row0), _mm_cvtps_epi32(row1)); ++ out1 = _mm_packs_epi32(_mm_cvtps_epi32(row2), _mm_cvtps_epi32(row3)); ++#endif ++ VEC_ST(coef_block, out0); ++ VEC_ST(coef_block + 8, out1); ++ } ++} +diff --git a/simd/e2k/jquanti-e2k.c b/simd/e2k/jquanti-e2k.c +new file mode 100644 +index 0000000..a3e1ff1 +--- /dev/null ++++ b/simd/e2k/jquanti-e2k.c +@@ -0,0 +1,178 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved. ++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. ++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. 
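
Under JSIMD_SAME_ROUNDING above, the float quantizer biases each product by 16384.5 before the truncating _mm_cvttps_epi32 and subtracts 16384 afterwards (pw_m16k), so truncation of the biased value behaves as round-half-up while the intermediate stays positive. A scalar model of that rounding, for illustration:

    #include <stdio.h>

    static short quant_round(float v)
    {
      return (short)((int)(v + 16384.5f) - 16384);
    }

    int main(void)
    {
      printf("%d %d %d\n", quant_round(2.3f),      /*  2 */
             quant_round(2.5f),                    /*  3 */
             quant_round(-2.3f));                  /* -2 */
      return 0;
    }
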
++ */ ++ ++/* INTEGER QUANTIZATION AND SAMPLE CONVERSION */ ++ ++#include "jsimd_e2k.h" ++ ++#define LOAD_ROW(row) in##row = VEC_LD8(sample_data[row] + start_col) ++ ++void jsimd_convsamp_e2k(JSAMPARRAY sample_data, JDIMENSION start_col, ++ DCTELEM *workspace) ++{ ++ __m128i in0, in1, in2, in3, in4, in5, in6, in7; ++ __m128i out0, out1, out2, out3, out4, out5, out6, out7; ++ ++ /* Constants */ ++ __m128i pw_cj = _mm_set1_epi16(CENTERJSAMPLE), ++ pb_zero = _mm_setzero_si128(); ++ ++ LOAD_ROW(0); ++ LOAD_ROW(1); ++ LOAD_ROW(2); ++ LOAD_ROW(3); ++ LOAD_ROW(4); ++ LOAD_ROW(5); ++ LOAD_ROW(6); ++ LOAD_ROW(7); ++ ++ out0 = _mm_unpacklo_epi8(in0, pb_zero); ++ out1 = _mm_unpacklo_epi8(in1, pb_zero); ++ out2 = _mm_unpacklo_epi8(in2, pb_zero); ++ out3 = _mm_unpacklo_epi8(in3, pb_zero); ++ out4 = _mm_unpacklo_epi8(in4, pb_zero); ++ out5 = _mm_unpacklo_epi8(in5, pb_zero); ++ out6 = _mm_unpacklo_epi8(in6, pb_zero); ++ out7 = _mm_unpacklo_epi8(in7, pb_zero); ++ ++ out0 = _mm_sub_epi16(out0, pw_cj); ++ out1 = _mm_sub_epi16(out1, pw_cj); ++ out2 = _mm_sub_epi16(out2, pw_cj); ++ out3 = _mm_sub_epi16(out3, pw_cj); ++ out4 = _mm_sub_epi16(out4, pw_cj); ++ out5 = _mm_sub_epi16(out5, pw_cj); ++ out6 = _mm_sub_epi16(out6, pw_cj); ++ out7 = _mm_sub_epi16(out7, pw_cj); ++ ++ VEC_ST(workspace + 0 * 8, out0); ++ VEC_ST(workspace + 1 * 8, out1); ++ VEC_ST(workspace + 2 * 8, out2); ++ VEC_ST(workspace + 3 * 8, out3); ++ VEC_ST(workspace + 4 * 8, out4); ++ VEC_ST(workspace + 5 * 8, out5); ++ VEC_ST(workspace + 6 * 8, out6); ++ VEC_ST(workspace + 7 * 8, out7); ++} ++ ++ ++#define WORD_BIT 16 ++#define MULTIPLY(vs0, vs1, out) out = _mm_mulhi_epu16(vs0, vs1) ++ ++void jsimd_quantize_e2k(JCOEFPTR coef_block, DCTELEM *divisors, ++ DCTELEM *workspace) ++{ ++ __m128i row0, row1, row2, row3, row4, row5, row6, row7, ++ row0s, row1s, row2s, row3s, row4s, row5s, row6s, row7s, ++ corr0, corr1, corr2, corr3, corr4, corr5, corr6, corr7, ++ recip0, recip1, recip2, recip3, recip4, recip5, recip6, recip7, ++ scale0, scale1, scale2, scale3, scale4, scale5, scale6, scale7; ++ ++ row0s = VEC_LD(workspace + 0 * 8); ++ row1s = VEC_LD(workspace + 1 * 8); ++ row2s = VEC_LD(workspace + 2 * 8); ++ row3s = VEC_LD(workspace + 3 * 8); ++ row4s = VEC_LD(workspace + 4 * 8); ++ row5s = VEC_LD(workspace + 5 * 8); ++ row6s = VEC_LD(workspace + 6 * 8); ++ row7s = VEC_LD(workspace + 7 * 8); ++ row0 = _mm_abs_epi16(row0s); ++ row1 = _mm_abs_epi16(row1s); ++ row2 = _mm_abs_epi16(row2s); ++ row3 = _mm_abs_epi16(row3s); ++ row4 = _mm_abs_epi16(row4s); ++ row5 = _mm_abs_epi16(row5s); ++ row6 = _mm_abs_epi16(row6s); ++ row7 = _mm_abs_epi16(row7s); ++ ++ corr0 = VEC_LD(divisors + DCTSIZE2 + 0 * 8); ++ corr1 = VEC_LD(divisors + DCTSIZE2 + 1 * 8); ++ corr2 = VEC_LD(divisors + DCTSIZE2 + 2 * 8); ++ corr3 = VEC_LD(divisors + DCTSIZE2 + 3 * 8); ++ corr4 = VEC_LD(divisors + DCTSIZE2 + 4 * 8); ++ corr5 = VEC_LD(divisors + DCTSIZE2 + 5 * 8); ++ corr6 = VEC_LD(divisors + DCTSIZE2 + 6 * 8); ++ corr7 = VEC_LD(divisors + DCTSIZE2 + 7 * 8); ++ ++ row0 = _mm_add_epi16(row0, corr0); ++ row1 = _mm_add_epi16(row1, corr1); ++ row2 = _mm_add_epi16(row2, corr2); ++ row3 = _mm_add_epi16(row3, corr3); ++ row4 = _mm_add_epi16(row4, corr4); ++ row5 = _mm_add_epi16(row5, corr5); ++ row6 = _mm_add_epi16(row6, corr6); ++ row7 = _mm_add_epi16(row7, corr7); ++ ++ recip0 = VEC_LD(divisors + 0 * 8); ++ recip1 = VEC_LD(divisors + 1 * 8); ++ recip2 = VEC_LD(divisors + 2 * 8); ++ recip3 = VEC_LD(divisors + 3 * 8); ++ recip4 = VEC_LD(divisors + 4 * 8); ++ recip5 = VEC_LD(divisors + 5 * 8); ++ 
recip6 = VEC_LD(divisors + 6 * 8);
++  recip7 = VEC_LD(divisors + 7 * 8);
++
++  MULTIPLY(row0, recip0, row0);
++  MULTIPLY(row1, recip1, row1);
++  MULTIPLY(row2, recip2, row2);
++  MULTIPLY(row3, recip3, row3);
++  MULTIPLY(row4, recip4, row4);
++  MULTIPLY(row5, recip5, row5);
++  MULTIPLY(row6, recip6, row6);
++  MULTIPLY(row7, recip7, row7);
++
++  scale0 = VEC_LD(divisors + DCTSIZE2 * 2 + 0 * 8);
++  scale1 = VEC_LD(divisors + DCTSIZE2 * 2 + 1 * 8);
++  scale2 = VEC_LD(divisors + DCTSIZE2 * 2 + 2 * 8);
++  scale3 = VEC_LD(divisors + DCTSIZE2 * 2 + 3 * 8);
++  scale4 = VEC_LD(divisors + DCTSIZE2 * 2 + 4 * 8);
++  scale5 = VEC_LD(divisors + DCTSIZE2 * 2 + 5 * 8);
++  scale6 = VEC_LD(divisors + DCTSIZE2 * 2 + 6 * 8);
++  scale7 = VEC_LD(divisors + DCTSIZE2 * 2 + 7 * 8);
++
++  MULTIPLY(row0, scale0, row0);
++  MULTIPLY(row1, scale1, row1);
++  MULTIPLY(row2, scale2, row2);
++  MULTIPLY(row3, scale3, row3);
++  MULTIPLY(row4, scale4, row4);
++  MULTIPLY(row5, scale5, row5);
++  MULTIPLY(row6, scale6, row6);
++  MULTIPLY(row7, scale7, row7);
++
++  row0 = _mm_sign_epi16(row0, row0s);
++  row1 = _mm_sign_epi16(row1, row1s);
++  row2 = _mm_sign_epi16(row2, row2s);
++  row3 = _mm_sign_epi16(row3, row3s);
++  row4 = _mm_sign_epi16(row4, row4s);
++  row5 = _mm_sign_epi16(row5, row5s);
++  row6 = _mm_sign_epi16(row6, row6s);
++  row7 = _mm_sign_epi16(row7, row7s);
++
++  VEC_ST(coef_block + 0 * 8, row0);
++  VEC_ST(coef_block + 1 * 8, row1);
++  VEC_ST(coef_block + 2 * 8, row2);
++  VEC_ST(coef_block + 3 * 8, row3);
++  VEC_ST(coef_block + 4 * 8, row4);
++  VEC_ST(coef_block + 5 * 8, row5);
++  VEC_ST(coef_block + 6 * 8, row6);
++  VEC_ST(coef_block + 7 * 8, row7);
++}
+diff --git a/simd/e2k/jsimd.c b/simd/e2k/jsimd.c
+new file mode 100644
+index 0000000..f8c0465
+--- /dev/null
++++ b/simd/e2k/jsimd.c
+@@ -0,0 +1,761 @@
++/*
++ * jsimd_e2k.c
++ *
++ * Copyright 2009 Pierre Ossman for Cendio AB
++ * Copyright (C) 2009-2011, 2014-2016, 2018, D. R. Commander.
++ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd
++ *
++ * Based on the x86 SIMD extension for IJG JPEG library,
++ * Copyright (C) 1999-2006, MIYASAKA Masaru.
++ * For conditions of distribution and use, see copyright notice in jsimdext.inc
++ *
++ * This file contains the interface between the "normal" portions
++ * of the library and the SIMD implementations when running on the
++ * e2k (Elbrus) architecture.
++ */
++
++#define JPEG_INTERNALS
++#include "../../jinclude.h"
++#include "../../jpeglib.h"
++#include "../../jsimd.h"
++#include "../../jdct.h"
++#include "../../jsimddct.h"
++#include "../jsimd.h"
++#include "jsimd_api_e2k.h"
++
++static unsigned int simd_support = ~0;
++static unsigned int simd_huffman = 1;
++
++/*
++ * Check what SIMD accelerations are supported.
++ *
++ * FIXME: This code is racy under a multi-threaded environment.
++ */ ++LOCAL(void) ++init_simd(void) ++{ ++#ifndef NO_GETENV ++ char *env = NULL; ++#endif ++ ++ if (simd_support != ~0U) ++ return; ++ ++ simd_support = JSIMD_SSE2; ++ ++#ifndef NO_GETENV ++ /* Force different settings through environment variables */ ++ env = getenv("JSIMD_FORCENONE"); ++ if ((env != NULL) && (strcmp(env, "1") == 0)) ++ simd_support = 0; ++ env = getenv("JSIMD_NOHUFFENC"); ++ if ((env != NULL) && (strcmp(env, "1") == 0)) ++ simd_huffman = 0; ++#endif ++} ++ ++static inline int color_space_idx(J_COLOR_SPACE color_space) { ++ switch (color_space) { ++ case JCS_EXT_RGB: ++ return 1 + (EXT_RGB_PIXELSIZE != 3) * 16; ++ case JCS_EXT_RGBX: ++ case JCS_EXT_RGBA: ++ return 2 + (EXT_RGBX_PIXELSIZE != 3) * 16; ++ case JCS_EXT_BGR: ++ return 3 + (EXT_BGR_PIXELSIZE != 3) * 16; ++ case JCS_EXT_BGRX: ++ case JCS_EXT_BGRA: ++ return 4 + (EXT_BGRX_PIXELSIZE != 3) * 16; ++ case JCS_EXT_XBGR: ++ case JCS_EXT_ABGR: ++ return 5 + (EXT_XBGR_PIXELSIZE != 3) * 16; ++ case JCS_EXT_XRGB: ++ case JCS_EXT_ARGB: ++ return 6 + (EXT_XRGB_PIXELSIZE != 3) * 16; ++ default: ++ break; ++ } ++ return 0 + (RGB_PIXELSIZE != 3) * 16; ++} ++ ++GLOBAL(int) ++jsimd_can_rgb_ycc(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (BITS_IN_JSAMPLE != 8) ++ return 0; ++ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(int) ++jsimd_can_rgb_gray(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (BITS_IN_JSAMPLE != 8) ++ return 0; ++ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(int) ++jsimd_can_ycc_rgb(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (BITS_IN_JSAMPLE != 8) ++ return 0; ++ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(int) ++jsimd_can_ycc_rgb565(void) ++{ ++ return 0; ++} ++ ++GLOBAL(void) ++jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, ++ JSAMPIMAGE output_buf, JDIMENSION output_row, ++ int num_rows) ++{ ++ void (*e2kfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int, int); ++ int idx = color_space_idx(cinfo->in_color_space); ++ ++ e2kfct = idx < 16 ? jsimd_rgb3_ycc_convert_e2k : ++ jsimd_rgb4_ycc_convert_e2k; ++ idx &= 15; ++ ++ e2kfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows, idx); ++} ++ ++GLOBAL(void) ++jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, ++ JSAMPIMAGE output_buf, JDIMENSION output_row, ++ int num_rows) ++{ ++ void (*e2kfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int, int); ++ int idx = color_space_idx(cinfo->in_color_space); ++ ++ e2kfct = idx < 16 ? jsimd_rgb3_gray_convert_e2k : ++ jsimd_rgb4_gray_convert_e2k; ++ idx &= 15; ++ ++ e2kfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows, idx); ++} ++ ++GLOBAL(void) ++jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, ++ JDIMENSION input_row, JSAMPARRAY output_buf, ++ int num_rows) ++{ ++ void (*e2kfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int, int); ++ int idx = color_space_idx(cinfo->out_color_space); ++ ++ e2kfct = idx < 16 ? 
jsimd_ycc_rgb3_convert_e2k : ++ jsimd_ycc_rgb4_convert_e2k; ++ idx &= 15; ++ ++ e2kfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows, idx); ++} ++ ++GLOBAL(void) ++jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, ++ JDIMENSION input_row, JSAMPARRAY output_buf, ++ int num_rows) ++{ ++} ++ ++GLOBAL(int) ++jsimd_can_h2v2_downsample(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (BITS_IN_JSAMPLE != 8) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(int) ++jsimd_can_h2v1_downsample(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (BITS_IN_JSAMPLE != 8) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(void) ++jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr, ++ JSAMPARRAY input_data, JSAMPARRAY output_data) ++{ ++ jsimd_h2v2_downsample_e2k(cinfo->image_width, cinfo->max_v_samp_factor, ++ compptr->v_samp_factor, ++ compptr->width_in_blocks, input_data, ++ output_data); ++} ++ ++GLOBAL(void) ++jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr, ++ JSAMPARRAY input_data, JSAMPARRAY output_data) ++{ ++ jsimd_h2v1_downsample_e2k(cinfo->image_width, cinfo->max_v_samp_factor, ++ compptr->v_samp_factor, ++ compptr->width_in_blocks, input_data, ++ output_data); ++} ++ ++GLOBAL(int) ++jsimd_can_h2v2_upsample(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (BITS_IN_JSAMPLE != 8) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(int) ++jsimd_can_h2v1_upsample(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (BITS_IN_JSAMPLE != 8) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(void) ++jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, ++ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) ++{ ++ jsimd_h2v2_upsample_e2k(cinfo->max_v_samp_factor, cinfo->output_width, ++ input_data, output_data_ptr); ++} ++ ++GLOBAL(void) ++jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, ++ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) ++{ ++ jsimd_h2v1_upsample_e2k(cinfo->max_v_samp_factor, cinfo->output_width, ++ input_data, output_data_ptr); ++} ++ ++GLOBAL(int) ++jsimd_can_h2v2_fancy_upsample(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (BITS_IN_JSAMPLE != 8) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(int) ++jsimd_can_h2v1_fancy_upsample(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (BITS_IN_JSAMPLE != 8) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(void) ++jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, ++ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) ++{ ++ jsimd_h2v2_fancy_upsample_e2k(cinfo->max_v_samp_factor, ++ compptr->downsampled_width, input_data, ++ output_data_ptr); ++} ++ ++GLOBAL(void) ++jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, ++ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) ++{ ++ jsimd_h2v1_fancy_upsample_e2k(cinfo->max_v_samp_factor, ++ compptr->downsampled_width, input_data, ++ output_data_ptr); ++} ++ ++GLOBAL(int) ++jsimd_can_h2v2_merged_upsample(void) ++{ ++ 
init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (BITS_IN_JSAMPLE != 8) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(int) ++jsimd_can_h2v1_merged_upsample(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (BITS_IN_JSAMPLE != 8) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(void) ++jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, ++ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf) ++{ ++ void (*e2kfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JDIMENSION, JSAMPARRAY, int); ++ int idx = color_space_idx(cinfo->out_color_space); ++ ++ e2kfct = idx < 16 ? jsimd_ycc_rgb3_merged_upsample_e2k : ++ jsimd_ycc_rgb4_merged_upsample_e2k; ++ idx &= 15; ++ ++ e2kfct(cinfo->output_width, input_buf, in_row_group_ctr, ++ in_row_group_ctr * 2, output_buf, idx); ++ e2kfct(cinfo->output_width, input_buf, in_row_group_ctr, ++ in_row_group_ctr * 2 + 1, output_buf + 1, idx); ++} ++ ++GLOBAL(void) ++jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, ++ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf) ++{ ++ void (*e2kfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JDIMENSION, JSAMPARRAY, int); ++ int idx = color_space_idx(cinfo->out_color_space); ++ ++ e2kfct = idx < 16 ? jsimd_ycc_rgb3_merged_upsample_e2k : ++ jsimd_ycc_rgb4_merged_upsample_e2k; ++ idx &= 15; ++ ++ e2kfct(cinfo->output_width, input_buf, in_row_group_ctr, ++ in_row_group_ctr, output_buf, idx); ++} ++ ++GLOBAL(int) ++jsimd_can_convsamp(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (DCTSIZE != 8) ++ return 0; ++ if (BITS_IN_JSAMPLE != 8) ++ return 0; ++ if (sizeof(DCTELEM) != 2) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(int) ++jsimd_can_convsamp_float(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (DCTSIZE != 8) ++ return 0; ++ if (BITS_IN_JSAMPLE != 8) ++ return 0; ++ if (sizeof(FAST_FLOAT) != 4) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(void) ++jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col, ++ DCTELEM *workspace) ++{ ++ jsimd_convsamp_e2k(sample_data, start_col, workspace); ++} ++ ++GLOBAL(void) ++jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col, ++ FAST_FLOAT *workspace) ++{ ++ jsimd_convsamp_float_e2k(sample_data, start_col, workspace); ++} ++ ++GLOBAL(int) ++jsimd_can_fdct_islow(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (DCTSIZE != 8) ++ return 0; ++ if (sizeof(DCTELEM) != 2) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(int) ++jsimd_can_fdct_ifast(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (DCTSIZE != 8) ++ return 0; ++ if (sizeof(DCTELEM) != 2) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(int) ++jsimd_can_fdct_float(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (DCTSIZE != 8) ++ return 0; ++ if (sizeof(FAST_FLOAT) != 4) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(void) ++jsimd_fdct_islow(DCTELEM *data) ++{ ++ jsimd_fdct_islow_e2k(data); ++} ++ ++GLOBAL(void) ++jsimd_fdct_ifast(DCTELEM *data) ++{ ++ jsimd_fdct_ifast_e2k(data); ++} ++ ++GLOBAL(void) 
++jsimd_fdct_float(FAST_FLOAT *data) ++{ ++ jsimd_fdct_float_e2k(data); ++} ++ ++GLOBAL(int) ++jsimd_can_quantize(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (DCTSIZE != 8) ++ return 0; ++ if (sizeof(JCOEF) != 2) ++ return 0; ++ if (sizeof(DCTELEM) != 2) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(int) ++jsimd_can_quantize_float(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (DCTSIZE != 8) ++ return 0; ++ if (sizeof(JCOEF) != 2) ++ return 0; ++ if (sizeof(FAST_FLOAT) != 4) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(void) ++jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace) ++{ ++ jsimd_quantize_e2k(coef_block, divisors, workspace); ++} ++ ++GLOBAL(void) ++jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors, ++ FAST_FLOAT *workspace) ++{ ++ jsimd_quantize_float_e2k(coef_block, divisors, workspace); ++} ++ ++GLOBAL(int) ++jsimd_can_idct_2x2(void) ++{ ++ return 0; ++} ++ ++GLOBAL(int) ++jsimd_can_idct_4x4(void) ++{ ++ return 0; ++} ++ ++GLOBAL(void) ++jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr, ++ JCOEFPTR coef_block, JSAMPARRAY output_buf, ++ JDIMENSION output_col) ++{ ++} ++ ++GLOBAL(void) ++jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr, ++ JCOEFPTR coef_block, JSAMPARRAY output_buf, ++ JDIMENSION output_col) ++{ ++} ++ ++GLOBAL(int) ++jsimd_can_idct_islow(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (DCTSIZE != 8) ++ return 0; ++ if (sizeof(JCOEF) != 2) ++ return 0; ++ if (BITS_IN_JSAMPLE != 8) ++ return 0; ++ if (sizeof(ISLOW_MULT_TYPE) != 2) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(int) ++jsimd_can_idct_ifast(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (DCTSIZE != 8) ++ return 0; ++ if (sizeof(JCOEF) != 2) ++ return 0; ++ if (BITS_IN_JSAMPLE != 8) ++ return 0; ++ if (sizeof(IFAST_MULT_TYPE) != 2) ++ return 0; ++ if (IFAST_SCALE_BITS != 2) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(int) ++jsimd_can_idct_float(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (DCTSIZE != 8) ++ return 0; ++ if (sizeof(JCOEF) != 2) ++ return 0; ++ if (BITS_IN_JSAMPLE != 8) ++ return 0; ++ if (sizeof(FAST_FLOAT) != 4) ++ return 0; ++ if (sizeof(FLOAT_MULT_TYPE) != 4) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(void) ++jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr, ++ JCOEFPTR coef_block, JSAMPARRAY output_buf, ++ JDIMENSION output_col) ++{ ++ jsimd_idct_islow_e2k(compptr->dct_table, coef_block, output_buf, ++ output_col); ++} ++ ++GLOBAL(void) ++jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr, ++ JCOEFPTR coef_block, JSAMPARRAY output_buf, ++ JDIMENSION output_col) ++{ ++ jsimd_idct_ifast_e2k(compptr->dct_table, coef_block, output_buf, ++ output_col); ++} ++ ++GLOBAL(void) ++jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr, ++ JCOEFPTR coef_block, JSAMPARRAY output_buf, ++ JDIMENSION output_col) ++{ ++ jsimd_idct_float_e2k(compptr->dct_table, coef_block, output_buf, ++ output_col); ++} ++ ++GLOBAL(int) ++jsimd_can_huff_encode_one_block(void) ++{ ++ init_simd(); ++ ++ if (DCTSIZE != 8) ++ return 0; ++ 
if (sizeof(JCOEF) != 2) ++ return 0; ++ ++ if ((simd_support & JSIMD_SSE2) && simd_huffman) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(JOCTET *) ++jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block, ++ int last_dc_val, c_derived_tbl *dctbl, ++ c_derived_tbl *actbl) ++{ ++ return jsimd_huff_encode_one_block_e2k(state, buffer, block, last_dc_val, ++ dctbl, actbl); ++} ++ ++GLOBAL(int) ++jsimd_can_encode_mcu_AC_first_prepare(void) ++{ ++ init_simd(); ++ ++ if (DCTSIZE != 8) ++ return 0; ++ if (sizeof(JCOEF) != 2) ++ return 0; ++ if ((simd_support & JSIMD_SSE2) && simd_huffman) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(void) ++jsimd_encode_mcu_AC_first_prepare(const JCOEF *block, ++ const int *jpeg_natural_order_start, int Sl, ++ int Al, UJCOEF *values, size_t *zerobits) ++{ ++ jsimd_encode_mcu_AC_first_prepare_e2k(block, jpeg_natural_order_start, ++ Sl, Al, (JCOEF*)values, zerobits); ++} ++ ++GLOBAL(int) ++jsimd_can_encode_mcu_AC_refine_prepare(void) ++{ ++ init_simd(); ++ ++ if (DCTSIZE != 8) ++ return 0; ++ if (sizeof(JCOEF) != 2) ++ return 0; ++ if ((simd_support & JSIMD_SSE2) && simd_huffman) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(int) ++jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block, ++ const int *jpeg_natural_order_start, int Sl, ++ int Al, UJCOEF *absvalues, size_t *bits) ++{ ++ return jsimd_encode_mcu_AC_refine_prepare_e2k(block, ++ jpeg_natural_order_start, ++ Sl, Al, (JCOEF*)absvalues, bits); ++} +diff --git a/simd/e2k/jsimd_api_e2k.h b/simd/e2k/jsimd_api_e2k.h +new file mode 100644 +index 0000000..d857203 +--- /dev/null ++++ b/simd/e2k/jsimd_api_e2k.h +@@ -0,0 +1,94 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. ++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. 
++ */ ++ ++/* Function declarations */ ++ ++#define CONVERT_DECL(n) \ ++EXTERN(void) jsimd_rgb##n##_ycc_convert_e2k(JDIMENSION img_width, \ ++ JSAMPARRAY input_buf, JSAMPIMAGE output_buf, \ ++ JDIMENSION output_row, int num_rows, int shuf_idx); \ ++EXTERN(void) jsimd_rgb##n##_gray_convert_e2k(JDIMENSION img_width, \ ++ JSAMPARRAY input_buf, JSAMPIMAGE output_buf, \ ++ JDIMENSION output_row, int num_rows, int shuf_idx); \ ++EXTERN(void) jsimd_ycc_rgb##n##_convert_e2k(JDIMENSION out_width, \ ++ JSAMPIMAGE input_buf, JDIMENSION input_row, \ ++ JSAMPARRAY output_buf, int num_rows, int shuf_idx); \ ++EXTERN(void) jsimd_ycc_rgb##n##_convert_e2k(JDIMENSION out_width, \ ++ JSAMPIMAGE input_buf, JDIMENSION input_row, JSAMPARRAY output_buf, \ ++ int num_rows, int shuf_idx); \ ++EXTERN(void) jsimd_ycc_rgb##n##_merged_upsample_e2k(JDIMENSION out_width, \ ++ JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, \ ++ JDIMENSION in_row_group_ctr_y, JSAMPARRAY output_buf, int shuf_idx); \ ++ ++CONVERT_DECL(3) ++CONVERT_DECL(4) ++ ++EXTERN(void) jsimd_h2v1_downsample_e2k ++ (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor, ++ JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data); ++EXTERN(void) jsimd_h2v2_downsample_e2k ++ (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor, ++ JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data); ++ ++#define UPSAMPLE_DECL(name) \ ++EXTERN(void) jsimd_##name##_upsample_e2k \ ++ (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data, \ ++ JSAMPARRAY *output_data_ptr); ++ ++UPSAMPLE_DECL(h2v1) ++UPSAMPLE_DECL(h2v2) ++UPSAMPLE_DECL(h2v1_fancy) ++UPSAMPLE_DECL(h2v2_fancy) ++ ++EXTERN(void) jsimd_convsamp_e2k ++ (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace); ++EXTERN(void) jsimd_convsamp_float_e2k ++ (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace); ++ ++EXTERN(void) jsimd_fdct_islow_e2k(DCTELEM *data); ++EXTERN(void) jsimd_fdct_ifast_e2k(DCTELEM *data); ++EXTERN(void) jsimd_fdct_float_e2k(FAST_FLOAT *data); ++EXTERN(void) jsimd_quantize_e2k ++ (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace); ++EXTERN(void) jsimd_quantize_float_e2k ++ (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace); ++EXTERN(void) jsimd_idct_islow_e2k ++ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, ++ JDIMENSION output_col); ++EXTERN(void) jsimd_idct_ifast_e2k ++ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, ++ JDIMENSION output_col); ++EXTERN(void) jsimd_idct_float_e2k ++ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, ++ JDIMENSION output_col); ++ ++EXTERN(JOCTET *) jsimd_huff_encode_one_block_e2k ++ (void *state, JOCTET *buffer, JCOEFPTR block, ++ int last_dc_val, c_derived_tbl *dctbl, c_derived_tbl *actbl); ++ ++EXTERN(void) jsimd_encode_mcu_AC_first_prepare_e2k ++ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al, ++ JCOEF *values, size_t *zerobits); ++ ++EXTERN(int) jsimd_encode_mcu_AC_refine_prepare_e2k ++ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al, ++ JCOEF *absvalues, size_t *bits); +diff --git a/simd/e2k/jsimd_e2k.h b/simd/e2k/jsimd_e2k.h +new file mode 100644 +index 0000000..15d6262 +--- /dev/null ++++ b/simd/e2k/jsimd_e2k.h +@@ -0,0 +1,207 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved. 
++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd
++ *
++ * This software is provided 'as-is', without any express or implied
++ * warranty. In no event will the authors be held liable for any damages
++ * arising from the use of this software.
++ *
++ * Permission is granted to anyone to use this software for any purpose,
++ * including commercial applications, and to alter it and redistribute it
++ * freely, subject to the following restrictions:
++ *
++ * 1. The origin of this software must not be misrepresented; you must not
++ *    claim that you wrote the original software. If you use this software
++ *    in a product, an acknowledgment in the product documentation would be
++ *    appreciated but is not required.
++ * 2. Altered source versions must be plainly marked as such, and must not be
++ *    misrepresented as being the original software.
++ * 3. This notice may not be removed or altered from any source distribution.
++ */
++
++#define JPEG_INTERNALS
++#include "../../jinclude.h"
++#include "../../jpeglib.h"
++#include "../../jsimd.h"
++#include "../../jdct.h"
++#include "../../jsimddct.h"
++#include "../jsimd.h"
++#include "jsimd_api_e2k.h"
++#include <stdint.h>
++#include <smmintrin.h> /* SSE4.1 */
++
++
++/* Common code */
++
++#define __4X2(a, b) a, b, a, b, a, b, a, b
++#define ALWAYS_INLINE __attribute__((__always_inline__)) inline
++
++#ifdef __e2k__
++#define PRAGMA_E2K _Pragma
++#define _mm_shuffle2_pi8(a, b, c) \
++  ((__m64)__builtin_e2k_pshufb((uint64_t)(b), (uint64_t)(a), (uint64_t)(c)))
++#define _mm_shuffle2_epi8(a, b, c) \
++  ((__m128i)__builtin_e2k_qppermb((__v2di)(b), (__v2di)(a), (__v2di)(c)))
++#define _mm_blendv_pi8(a, b, c) \
++  ((__m64)__builtin_e2k_pmerge((uint64_t)(a), (uint64_t)(b), (uint64_t)(c)))
++#else
++#define PRAGMA_E2K(x)
++#define _mm_shuffle2_pi8(a, b, c) \
++  _mm_movepi64_pi64(_mm_shuffle_epi8(_mm_unpacklo_epi64( \
++    _mm_movpi64_epi64(a), _mm_movpi64_epi64(b)), _mm_movpi64_epi64(c)))
++#define _mm_shuffle2_epi8(a, b, c) \
++  _mm_blendv_epi8(_mm_shuffle_epi8(a, c), _mm_shuffle_epi8(b, c), \
++    _mm_slli_epi16(c, 3))
++#define _mm_blendv_pi8(a, b, c) \
++  _mm_movepi64_pi64(_mm_blendv_epi8(_mm_movpi64_epi64(a), \
++    _mm_movpi64_epi64(b), _mm_movpi64_epi64(c)))
++
++#define BITREV_ROUND(c, i) a = (a & c) << i | (a >> i & c);
++static ALWAYS_INLINE uint64_t __builtin_e2k_bitrevd(uint64_t a) {
++  BITREV_ROUND(0x5555555555555555ll, 1)
++  BITREV_ROUND(0x3333333333333333ll, 2)
++  BITREV_ROUND(0x0F0F0F0F0F0F0F0Fll, 4)
++  BITREV_ROUND(0x00FF00FF00FF00FFll, 8)
++  BITREV_ROUND(0x0000FFFF0000FFFFll, 16)
++  return a >> 32 | a << 32;
++}
++
++static ALWAYS_INLINE uint64_t __builtin_e2k_insfd(uint64_t a, uint64_t b, uint64_t c) {
++  int n = b & 63;
++  a = a >> n | a << (64 - n);
++  return c ^ ((a ^ c) & (~0ll << (b >> 6 & 63)));
++}
++#endif
++
++#if defined(__iset__) && __iset__ >= 5
++static ALWAYS_INLINE __m128i _mm_packhi_epi32(__m128i a, __m128i b) {
++  __m128i index = _mm_setr_epi8(
++    2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31);
++  return _mm_shuffle2_epi8(a, b, index);
++}
++
++#define VEC_ISZERO(a) !_mm_cvtsi128_si64(_mm_packs_epi16(a, a))
++#else
++static ALWAYS_INLINE __m128i _mm_packhi_epi32(__m128i a, __m128i b) {
++  union { __m128i v; __m64 d[2]; } l = { a }, h = { b }, x;
++  __m64 index = _mm_setr_pi8(2, 3, 6, 7, 10, 11, 14, 15);
++  x.d[0] = _mm_shuffle2_pi8(l.d[0], l.d[1], index);
++  x.d[1] = _mm_shuffle2_pi8(h.d[0], h.d[1], index);
++  return x.v;
++}
++
++static ALWAYS_INLINE uint64_t vec_isnonzero(__m128i a) {
++  __v2di x = (__v2di)a;
++  return x[0] |
x[1]; ++} ++ ++#define VEC_ISZERO(a) !vec_isnonzero(a) ++#endif ++ ++#define VEC_ALIGNR8(a, b) _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(b), _mm_castsi128_pd(a), 1)) ++ ++#define TRANSPOSE_FLOAT(a, b, c, d, e, f, g, h) \ ++ tmp0 = _mm_unpacklo_ps(a, b); \ ++ tmp1 = _mm_unpackhi_ps(a, b); \ ++ tmp2 = _mm_unpacklo_ps(c, d); \ ++ tmp3 = _mm_unpackhi_ps(c, d); \ ++ e = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); \ ++ f = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); \ ++ g = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3))); \ ++ h = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3))); ++ ++#define TRANSPOSE8(a, b) \ ++ b##0 = _mm_unpacklo_epi8(a##0, a##2); \ ++ b##1 = _mm_unpackhi_epi8(a##0, a##2); \ ++ b##2 = _mm_unpacklo_epi8(a##1, a##3); \ ++ b##3 = _mm_unpackhi_epi8(a##1, a##3); ++ ++#define TRANSPOSE16(a, b) \ ++ b##0 = _mm_unpacklo_epi16(a##0, a##2); \ ++ b##1 = _mm_unpackhi_epi16(a##0, a##2); \ ++ b##2 = _mm_unpacklo_epi16(a##1, a##3); \ ++ b##3 = _mm_unpackhi_epi16(a##1, a##3); \ ++ b##4 = _mm_unpacklo_epi16(a##4, a##6); \ ++ b##5 = _mm_unpackhi_epi16(a##4, a##6); \ ++ b##6 = _mm_unpacklo_epi16(a##5, a##7); \ ++ b##7 = _mm_unpackhi_epi16(a##5, a##7); ++ ++#define TRANSPOSE(a, b) \ ++ TRANSPOSE16(a, b) TRANSPOSE16(b, a) \ ++ b##0 = _mm_unpacklo_epi64(a##0, a##4); \ ++ b##1 = _mm_unpackhi_epi64(a##0, a##4); \ ++ b##2 = _mm_unpacklo_epi64(a##1, a##5); \ ++ b##3 = _mm_unpackhi_epi64(a##1, a##5); \ ++ b##4 = _mm_unpacklo_epi64(a##2, a##6); \ ++ b##5 = _mm_unpackhi_epi64(a##2, a##6); \ ++ b##6 = _mm_unpacklo_epi64(a##3, a##7); \ ++ b##7 = _mm_unpackhi_epi64(a##3, a##7); ++ ++#define IDCT_SAVE() { \ ++ __m128i pb_cj = _mm_set1_epi8((int8_t)CENTERJSAMPLE); \ ++ \ ++ row0 = _mm_xor_si128(_mm_packs_epi16(out0, out1), pb_cj); \ ++ row1 = _mm_xor_si128(_mm_packs_epi16(out2, out3), pb_cj); \ ++ row2 = _mm_xor_si128(_mm_packs_epi16(out4, out5), pb_cj); \ ++ row3 = _mm_xor_si128(_mm_packs_epi16(out6, out7), pb_cj); \ ++ \ ++ TRANSPOSE8(row, col) TRANSPOSE8(col, row) TRANSPOSE8(row, col) \ ++ \ ++ VEC_STL(output_buf[0] + output_col, col0); \ ++ VEC_STH(output_buf[1] + output_col, col0); \ ++ VEC_STL(output_buf[2] + output_col, col1); \ ++ VEC_STH(output_buf[3] + output_col, col1); \ ++ VEC_STL(output_buf[4] + output_col, col2); \ ++ VEC_STH(output_buf[5] + output_col, col2); \ ++ VEC_STL(output_buf[6] + output_col, col3); \ ++ VEC_STH(output_buf[7] + output_col, col3); \ ++} ++ ++#define IDCT_SPLAT8(col0) { \ ++ row3 = _mm_unpacklo_epi16(col0, col0); \ ++ row7 = _mm_unpackhi_epi16(col0, col0); \ ++ row1 = _mm_unpacklo_epi16(row3, row3); \ ++ row3 = _mm_unpackhi_epi16(row3, row3); \ ++ row5 = _mm_unpacklo_epi16(row7, row7); \ ++ row7 = _mm_unpackhi_epi16(row7, row7); \ ++ row0 = _mm_unpacklo_epi64(row1, row1); \ ++ row1 = _mm_unpackhi_epi64(row1, row1); \ ++ row2 = _mm_unpacklo_epi64(row3, row3); \ ++ row3 = _mm_unpackhi_epi64(row3, row3); \ ++ row4 = _mm_unpacklo_epi64(row5, row5); \ ++ row5 = _mm_unpackhi_epi64(row5, row5); \ ++ row6 = _mm_unpacklo_epi64(row7, row7); \ ++ row7 = _mm_unpackhi_epi64(row7, row7); \ ++} ++ ++#ifndef min ++#define min(a, b) ((a) < (b) ? 
(a) : (b)) ++#endif ++ ++#define VEC_LD(a) _mm_loadu_si128((const __m128i*)(a)) ++#define VEC_ST(a, b) _mm_storeu_si128((__m128i*)(a), b) ++#define VEC_LD8(a) _mm_loadl_epi64((const __m128i*)(a)) ++#define VEC_STL(a, b) _mm_storel_epi64((__m128i*)(a), b) ++#define VEC_STH(a, b) _mm_storeh_pd((double*)(a), _mm_castsi128_pd(b)); ++#define VEC_SPLAT(v, i) _mm_shuffle_epi8(v, _mm_set1_epi16((i) * 2 | ((i) * 2 + 1) << 8)) ++ ++#if !defined(__iset__) || __iset__ < 5 ++#define NEED_ALIGN8 ++#define ALIGN8_COMMON uint64_t src_shr; __m64 src_tmp0, src_tmp1; ++#define ALIGN8_VARS(src) __m64 *src##_ptr, src##_next, src##_index; ++#define ALIGN8_START(ptr, src) \ ++ src_shr = (intptr_t)(ptr - 1) & 7; \ ++ src##_ptr = (__m64*)((intptr_t)(ptr - 1) & -8); \ ++ src##_next = src##_ptr[src_shr == 7]; \ ++ src##_index = _mm_add_pi8(_mm_set1_pi8(src_shr), \ ++ _mm_setr_pi8(1, 2, 3, 4, 5, 6, 7, 8)); ++#define ALIGN8_READ16(v0, src, i) \ ++ src_tmp1 = src##_ptr[i * 2 + 1]; \ ++ src_tmp0 = _mm_shuffle2_pi8(src##_next, src_tmp1, src##_index); \ ++ src##_next = src##_ptr[i * 2 + 2]; \ ++ src_tmp1 = _mm_shuffle2_pi8(src_tmp1, src##_next, src##_index); \ ++ v0 = _mm_setr_epi64(src_tmp0, src_tmp1); ++#endif ++ +-- +2.34.1 + diff --git a/libjpeg-turbo.spec b/libjpeg-turbo.spec index 6497ff7..751c31c 100644 --- a/libjpeg-turbo.spec +++ b/libjpeg-turbo.spec @@ -25,6 +25,7 @@ Source0: https://sourceforge.net/projects/libjpeg-turbo/files/%{version}/%{name} Source2: http://jpegclub.org/jpegexiforient.c Source3: http://jpegclub.org/exifautotran.txt Patch0: jpeg-6b-c++fixes.patch +Patch1: libjpeg-turbo-3.0.2-e2k.patch BuildRequires: cmake BuildRequires: libtool >= 1.4 %ifarch %{ix86} x86_64 @@ -159,6 +160,7 @@ have orientation markings in the EXIF data. %prep %setup -q %patch0 -p0 +%patch1 -p1 cp %{SOURCE2} jpegexiforient.c cp %{SOURCE3} exifautotran @@ -174,7 +176,7 @@ pushd jpeg8 CFLAGS="%{optflags} -Ofast -funroll-loops" \ %cmake ../.. 
-DWITH_JPEG8="True" -%make -s +%make_build -s popd # Build jpeg v6.2 API @@ -184,7 +186,7 @@ CFLAGS="%{optflags} -Ofast -funroll-loops" \ -DWITH_ARITH_DEC="True" \ -DWITH_ARITH_ENC="True" -%make -s +%make_build -s popd # Build jpegexiforient binary @@ -192,8 +194,8 @@ popd %install -%makeinstall_std -C jpeg8/build -%makeinstall_std -C jpeg62/build +%make_install -C jpeg8/build +%make_install -C jpeg62/build install -m755 jpegexiforient -D %{buildroot}%{_bindir}/jpegexiforient install -m755 exifautotran -D %{buildroot}%{_bindir}/exifautotran From 2c029484f08f9ab4e05d9bac1435e98abf06da31 Mon Sep 17 00:00:00 2001 From: Alexander Stefanov Date: Sat, 18 May 2024 12:13:00 +0000 Subject: [PATCH 28/32] 3.0.3 --- .abf.yml | 2 +- libjpeg-turbo-3.0.2-e2k.patch | 4852 +-------------------------------- libjpeg-turbo.spec | 4 +- 3 files changed, 9 insertions(+), 4849 deletions(-) diff --git a/.abf.yml b/.abf.yml index a52b704..3b105b8 100644 --- a/.abf.yml +++ b/.abf.yml @@ -1,2 +1,2 @@ sources: - libjpeg-turbo-3.0.2.tar.gz: b6c5d5081ced8502eb1e1e72f1f5cc2856ce90ee + 3.0.3.tar.gz: 397a31222105129c9e798efce2459c445048546e diff --git a/libjpeg-turbo-3.0.2-e2k.patch b/libjpeg-turbo-3.0.2-e2k.patch index a80f1d3..888624d 100644 --- a/libjpeg-turbo-3.0.2-e2k.patch +++ b/libjpeg-turbo-3.0.2-e2k.patch @@ -1,60 +1,8 @@ -From 5c6ff06bc9aec237e1ba222a3dde057cbfa81c9d Mon Sep 17 00:00:00 2001 -From: Ilya Kurdyukov -Date: Fri, 9 Feb 2024 09:57:55 +0700 -Subject: [PATCH] libjpeg-turbo-3.0.2 e2k support - ---- - CMakeLists.txt | 5 + - simd/CMakeLists.txt | 23 ++ - simd/e2k/jccolext-e2k.c | 213 +++++++++++ - simd/e2k/jccolor-e2k.c | 163 +++++++++ - simd/e2k/jchuff-e2k.c | 307 ++++++++++++++++ - simd/e2k/jcphuff-e2k.c | 145 ++++++++ - simd/e2k/jcsample-e2k.c | 203 +++++++++++ - simd/e2k/jcsample.h | 28 ++ - simd/e2k/jdcolext-e2k.c | 258 +++++++++++++ - simd/e2k/jdcolor-e2k.c | 289 +++++++++++++++ - simd/e2k/jdcoltab-e2k.c | 80 ++++ - simd/e2k/jdsample-e2k.c | 389 ++++++++++++++++++++ - simd/e2k/jfdctflt-e2k.c | 127 +++++++ - simd/e2k/jfdctfst-e2k.c | 145 ++++++++ - simd/e2k/jfdctint-e2k.c | 255 +++++++++++++ - simd/e2k/jidctflt-e2k.c | 215 +++++++++++ - simd/e2k/jidctfst-e2k.c | 187 ++++++++++ - simd/e2k/jidctint-e2k.c | 294 +++++++++++++++ - simd/e2k/jquantf-e2k.c | 121 +++++++ - simd/e2k/jquanti-e2k.c | 178 +++++++++ - simd/e2k/jsimd.c | 761 +++++++++++++++++++++++++++++++++++++++ - simd/e2k/jsimd_api_e2k.h | 94 +++++ - simd/e2k/jsimd_e2k.h | 207 +++++++++++ - 23 files changed, 4687 insertions(+) - create mode 100644 simd/e2k/jccolext-e2k.c - create mode 100644 simd/e2k/jccolor-e2k.c - create mode 100644 simd/e2k/jchuff-e2k.c - create mode 100644 simd/e2k/jcphuff-e2k.c - create mode 100644 simd/e2k/jcsample-e2k.c - create mode 100644 simd/e2k/jcsample.h - create mode 100644 simd/e2k/jdcolext-e2k.c - create mode 100644 simd/e2k/jdcolor-e2k.c - create mode 100644 simd/e2k/jdcoltab-e2k.c - create mode 100644 simd/e2k/jdsample-e2k.c - create mode 100644 simd/e2k/jfdctflt-e2k.c - create mode 100644 simd/e2k/jfdctfst-e2k.c - create mode 100644 simd/e2k/jfdctint-e2k.c - create mode 100644 simd/e2k/jidctflt-e2k.c - create mode 100644 simd/e2k/jidctfst-e2k.c - create mode 100644 simd/e2k/jidctint-e2k.c - create mode 100644 simd/e2k/jquantf-e2k.c - create mode 100644 simd/e2k/jquanti-e2k.c - create mode 100644 simd/e2k/jsimd.c - create mode 100644 simd/e2k/jsimd_api_e2k.h - create mode 100644 simd/e2k/jsimd_e2k.h - diff --git a/CMakeLists.txt b/CMakeLists.txt -index adb0ca4..3b445a0 100644 +index ff9c9c2..fa0364c 100644 --- 
a/CMakeLists.txt +++ b/CMakeLists.txt -@@ -125,6 +125,9 @@ elseif(CMAKE_SYSTEM_PROCESSOR_LC STREQUAL "aarch64" OR +@@ -120,6 +120,9 @@ elseif(CMAKE_SYSTEM_PROCESSOR_LC STREQUAL "aarch64" OR elseif(CMAKE_SYSTEM_PROCESSOR_LC MATCHES "^ppc" OR CMAKE_SYSTEM_PROCESSOR_LC MATCHES "^powerpc") set(CPU_TYPE powerpc) @@ -64,20 +12,20 @@ index adb0ca4..3b445a0 100644 else() set(CPU_TYPE ${CMAKE_SYSTEM_PROCESSOR_LC}) endif() -@@ -906,6 +909,8 @@ if(CPU_TYPE STREQUAL "x86_64" OR CPU_TYPE STREQUAL "i386") +@@ -926,6 +929,8 @@ if(CPU_TYPE STREQUAL "x86_64" OR CPU_TYPE STREQUAL "i386") elseif(CPU_TYPE STREQUAL "x86_64") set(DEFAULT_FLOATTEST8 no-fp-contract) endif() +elseif(WITH_SIMD AND CPU_TYPE STREQUAL "e2k") + set(DEFAULT_FLOATTEST8 sse) elseif(CPU_TYPE STREQUAL "powerpc" OR CPU_TYPE STREQUAL "arm64") - if(CMAKE_C_COMPILER_ID STREQUAL "Clang") + if(CMAKE_C_COMPILER_ID MATCHES "Clang") if(CMAKE_C_COMPILER_VERSION VERSION_EQUAL 14.0.0 OR diff --git a/simd/CMakeLists.txt b/simd/CMakeLists.txt -index 6024900..789fa3f 100644 +index 0237955..e61baea 100644 --- a/simd/CMakeLists.txt +++ b/simd/CMakeLists.txt -@@ -531,6 +531,29 @@ if(CMAKE_POSITION_INDEPENDENT_CODE OR ENABLE_SHARED) +@@ -543,6 +543,29 @@ if(CMAKE_POSITION_INDEPENDENT_CODE OR ENABLE_SHARED) set_target_properties(simd PROPERTIES POSITION_INDEPENDENT_CODE 1) endif() @@ -107,4791 +55,3 @@ index 6024900..789fa3f 100644 ############################################################################### # None -diff --git a/simd/e2k/jccolext-e2k.c b/simd/e2k/jccolext-e2k.c -new file mode 100644 -index 0000000..49abdb4 ---- /dev/null -+++ b/simd/e2k/jccolext-e2k.c -@@ -0,0 +1,213 @@ -+/* -+ * Elbrus optimizations for libjpeg-turbo -+ * -+ * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved. -+ * Copyright (C) 2014, Jay Foad. All Rights Reserved. -+ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd -+ * -+ * This software is provided 'as-is', without any express or implied -+ * warranty. In no event will the authors be held liable for any damages -+ * arising from the use of this software. -+ * -+ * Permission is granted to anyone to use this software for any purpose, -+ * including commercial applications, and to alter it and redistribute it -+ * freely, subject to the following restrictions: -+ * -+ * 1. The origin of this software must not be misrepresented; you must not -+ * claim that you wrote the original software. If you use this software -+ * in a product, an acknowledgment in the product documentation would be -+ * appreciated but is not required. -+ * 2. Altered source versions must be plainly marked as such, and must not be -+ * misrepresented as being the original software. -+ * 3. This notice may not be removed or altered from any source distribution. 
-+ */ -+ -+/* This file is included by jccolor-e2k.c */ -+ -+void rgbn_ycc_convert(JDIMENSION img_width, JSAMPARRAY input_buf, -+ JSAMPIMAGE output_buf, JDIMENSION output_row, -+ int num_rows, int shuf_idx) -+{ -+ JSAMPROW inptr, outptr0, outptr1, outptr2; -+ unsigned char __attribute__((aligned(16))) tmpbuf[PIXELSIZE * 16]; -+ -+ __m128i pb_zero = _mm_setzero_si128(); -+ __m128i pb_shuf0 = VEC_LD(rgb_ycc_shuf_const[shuf_idx]); -+#if PIXELSIZE == 4 -+ __m128i rgb3 = pb_zero; -+#else -+ __m128i pb_shuf4 = VEC_LD(rgb_ycc_shuf_const[shuf_idx] + 16); -+#endif -+ __m128i rgb0, rgb1 = pb_zero, rgb2 = pb_zero, -+ rgbg0, rgbg1, rgbg2, rgbg3, rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3; -+ __m128i y, yl, yh, y0, y1, y2, y3; -+ __m128i cb, cr, crl, crh, cbl, cbh; -+ __m128i cr0, cr1, cr2, cr3, cb0, cb1, cb2, cb3; -+ -+ /* Constants */ -+ __m128i pw_f0299_f0337 = _mm_setr_epi16(__4X2(F_0_299, F_0_337)), -+ pw_f0114_f0250 = _mm_setr_epi16(__4X2(F_0_114, F_0_250)), -+ pw_mf016_mf033 = _mm_setr_epi16(__4X2(-F_0_168, -F_0_331)), -+ pw_mf008_mf041 = _mm_setr_epi16(__4X2(-F_0_081, -F_0_418)), -+ pw_mf050_f000 = _mm_setr_epi16(__4X2(-F_0_500, 0)), -+ pd_onehalf = _mm_set1_epi32(ONE_HALF), -+ pd_onehalfm1_cj = _mm_set1_epi32(ONE_HALF - 1 + (CENTERJSAMPLE << SCALEBITS)); -+#ifdef NEED_ALIGN8 -+ ALIGN8_COMMON -+ ALIGN8_VARS(src) -+#endif -+ -+ if (img_width > 0) -+ while (--num_rows >= 0) { -+ int num_cols; -+ inptr = *input_buf++; -+ outptr0 = output_buf[0][output_row]; -+ outptr1 = output_buf[1][output_row]; -+ outptr2 = output_buf[2][output_row]; -+ output_row++; -+ -+ if (img_width >= 16) { -+#ifdef NEED_ALIGN8 -+ ALIGN8_START(inptr, src) -+ inptr += (img_width & -16) * PIXELSIZE; -+#endif -+ -+ PRAGMA_E2K("ivdep") -+ for (num_cols = img_width; num_cols >= 16; num_cols -= 16, -+ outptr0 += 16, outptr1 += 16, outptr2 += 16) { -+#ifdef NEED_ALIGN8 -+ ALIGN8_READ16(rgb0, src, 0) -+ ALIGN8_READ16(rgb1, src, 1) -+ ALIGN8_READ16(rgb2, src, 2) -+#if PIXELSIZE == 4 -+ ALIGN8_READ16(rgb3, src, 3) -+#endif -+ src_ptr += PIXELSIZE * 2; -+#else -+ rgb0 = VEC_LD(inptr); -+ rgb1 = VEC_LD(inptr + 16); -+ rgb2 = VEC_LD(inptr + 32); -+#if PIXELSIZE == 4 -+ rgb3 = VEC_LD(inptr + 48); -+#endif -+ inptr += PIXELSIZE * 16; -+#endif -+ RGB_SHUFFLE -+ CALC_Y(outptr0) -+ CALC_CC(outptr1, outptr2) -+ } -+ } -+ -+ num_cols = img_width & 15; -+ if (num_cols) { -+ int i; -+ memcpy(tmpbuf, inptr, num_cols * PIXELSIZE); -+ rgb0 = VEC_LD(tmpbuf); -+ rgb1 = VEC_LD(tmpbuf + 16); -+ rgb2 = VEC_LD(tmpbuf + 32); -+#if PIXELSIZE == 4 -+ rgb3 = VEC_LD(tmpbuf + 48); -+#endif -+ RGB_SHUFFLE -+ CALC_Y(tmpbuf) -+ CALC_CC(tmpbuf + 16, tmpbuf + 32) -+ -+ for (i = 0; i < num_cols; i++) { -+ outptr0[i] = tmpbuf[i]; -+ outptr1[i] = tmpbuf[i + 16]; -+ outptr2[i] = tmpbuf[i + 32]; -+ } -+ } -+ } -+} -+ -+void rgbn_gray_convert(JDIMENSION img_width, JSAMPARRAY input_buf, -+ JSAMPIMAGE output_buf, JDIMENSION output_row, -+ int num_rows, int shuf_idx) -+{ -+ JSAMPROW inptr, outptr; -+ uint8_t __attribute__((aligned(16))) tmpbuf[PIXELSIZE * 16]; -+ -+ __m128i pb_zero = _mm_setzero_si128(); -+ __m128i pb_shuf0 = VEC_LD(rgb_ycc_shuf_const[shuf_idx]); -+#if PIXELSIZE == 4 -+ __m128i rgb3 = pb_zero; -+#else -+ __m128i pb_shuf4 = VEC_LD(rgb_ycc_shuf_const[shuf_idx] + 16); -+#endif -+ __m128i rgb0, rgb1 = pb_zero, rgb2 = pb_zero, -+ rgbg0, rgbg1, rgbg2, rgbg3, rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3; -+ __m128i y, yl, yh, y0, y1, y2, y3; -+ -+ /* Constants */ -+ __m128i pw_f0299_f0337 = _mm_setr_epi16(__4X2(F_0_299, F_0_337)), -+ pw_f0114_f0250 = 
_mm_setr_epi16(__4X2(F_0_114, F_0_250)), -+ pd_onehalf = _mm_set1_epi32(ONE_HALF); -+#ifdef NEED_ALIGN8 -+ ALIGN8_COMMON -+ ALIGN8_VARS(src) -+#endif -+ -+ if (img_width > 0) -+ while (--num_rows >= 0) { -+ int num_cols; -+ inptr = *input_buf++; -+ outptr = output_buf[0][output_row]; -+ output_row++; -+ -+ if (img_width >= 16) { -+#ifdef NEED_ALIGN8 -+ ALIGN8_START(inptr, src) -+ inptr += (img_width & -16) * PIXELSIZE; -+#endif -+ -+ PRAGMA_E2K("ivdep") -+ for (num_cols = img_width; num_cols >= 16; num_cols -= 16, -+ outptr += 16) { -+#ifdef NEED_ALIGN8 -+ ALIGN8_READ16(rgb0, src, 0) -+ ALIGN8_READ16(rgb1, src, 1) -+ ALIGN8_READ16(rgb2, src, 2) -+#if PIXELSIZE == 4 -+ ALIGN8_READ16(rgb3, src, 3) -+#endif -+ src_ptr += PIXELSIZE * 2; -+#else -+ rgb0 = VEC_LD(inptr); -+ rgb1 = VEC_LD(inptr + 16); -+ rgb2 = VEC_LD(inptr + 32); -+#if PIXELSIZE == 4 -+ rgb3 = VEC_LD(inptr + 48); -+#endif -+ inptr += PIXELSIZE * 16; -+#endif -+ RGB_SHUFFLE -+ CALC_Y(outptr) -+ } -+ } -+ -+ num_cols = img_width & 15; -+ if (num_cols) { -+ int i; -+ memcpy(tmpbuf, inptr, num_cols * PIXELSIZE); -+ rgb0 = VEC_LD(tmpbuf); -+ rgb1 = VEC_LD(tmpbuf + 16); -+ rgb2 = VEC_LD(tmpbuf + 32); -+#if PIXELSIZE == 4 -+ rgb3 = VEC_LD(tmpbuf + 48); -+#endif -+ RGB_SHUFFLE -+ CALC_Y(tmpbuf) -+ -+ for (i = 0; i < num_cols; i++) { -+ outptr[i] = tmpbuf[i]; -+ } -+ } -+ } -+} -+ -+#undef RGB_SHUFFLE -+#undef PIXELSIZE -+#undef rgbn_ycc_convert -+#undef rgbn_gray_convert -+ -diff --git a/simd/e2k/jccolor-e2k.c b/simd/e2k/jccolor-e2k.c -new file mode 100644 -index 0000000..0af2626 ---- /dev/null -+++ b/simd/e2k/jccolor-e2k.c -@@ -0,0 +1,163 @@ -+/* -+ * Elbrus optimizations for libjpeg-turbo -+ * -+ * Copyright (C) 2014, D. R. Commander. All Rights Reserved. -+ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd -+ * -+ * This software is provided 'as-is', without any express or implied -+ * warranty. In no event will the authors be held liable for any damages -+ * arising from the use of this software. -+ * -+ * Permission is granted to anyone to use this software for any purpose, -+ * including commercial applications, and to alter it and redistribute it -+ * freely, subject to the following restrictions: -+ * -+ * 1. The origin of this software must not be misrepresented; you must not -+ * claim that you wrote the original software. If you use this software -+ * in a product, an acknowledgment in the product documentation would be -+ * appreciated but is not required. -+ * 2. Altered source versions must be plainly marked as such, and must not be -+ * misrepresented as being the original software. -+ * 3. This notice may not be removed or altered from any source distribution. 
-+ */ -+ -+/* RGB --> YCC CONVERSION */ -+ -+#include "jsimd_e2k.h" -+ -+#define F_0_081 5329 /* FIX(0.08131) */ -+#define F_0_114 7471 /* FIX(0.11400) */ -+#define F_0_168 11059 /* FIX(0.16874) */ -+#define F_0_250 16384 /* FIX(0.25000) */ -+#define F_0_299 19595 /* FIX(0.29900) */ -+#define F_0_331 21709 /* FIX(0.33126) */ -+#define F_0_418 27439 /* FIX(0.41869) */ -+#define F_0_500 32768 /* FIX(0.50000) */ -+#define F_0_587 38470 /* FIX(0.58700) */ -+#define F_0_337 (F_0_587 - F_0_250) /* FIX(0.58700) - FIX(0.25000) */ -+#define F_0_413 (65536 - F_0_587) /* FIX(1.00000) - FIX(0.58700) */ -+ -+#define SCALEBITS 16 -+#define ONE_HALF (1 << (SCALEBITS - 1)) -+ -+#define RGBG_INDEX_(name, color, i, x) \ -+ name##_##color + i * name##_PIXELSIZE + x, \ -+ name##_GREEN + i * name##_PIXELSIZE + x -+#define RGBG_INDEX(name, x) \ -+ RGBG_INDEX_(name, RED, 0, x), RGBG_INDEX_(name, RED, 1, x), \ -+ RGBG_INDEX_(name, RED, 2, x), RGBG_INDEX_(name, RED, 3, x), \ -+ RGBG_INDEX_(name, BLUE, 0, x), RGBG_INDEX_(name, BLUE, 1, x), \ -+ RGBG_INDEX_(name, BLUE, 2, x), RGBG_INDEX_(name, BLUE, 3, x) -+ -+static const uint8_t __attribute__((aligned(16))) -+rgb_ycc_shuf_const[7][32] = { -+ { RGBG_INDEX(RGB, 0), RGBG_INDEX(RGB, 4) }, -+ { RGBG_INDEX(EXT_RGB, 0), RGBG_INDEX(EXT_RGB, 4) }, -+ { RGBG_INDEX(EXT_RGBX, 0), RGBG_INDEX(EXT_RGBX, 4) }, -+ { RGBG_INDEX(EXT_BGR, 0), RGBG_INDEX(EXT_BGR, 4) }, -+ { RGBG_INDEX(EXT_BGRX, 0), RGBG_INDEX(EXT_BGRX, 4) }, -+ { RGBG_INDEX(EXT_XBGR, 0), RGBG_INDEX(EXT_XBGR, 4) }, -+ { RGBG_INDEX(EXT_XRGB, 0), RGBG_INDEX(EXT_XRGB, 4) } -+}; -+ -+ /* rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3 -+ * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7 -+ * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb -+ * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf -+ * -+ * rg0 = R0 G0 R1 G1 R2 G2 R3 G3 -+ * bg0 = B0 G0 B1 G1 B2 G2 B3 G3 -+ * ... 
-+ */ -+ -+ /* (Original) -+ * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B -+ * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE -+ * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE -+ * -+ * (This implementation) -+ * Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G -+ * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE -+ * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE -+ */ -+ -+#define CALC_Y(outptr0) \ -+ rg0 = _mm_unpacklo_epi8(rgbg0, pb_zero); \ -+ bg0 = _mm_unpackhi_epi8(rgbg0, pb_zero); \ -+ rg1 = _mm_unpacklo_epi8(rgbg1, pb_zero); \ -+ bg1 = _mm_unpackhi_epi8(rgbg1, pb_zero); \ -+ rg2 = _mm_unpacklo_epi8(rgbg2, pb_zero); \ -+ bg2 = _mm_unpackhi_epi8(rgbg2, pb_zero); \ -+ rg3 = _mm_unpacklo_epi8(rgbg3, pb_zero); \ -+ bg3 = _mm_unpackhi_epi8(rgbg3, pb_zero); \ -+ \ -+ /* Calculate Y values */ \ -+ y0 = _mm_add_epi32(_mm_madd_epi16(rg0, pw_f0299_f0337), pd_onehalf); \ -+ y1 = _mm_add_epi32(_mm_madd_epi16(rg1, pw_f0299_f0337), pd_onehalf); \ -+ y2 = _mm_add_epi32(_mm_madd_epi16(rg2, pw_f0299_f0337), pd_onehalf); \ -+ y3 = _mm_add_epi32(_mm_madd_epi16(rg3, pw_f0299_f0337), pd_onehalf); \ -+ y0 = _mm_add_epi32(_mm_madd_epi16(bg0, pw_f0114_f0250), y0); \ -+ y1 = _mm_add_epi32(_mm_madd_epi16(bg1, pw_f0114_f0250), y1); \ -+ y2 = _mm_add_epi32(_mm_madd_epi16(bg2, pw_f0114_f0250), y2); \ -+ y3 = _mm_add_epi32(_mm_madd_epi16(bg3, pw_f0114_f0250), y3); \ -+ \ -+ yl = _mm_packhi_epi32(y0, y1); \ -+ yh = _mm_packhi_epi32(y2, y3); \ -+ y = _mm_packus_epi16(yl, yh); \ -+ VEC_ST(outptr0, y); -+ -+#define CALC_CC(outptr1, outptr2) \ -+ /* Calculate Cb values */ \ -+ cb0 = _mm_add_epi32(_mm_madd_epi16(rg0, pw_mf016_mf033), pd_onehalfm1_cj); \ -+ cb1 = _mm_add_epi32(_mm_madd_epi16(rg1, pw_mf016_mf033), pd_onehalfm1_cj); \ -+ cb2 = _mm_add_epi32(_mm_madd_epi16(rg2, pw_mf016_mf033), pd_onehalfm1_cj); \ -+ cb3 = _mm_add_epi32(_mm_madd_epi16(rg3, pw_mf016_mf033), pd_onehalfm1_cj); \ -+ cb0 = _mm_sub_epi32(cb0, _mm_madd_epi16(bg0, pw_mf050_f000)); \ -+ cb1 = _mm_sub_epi32(cb1, _mm_madd_epi16(bg1, pw_mf050_f000)); \ -+ cb2 = _mm_sub_epi32(cb2, _mm_madd_epi16(bg2, pw_mf050_f000)); \ -+ cb3 = _mm_sub_epi32(cb3, _mm_madd_epi16(bg3, pw_mf050_f000)); \ -+ \ -+ cbl = _mm_packhi_epi32(cb0, cb1); \ -+ cbh = _mm_packhi_epi32(cb2, cb3); \ -+ cb = _mm_packus_epi16(cbl, cbh); \ -+ VEC_ST(outptr1, cb); \ -+ \ -+ /* Calculate Cr values */ \ -+ cr0 = _mm_add_epi32(_mm_madd_epi16(bg0, pw_mf008_mf041), pd_onehalfm1_cj); \ -+ cr1 = _mm_add_epi32(_mm_madd_epi16(bg1, pw_mf008_mf041), pd_onehalfm1_cj); \ -+ cr2 = _mm_add_epi32(_mm_madd_epi16(bg2, pw_mf008_mf041), pd_onehalfm1_cj); \ -+ cr3 = _mm_add_epi32(_mm_madd_epi16(bg3, pw_mf008_mf041), pd_onehalfm1_cj); \ -+ cr0 = _mm_sub_epi32(cr0, _mm_madd_epi16(rg0, pw_mf050_f000)); \ -+ cr1 = _mm_sub_epi32(cr1, _mm_madd_epi16(rg1, pw_mf050_f000)); \ -+ cr2 = _mm_sub_epi32(cr2, _mm_madd_epi16(rg2, pw_mf050_f000)); \ -+ cr3 = _mm_sub_epi32(cr3, _mm_madd_epi16(rg3, pw_mf050_f000)); \ -+ \ -+ crl = _mm_packhi_epi32(cr0, cr1); \ -+ crh = _mm_packhi_epi32(cr2, cr3); \ -+ cr = _mm_packus_epi16(crl, crh); \ -+ VEC_ST(outptr2, cr); -+ -+ -+#define PIXELSIZE 3 -+#define RGB_SHUFFLE \ -+ rgbg0 = _mm_shuffle_epi8(rgb0, pb_shuf0); \ -+ rgbg1 = _mm_shuffle_epi8(VEC_ALIGNR8(rgb1, rgb0), pb_shuf4); \ -+ rgbg2 = _mm_shuffle_epi8(VEC_ALIGNR8(rgb2, rgb1), pb_shuf0); \ -+ rgbg3 = _mm_shuffle_epi8(rgb2, pb_shuf4); -+ -+#define rgbn_ycc_convert jsimd_rgb3_ycc_convert_e2k -+#define rgbn_gray_convert jsimd_rgb3_gray_convert_e2k -+#include 
"jccolext-e2k.c" -+ -+ -+#define PIXELSIZE 4 -+#define RGB_SHUFFLE \ -+ rgbg0 = _mm_shuffle_epi8(rgb0, pb_shuf0); \ -+ rgbg1 = _mm_shuffle_epi8(rgb1, pb_shuf0); \ -+ rgbg2 = _mm_shuffle_epi8(rgb2, pb_shuf0); \ -+ rgbg3 = _mm_shuffle_epi8(rgb3, pb_shuf0); -+ -+#define rgbn_ycc_convert jsimd_rgb4_ycc_convert_e2k -+#define rgbn_gray_convert jsimd_rgb4_gray_convert_e2k -+#include "jccolext-e2k.c" -+ -diff --git a/simd/e2k/jchuff-e2k.c b/simd/e2k/jchuff-e2k.c -new file mode 100644 -index 0000000..ec4329e ---- /dev/null -+++ b/simd/e2k/jchuff-e2k.c -@@ -0,0 +1,307 @@ -+/* -+ * Elbrus optimizations for libjpeg-turbo -+ * -+ * Copyright (C) 2022, Ilya Kurdyukov for BaseALT, Ltd -+ * -+ * This software is provided 'as-is', without any express or implied -+ * warranty. In no event will the authors be held liable for any damages -+ * arising from the use of this software. -+ * -+ * Permission is granted to anyone to use this software for any purpose, -+ * including commercial applications, and to alter it and redistribute it -+ * freely, subject to the following restrictions: -+ * -+ * 1. The origin of this software must not be misrepresented; you must not -+ * claim that you wrote the original software. If you use this software -+ * in a product, an acknowledgment in the product documentation would be -+ * appreciated but is not required. -+ * 2. Altered source versions must be plainly marked as such, and must not be -+ * misrepresented as being the original software. -+ * 3. This notice may not be removed or altered from any source distribution. -+ * -+ * NOTE: All referenced figures are from -+ * Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994. -+ */ -+ -+/* Encode a single block's worth of coefficients */ -+ -+#include "jsimd_e2k.h" -+ -+#if __SIZEOF_SIZE_T__ != 8 -+#error -+#endif -+ -+typedef unsigned long long bit_buf_type; -+#define BIT_BUF_SIZE 64 -+ -+typedef struct { -+ bit_buf_type put_buffer; /* current bit accumulation buffer */ -+ int free_bits; /* # of bits available in it */ -+ int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */ -+} savable_state; -+ -+typedef struct { -+ JOCTET *next_output_byte; /* => next byte to write in buffer */ -+ size_t free_in_buffer; /* # of byte spaces remaining in buffer */ -+ savable_state cur; /* Current bit buffer & DC state */ -+ j_compress_ptr cinfo; /* dump_buffer needs access to this */ -+ int simd; -+} working_state; -+ -+#define EMIT_BYTE(b) { \ -+ buffer[0] = (JOCTET)(b); \ -+ buffer[1] = 0; \ -+ buffer -= -2 + ((JOCTET)(b) < 0xFF); \ -+} -+ -+#define FLUSH() { \ -+ if (put_buffer & 0x8080808080808080 & ~(put_buffer + 0x0101010101010101)) { \ -+ EMIT_BYTE(put_buffer >> 56) \ -+ EMIT_BYTE(put_buffer >> 48) \ -+ EMIT_BYTE(put_buffer >> 40) \ -+ EMIT_BYTE(put_buffer >> 32) \ -+ EMIT_BYTE(put_buffer >> 24) \ -+ EMIT_BYTE(put_buffer >> 16) \ -+ EMIT_BYTE(put_buffer >> 8) \ -+ EMIT_BYTE(put_buffer ) \ -+ } else { \ -+ *(uint64_t*)buffer = __builtin_bswap64(put_buffer); \ -+ buffer += 8; \ -+ } \ -+} -+ -+#define PUT_AND_FLUSH(code, size) { \ -+ put_buffer = (put_buffer << (size + free_bits)) | (code >> -free_bits); \ -+ FLUSH() \ -+ free_bits += BIT_BUF_SIZE; \ -+ put_buffer = code; \ -+} -+ -+#define PUT_BITS(code, size) { \ -+ free_bits -= size; \ -+ if (free_bits < 0) \ -+ PUT_AND_FLUSH(code, size) \ -+ else \ -+ put_buffer = (put_buffer << size) | code; \ -+} -+ -+#define PUT_CODE(code, size) { \ -+ /* temp &= (((JLONG)1) << nbits) - 1; */ \ -+ /* temp |= code << nbits; */ \ -+ temp = __builtin_e2k_insfd(code, 
__builtin_e2k_insfd(nbits, 6 * 63 + 64, -nbits), temp); \ -+ nbits += size; \ -+ PUT_BITS(temp, nbits) \ -+} -+ -+#define KLOOP_PREPARE(mask, i) \ -+ t0 = _mm_cmpeq_epi8(_mm_packs_epi16(v0, v1), zero); \ -+ t1 = _mm_cmpeq_epi8(_mm_packs_epi16(v2, v3), zero); \ -+ mask = (uint32_t)(_mm_movemask_epi8(t0) | _mm_movemask_epi8(t1) << 16); \ -+ t0 = _mm_add_epi16(_mm_srai_epi16(v0, 15), v0); \ -+ t1 = _mm_add_epi16(_mm_srai_epi16(v1, 15), v1); \ -+ t2 = _mm_add_epi16(_mm_srai_epi16(v2, 15), v2); \ -+ t3 = _mm_add_epi16(_mm_srai_epi16(v3, 15), v3); \ -+ v0 = _mm_abs_epi16(v0); \ -+ v1 = _mm_abs_epi16(v1); \ -+ v2 = _mm_abs_epi16(v2); \ -+ v3 = _mm_abs_epi16(v3); \ -+ VEC_ST(block_nbits + i, v0); \ -+ VEC_ST(block_nbits + i + 8, v1); \ -+ VEC_ST(block_nbits + i + 16, v2); \ -+ VEC_ST(block_nbits + i + 24, v3); \ -+ VEC_ST(block_diff + i, t0); \ -+ VEC_ST(block_diff + i + 8, t1); \ -+ VEC_ST(block_diff + i + 16, t2); \ -+ VEC_ST(block_diff + i + 24, t3); -+ -+#define SHUF16X4(a, b, c, d) _mm_setr_pi8( \ -+ a * 2, a * 2 + 1, b * 2, b * 2 + 1, c * 2, c * 2 + 1, d * 2, d * 2 + 1) -+#define VEC_COMBINE(h0, h1) _mm_unpacklo_epi64( \ -+ _mm_movpi64_epi64(h0), _mm_movpi64_epi64(h1)) -+#define INSFI_M64(a, b, c, d) _mm_cvtsi64_m64(__builtin_e2k_insfd( \ -+ _mm_cvtm64_si64(a), (b & 63) | (d & 63) << 6, _mm_cvtm64_si64(c))) -+ -+GLOBAL(JOCTET *) -+jsimd_huff_encode_one_block_e2k(void *state, JOCTET *buffer, -+ JCOEFPTR block, int last_dc_val, -+ c_derived_tbl *dctbl, c_derived_tbl *actbl) { -+ uint64_t temp, nbits; -+ uint64_t i, r, code, size; -+ uint64_t code_0xf0 = actbl->ehufco[0xf0]; -+ uint64_t size_0xf0 = actbl->ehufsi[0xf0]; -+ -+ working_state *state_ptr = (working_state*)state; -+ bit_buf_type put_buffer = state_ptr->cur.put_buffer; -+ int64_t free_bits = state_ptr->cur.free_bits; -+ -+ __m128i zero = _mm_setzero_si128(); -+ __m128i v0, v1, v2, v3, t0, t1, t2, t3; -+ int64_t mask, mask1; -+ uint16_t __attribute__((aligned(16))) block_nbits[DCTSIZE2]; -+ int16_t __attribute__((aligned(16))) block_diff[DCTSIZE2]; -+ -+#if 1 /* faster this way */ -+ { -+ __m64 d0l, d0h, d1l, d1h, d2l, d2h, d3l, d3h; -+ __m64 d4l, d4h, d5l, d5h, d6l, d6h, d7l, d7h; -+ __m64 h0, h1, h2, h3, r0, r1, c1256 = SHUF16X4(1, 2, 5, 6); -+ -+ d0l = *(__m64*)(block + 8 * 0); d0h = *(__m64*)(block + 8 * 0 + 4); // 0 4 -+ d1l = *(__m64*)(block + 8 * 1); d1h = *(__m64*)(block + 8 * 1 + 4); // 8 12 -+ d2l = *(__m64*)(block + 8 * 2); d2h = *(__m64*)(block + 8 * 2 + 4); // 16 20 -+ d3l = *(__m64*)(block + 8 * 3); d3h = *(__m64*)(block + 8 * 3 + 4); // 24 28 -+ d4l = *(__m64*)(block + 8 * 4); d4h = *(__m64*)(block + 8 * 4 + 4); // 32 36 -+ d5l = *(__m64*)(block + 8 * 5); d5h = *(__m64*)(block + 8 * 5 + 4); // 40 44 -+ d6l = *(__m64*)(block + 8 * 6); d6h = *(__m64*)(block + 8 * 6 + 4); // 48 52 -+ d7l = *(__m64*)(block + 8 * 7); d7h = *(__m64*)(block + 8 * 7 + 4); // 56 60 -+ -+ // d0l[0] d0l[1] d1l[0] d2l[0] -+ // d1l[1] d0l[2] d0l[3] d1l[2] -+ h0 = _mm_unpacklo_pi16(d1l, d2l); -+ r0 = _mm_unpacklo_pi32(d0l, h0); -+ r1 = _mm_shuffle2_pi8(d1l, d0l, SHUF16X4(1, 6, 7, 2)); -+ r0 = _mm_sub_pi16(r0, _mm_cvtsi64_m64((uint16_t)last_dc_val)); -+ v0 = VEC_COMBINE(r0, r1); -+ -+ // d2l[1] d3l[0] d4l[0] d3l[1] -+ // d2l[2] d1l[3] d0h[0] d0h[1] -+ h0 = _mm_srli_si64(_mm_unpacklo_pi32(d2l, d4l), 16); -+ h2 = INSFI_M64(d1l, 0, d2l, 48); -+ r0 = _mm_unpacklo_pi16(h0, d3l); -+ r1 = _mm_alignr_pi8(d0h, h2, 4); -+ v1 = VEC_COMBINE(r0, r1); -+ -+ // d1h[0] d2l[3] d3l[2] d4l[1] -+ // d5l[0] d6l[0] d5l[1] d4l[2] -+ h0 = INSFI_M64(d2l, 32, d1h, 16); -+ h1 = 
INSFI_M64(d4l, -32, d3l, 48); -+ h2 = INSFI_M64(d4l, 16, d6l, 16); -+ r0 = INSFI_M64(h1, 0, h0, 32); -+ r1 = _mm_unpacklo_pi16(d5l, h2); -+ v2 = VEC_COMBINE(r0, r1); -+ -+ // d3l[3] d2h[0] d1h[1] d0h[2] -+ // d0h[3] d1h[2] d2h[1] d3h[0] -+ h0 = _mm_alignr_pi8(d2h, d3l, 6); -+ h1 = INSFI_M64(d0h, 0, d1h, 32); -+ h2 = _mm_unpackhi_pi32(d0h, d1h); -+ h3 = _mm_unpacklo_pi32(d2h, d3h); -+ r0 = INSFI_M64(h1, -16, h0, 32); -+ r1 = _mm_shuffle2_pi8(h2, h3, c1256); -+ v3 = VEC_COMBINE(r0, r1); -+ -+ KLOOP_PREPARE(mask, 0) -+ -+ // d4l[3] d5l[2] d6l[1] d7l[0] -+ // d7l[1] d6l[2] d5l[3] d4h[0] -+ h0 = _mm_unpackhi_pi32(d4l, d5l); -+ h1 = _mm_unpacklo_pi32(d6l, d7l); -+ h2 = INSFI_M64(d6l, 0, d7l, 32); -+ h2 = INSFI_M64(d5l, 0, h2, 48); -+ r0 = _mm_shuffle2_pi8(h0, h1, c1256); -+ r1 = _mm_alignr_pi8(d4h, h2, 2); -+ v0 = VEC_COMBINE(r0, r1); -+ -+ // d3h[1] d2h[2] d1h[3] d2h[3] -+ // d3h[2] d4h[1] d5h[0] d6l[3] -+ h0 = _mm_slli_si64(INSFI_M64(d1h, 16, d3h, 32), 16); -+ h2 = INSFI_M64(d4h, -32, d3h, 48); -+ h3 = INSFI_M64(d6l, 32, d5h, 16); -+ r0 = _mm_unpackhi_pi16(h0, d2h); -+ r1 = _mm_alignr_pi8(h3, h2, 4); -+ v1 = VEC_COMBINE(r0, r1); -+ -+ // d7l[2] d7l[3] d6h[0] d5h[1] -+ // d4h[2] d3h[3] d4h[3] d5h[2] -+ h0 = INSFI_M64(d5h, 0, d6h, 16); -+ h2 = _mm_slli_si64(_mm_unpackhi_pi32(d3h, d5h), 16); -+ r0 = _mm_alignr_pi8(h0, d7l, 4); -+ r1 = _mm_unpackhi_pi16(d4h, h2); -+ v2 = VEC_COMBINE(r0, r1); -+ -+ // d6h[1] d7h[0] d7h[1] d6h[2] -+ // d5h[3] d6h[3] d7h[2] d7h[3] -+ h0 = INSFI_M64(d6h, -16, d7h, 32); -+ h2 = _mm_unpackhi_pi16(d5h, d6h); -+ r0 = _mm_shuffle_pi16(h0, 0xd2); -+ r1 = _mm_unpackhi_pi32(h2, d7h); -+ v3 = VEC_COMBINE(r0, r1); -+ } -+#else -+ v0 = _mm_setr_epi16( -+ block[0] - last_dc_val, block[1], block[8], block[16], -+ block[9], block[2], block[3], block[10]); -+ v1 = _mm_setr_epi16( -+ block[17], block[24], block[32], block[25], -+ block[18], block[11], block[4], block[5]); -+ v2 = _mm_setr_epi16( -+ block[12], block[19], block[26], block[33], -+ block[40], block[48], block[41], block[34]); -+ v3 = _mm_setr_epi16( -+ block[27], block[20], block[13], block[6], -+ block[7], block[14], block[21], block[28]); -+ -+ KLOOP_PREPARE(mask, 0) -+ -+ v0 = _mm_setr_epi16( -+ block[35], block[42], block[49], block[56], -+ block[57], block[50], block[43], block[36]); -+ v1 = _mm_setr_epi16( -+ block[29], block[22], block[15], block[23], -+ block[30], block[37], block[44], block[51]); -+ v2 = _mm_setr_epi16( -+ block[58], block[59], block[52], block[45], -+ block[38], block[31], block[39], block[46]); -+ v3 = _mm_setr_epi16( -+ block[53], block[60], block[61], block[54], -+ block[47], block[55], block[62], block[63]); -+#endif -+ -+ KLOOP_PREPARE(mask1, 32) -+ mask |= mask1 << 32; -+ mask = ~mask; -+ -+ /* Encode the DC coefficient difference per section F.1.2.1 */ -+ -+ nbits = block_nbits[0]; -+ temp = block_diff[0]; -+ nbits = nbits ? 
32 - __builtin_clz(nbits) : 0; -+ -+ /* Emit the Huffman-coded symbol for the number of bits */ -+ code = dctbl->ehufco[nbits]; -+ size = dctbl->ehufsi[nbits]; -+ PUT_CODE(code, size) -+ -+ /* Encode the AC coefficients per section F.1.2.2 */ -+ -+ /* e2k doesn't have a tzcnt instruction */ -+ mask = __builtin_e2k_bitrevd(mask) << 1; -+ -+ for (i = 1; mask; i++, mask <<= 1) { -+ r = __builtin_clzll(mask); -+ mask <<= r; -+ i += r; -+ /* if run length > 15, must emit special run-length-16 codes (0xF0) */ -+ while (r > 15) { -+ PUT_BITS(code_0xf0, size_0xf0) -+ r -= 16; -+ } -+ nbits = block_nbits[i]; -+ temp = block_diff[i]; -+ nbits = 32 - __builtin_clz(nbits); -+ /* Emit Huffman symbol for run length / number of bits */ -+ /* r = r << 4 | nbits; */ -+ r = __builtin_e2k_insfd(r, 4 * 63 + 64, nbits); -+ code = actbl->ehufco[r]; -+ size = actbl->ehufsi[r]; -+ PUT_CODE(code, size) -+ } -+ -+ if (i != 64) { -+ PUT_BITS(actbl->ehufco[0], actbl->ehufsi[0]) -+ } -+ -+ state_ptr->cur.put_buffer = put_buffer; -+ state_ptr->cur.free_bits = free_bits; -+ return buffer; -+} -diff --git a/simd/e2k/jcphuff-e2k.c b/simd/e2k/jcphuff-e2k.c -new file mode 100644 -index 0000000..f69afeb ---- /dev/null -+++ b/simd/e2k/jcphuff-e2k.c -@@ -0,0 +1,145 @@ -+/* -+ * Elbrus optimizations for libjpeg-turbo -+ * -+ * Copyright (C) 2022, Ilya Kurdyukov for BaseALT, Ltd -+ * -+ * This software is provided 'as-is', without any express or implied -+ * warranty. In no event will the authors be held liable for any damages -+ * arising from the use of this software. -+ * -+ * Permission is granted to anyone to use this software for any purpose, -+ * including commercial applications, and to alter it and redistribute it -+ * freely, subject to the following restrictions: -+ * -+ * 1. The origin of this software must not be misrepresented; you must not -+ * claim that you wrote the original software. If you use this software -+ * in a product, an acknowledgment in the product documentation would be -+ * appreciated but is not required. -+ * 2. Altered source versions must be plainly marked as such, and must not be -+ * misrepresented as being the original software. -+ * 3. This notice may not be removed or altered from any source distribution. -+ */ -+ -+#include "jsimd_e2k.h" -+ -+#define X(i) coefs[i] = block[jpeg_natural_order_start[i]]; -+#define Y(i) coefs[i] = i < rem ? 
block[jpeg_natural_order_start[i]] : 0; -+ -+#define LOOP \ -+ for (i = 0; i < Sl >> 4; i++) { \ -+ X(0) X(1) X(2) X(3) X(4) X(5) X(6) X(7) \ -+ X(8) X(9) X(10) X(11) X(12) X(13) X(14) X(15) \ -+ BLOCK16 \ -+ jpeg_natural_order_start += 16; \ -+ } \ -+ rem = Sl & 15; \ -+ if (Sl & 8) { \ -+ X(0) X(1) X(2) X(3) X(4) X(5) X(6) X(7) \ -+ Y(8) Y(9) Y(10) Y(11) Y(12) Y(13) Y(14) \ -+ coefs[15] = 0; \ -+ BLOCK16 \ -+ } else if (rem > 0) { \ -+ Y(0) Y(1) Y(2) Y(3) Y(4) Y(5) Y(6) Y(7) \ -+ BLOCK8 \ -+ } -+ -+void jsimd_encode_mcu_AC_first_prepare_e2k -+ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al, -+ JCOEF *values, size_t *zerobits) -+{ -+ JCOEF *diff = values + DCTSIZE2; -+ int16_t __attribute__((aligned(16))) coefs[16]; -+ __m128i v0, v1, v2, v3; -+ __m128i c0 = _mm_setzero_si128(), shr = _mm_cvtsi32_si128(Al); -+ int i, rem; -+ -+#define BLOCK16 \ -+ v0 = _mm_load_si128((__m128i*)coefs); \ -+ v1 = _mm_load_si128((__m128i*)coefs + 1); \ -+ v2 = _mm_srai_epi16(v0, 15); \ -+ v3 = _mm_srai_epi16(v1, 15); \ -+ v0 = _mm_sra_epi16(_mm_abs_epi16(v0), shr); \ -+ v1 = _mm_sra_epi16(_mm_abs_epi16(v1), shr); \ -+ v2 = _mm_xor_si128(v0, v2); \ -+ v3 = _mm_xor_si128(v1, v3); \ -+ _mm_store_si128((__m128i*)values, v0); \ -+ _mm_store_si128((__m128i*)values + 1, v1); \ -+ _mm_store_si128((__m128i*)diff, v2); \ -+ _mm_store_si128((__m128i*)diff + 1, v3); \ -+ v2 = _mm_cmpeq_epi8(_mm_packus_epi16(v0, v1), c0); \ -+ ((uint16_t*)zerobits)[i] = ~_mm_movemask_epi8(v2); \ -+ values += 16; diff += 16; -+ -+#define BLOCK8 \ -+ v0 = _mm_load_si128((__m128i*)coefs); \ -+ v2 = _mm_srai_epi16(v0, 15); \ -+ v0 = _mm_sra_epi16(_mm_abs_epi16(v0), shr); \ -+ v2 = _mm_xor_si128(v0, v2); \ -+ _mm_store_si128((__m128i*)values, v0); \ -+ _mm_store_si128((__m128i*)diff, v2); \ -+ v2 = _mm_cmpeq_epi8(_mm_packus_epi16(v0, c0), c0); \ -+ ((uint16_t*)zerobits)[i] = ~_mm_movemask_epi8(v2); \ -+ values += 8; diff += 8; -+ -+ ((uint64_t*)zerobits)[0] = 0; -+ LOOP -+#undef BLOCK16 -+#undef BLOCK8 -+ -+ for (i = (64 - Sl) >> 3; i; i--) { -+ _mm_store_si128((__m128i*)values, c0); -+ _mm_store_si128((__m128i*)diff, c0); -+ values += 8; diff += 8; -+ } -+} -+ -+int jsimd_encode_mcu_AC_refine_prepare_e2k -+ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al, -+ JCOEF *absvalues, size_t *bits) -+{ -+ union { uint64_t q; uint16_t w[4]; } mask1 = { 0 }; -+ int16_t __attribute__((aligned(16))) coefs[16]; -+ __m128i v0, v1, v2, c1 = _mm_set1_epi8(1); -+ __m128i c0 = _mm_setzero_si128(), shr = _mm_cvtsi32_si128(Al); -+ int i, rem; -+ -+#define BLOCK16 \ -+ v0 = _mm_load_si128((__m128i*)coefs); \ -+ v1 = _mm_load_si128((__m128i*)coefs + 1); \ -+ v2 = _mm_packs_epi16(v0, v1); \ -+ ((uint16_t*)bits)[4 + i] = ~_mm_movemask_epi8(v2); \ -+ v0 = _mm_sra_epi16(_mm_abs_epi16(v0), shr); \ -+ v1 = _mm_sra_epi16(_mm_abs_epi16(v1), shr); \ -+ v2 = _mm_cmpeq_epi8(_mm_packus_epi16(v0, v1), c0); \ -+ ((uint16_t*)bits)[i] = ~_mm_movemask_epi8(v2); \ -+ _mm_store_si128((__m128i*)absvalues, v0); \ -+ _mm_store_si128((__m128i*)absvalues + 1, v1); \ -+ v2 = _mm_cmpeq_epi8(_mm_packus_epi16(v0, v1), c1); \ -+ mask1.w[i] = _mm_movemask_epi8(v2); \ -+ absvalues += 16; -+ -+#define BLOCK8 \ -+ v0 = _mm_load_si128((__m128i*)coefs); \ -+ v2 = _mm_packs_epi16(v0, c0); \ -+ ((uint16_t*)bits)[4 + i] = ~_mm_movemask_epi8(v2); \ -+ v0 = _mm_sra_epi16(_mm_abs_epi16(v0), shr); \ -+ v2 = _mm_cmpeq_epi8(_mm_packus_epi16(v0, c0), c0); \ -+ ((uint16_t*)bits)[i] = ~_mm_movemask_epi8(v2); \ -+ _mm_store_si128((__m128i*)absvalues, v0); \ -+ 
v2 = _mm_cmpeq_epi8(_mm_packus_epi16(v0, c0), c1); \ -+ mask1.w[i] = _mm_movemask_epi8(v2); \ -+ absvalues += 8; -+ -+ ((uint64_t*)bits)[0] = 0; /* zero */ -+ ((uint64_t*)bits)[1] = 0; /* sign */ -+ LOOP -+#undef BLOCK16 -+#undef BLOCK8 -+ -+ for (i = (64 - Sl) >> 3; i; i--) { -+ _mm_store_si128((__m128i*)absvalues, c0); -+ absvalues += 8; -+ } -+ -+ return 63 - __builtin_clzll(mask1.q | 1); -+} -diff --git a/simd/e2k/jcsample-e2k.c b/simd/e2k/jcsample-e2k.c -new file mode 100644 -index 0000000..cac8897 ---- /dev/null -+++ b/simd/e2k/jcsample-e2k.c -@@ -0,0 +1,203 @@ -+/* -+ * Elbrus optimizations for libjpeg-turbo -+ * -+ * Copyright (C) 2015, D. R. Commander. All Rights Reserved. -+ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd -+ * -+ * This software is provided 'as-is', without any express or implied -+ * warranty. In no event will the authors be held liable for any damages -+ * arising from the use of this software. -+ * -+ * Permission is granted to anyone to use this software for any purpose, -+ * including commercial applications, and to alter it and redistribute it -+ * freely, subject to the following restrictions: -+ * -+ * 1. The origin of this software must not be misrepresented; you must not -+ * claim that you wrote the original software. If you use this software -+ * in a product, an acknowledgment in the product documentation would be -+ * appreciated but is not required. -+ * 2. Altered source versions must be plainly marked as such, and must not be -+ * misrepresented as being the original software. -+ * 3. This notice may not be removed or altered from any source distribution. -+ */ -+ -+/* CHROMA DOWNSAMPLING */ -+ -+#include "jsimd_e2k.h" -+#include "jcsample.h" -+ -+void jsimd_h2v1_downsample_e2k(JDIMENSION image_width, -+ int max_v_samp_factor, -+ JDIMENSION v_samp_factor, -+ JDIMENSION width_in_blocks, -+ JSAMPARRAY input_data, -+ JSAMPARRAY output_data) -+{ -+ int outcol; -+ JDIMENSION output_cols = width_in_blocks * DCTSIZE, outrow; -+ JSAMPROW inptr, outptr; -+ -+ __m128i this0, next0, out; -+ __m128i this0e, this0o, next0e, next0o, outl, outh; -+ -+ /* Constants */ -+ __m128i pw_bias = _mm_set1_epi32(1 << 16), -+ even_mask = _mm_set1_epi16(255); -+#ifdef NEED_ALIGN8 -+ ALIGN8_COMMON -+ ALIGN8_VARS(src) -+#endif -+ -+ expand_right_edge(input_data, max_v_samp_factor, -+ image_width, output_cols * 2); -+ -+ if (output_cols > 0) -+ for (outrow = 0; outrow < v_samp_factor; outrow++) { -+ outptr = output_data[outrow]; -+ inptr = input_data[outrow]; -+ -+#ifdef NEED_ALIGN8 -+ ALIGN8_START(inptr, src) -+#endif -+ PRAGMA_E2K("ivdep") -+ for (outcol = output_cols; outcol > 8; -+ outcol -= 16, outptr += 16) { -+ -+#ifdef NEED_ALIGN8 -+ ALIGN8_READ16(this0, src, 0) -+ ALIGN8_READ16(next0, src, 1) -+ src_ptr += 4; -+#else -+ this0 = VEC_LD(inptr); -+ next0 = VEC_LD(inptr + 16); -+ inptr += 32; -+#endif -+ this0e = _mm_and_si128(this0, even_mask); -+ this0o = _mm_srli_epi16(this0, 8); -+ outl = _mm_add_epi16(this0e, this0o); -+ outl = _mm_srli_epi16(_mm_add_epi16(outl, pw_bias), 1); -+ next0e = _mm_and_si128(next0, even_mask); -+ next0o = _mm_srli_epi16(next0, 8); -+ outh = _mm_add_epi16(next0e, next0o); -+ outh = _mm_srli_epi16(_mm_add_epi16(outh, pw_bias), 1); -+ -+ out = _mm_packus_epi16(outl, outh); -+ VEC_ST(outptr, out); -+ } -+ if (outcol > 0) { -+#ifdef NEED_ALIGN8 -+ ALIGN8_READ16(this0, src, 0) -+#else -+ this0 = VEC_LD(inptr); -+#endif -+ this0e = _mm_and_si128(this0, even_mask); -+ this0o = _mm_srli_epi16(this0, 8); -+ outl = _mm_add_epi16(this0e, 
this0o); -+ outl = _mm_srli_epi16(_mm_add_epi16(outl, pw_bias), 1); -+ -+ out = _mm_packus_epi16(outl, outl); -+ VEC_STL(outptr, out); -+ } -+ } -+} -+ -+ -+void jsimd_h2v2_downsample_e2k(JDIMENSION image_width, int max_v_samp_factor, -+ JDIMENSION v_samp_factor, -+ JDIMENSION width_in_blocks, -+ JSAMPARRAY input_data, JSAMPARRAY output_data) -+{ -+ int outcol; -+ JDIMENSION output_cols = width_in_blocks * DCTSIZE, outrow; -+ JSAMPROW inptr0, inptr1, outptr; -+ -+ __m128i this0, next0, this1, next1, out; -+ __m128i this0e, this0o, next0e, next0o, this1e, this1o, -+ next1e, next1o, out0l, out0h, out1l, out1h, outl, outh; -+ -+ /* Constants */ -+ __m128i pw_bias = _mm_set1_epi32(1 | 2 << 16), -+ even_mask = _mm_set1_epi16(255); -+#ifdef NEED_ALIGN8 -+ ALIGN8_COMMON -+ ALIGN8_VARS(src0) -+ ALIGN8_VARS(src1) -+#endif -+ -+ expand_right_edge(input_data, max_v_samp_factor, -+ image_width, output_cols * 2); -+ -+ if (output_cols > 0) -+ for (outrow = 0; outrow < v_samp_factor; outrow++) { -+ inptr0 = input_data[outrow * 2]; -+ inptr1 = input_data[outrow * 2 + 1]; -+ outptr = output_data[outrow]; -+ -+#ifdef NEED_ALIGN8 -+ ALIGN8_START(inptr0, src0) -+ ALIGN8_START(inptr1, src1) -+#endif -+ PRAGMA_E2K("ivdep") -+ for (outcol = output_cols; outcol > 8; -+ outcol -= 16, outptr += 16) { -+ -+#ifdef NEED_ALIGN8 -+ ALIGN8_READ16(this0, src0, 0) src0_ptr += 2; -+ ALIGN8_READ16(this1, src1, 0) src1_ptr += 2; -+#else -+ this0 = VEC_LD(inptr0); inptr0 += 16; -+ this1 = VEC_LD(inptr1); inptr1 += 16; -+#endif -+ this0e = _mm_and_si128(this0, even_mask); -+ this1e = _mm_and_si128(this1, even_mask); -+ this0o = _mm_srli_epi16(this0, 8); -+ this1o = _mm_srli_epi16(this1, 8); -+ out0l = _mm_add_epi16(this0e, this0o); -+ out1l = _mm_add_epi16(this1e, this1o); -+ -+ outl = _mm_add_epi16(out0l, out1l); -+ outl = _mm_srli_epi16(_mm_add_epi16(outl, pw_bias), 2); -+ -+#ifdef NEED_ALIGN8 -+ ALIGN8_READ16(next0, src0, 0) src0_ptr += 2; -+ ALIGN8_READ16(next1, src1, 0) src1_ptr += 2; -+#else -+ next0 = VEC_LD(inptr0); inptr0 += 16; -+ next1 = VEC_LD(inptr1); inptr1 += 16; -+#endif -+ next0e = _mm_and_si128(next0, even_mask); -+ next1e = _mm_and_si128(next1, even_mask); -+ next0o = _mm_srli_epi16(next0, 8); -+ next1o = _mm_srli_epi16(next1, 8); -+ out0h = _mm_add_epi16(next0e, next0o); -+ out1h = _mm_add_epi16(next1e, next1o); -+ -+ outh = _mm_add_epi16(out0h, out1h); -+ outh = _mm_srli_epi16(_mm_add_epi16(outh, pw_bias), 2); -+ -+ out = _mm_packus_epi16(outl, outh); -+ VEC_ST(outptr, out); -+ } -+ if (outcol > 0) { -+#ifdef NEED_ALIGN8 -+ ALIGN8_READ16(this0, src0, 0) -+ ALIGN8_READ16(this1, src1, 0) -+#else -+ this0 = VEC_LD(inptr0); -+ this1 = VEC_LD(inptr1); -+#endif -+ this0e = _mm_and_si128(this0, even_mask); -+ this1e = _mm_and_si128(this1, even_mask); -+ this0o = _mm_srli_epi16(this0, 8); -+ this1o = _mm_srli_epi16(this1, 8); -+ out0l = _mm_add_epi16(this0e, this0o); -+ out1l = _mm_add_epi16(this1e, this1o); -+ -+ outl = _mm_add_epi16(out0l, out1l); -+ outl = _mm_srli_epi16(_mm_add_epi16(outl, pw_bias), 2); -+ -+ out = _mm_packus_epi16(outl, outl); -+ VEC_STL(outptr, out); -+ } -+ } -+} -diff --git a/simd/e2k/jcsample.h b/simd/e2k/jcsample.h -new file mode 100644 -index 0000000..2ac4816 ---- /dev/null -+++ b/simd/e2k/jcsample.h -@@ -0,0 +1,28 @@ -+/* -+ * jcsample.h -+ * -+ * This file was part of the Independent JPEG Group's software: -+ * Copyright (C) 1991-1996, Thomas G. Lane. -+ * For conditions of distribution and use, see the accompanying README.ijg -+ * file. 
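-+ *
-+ * NOTE: expand_right_edge() below pads each row by replicating its last
-+ * column, so the vectorized downsamplers can read a full 16-byte block
-+ * past the nominal image width without hitting uninitialized samples.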
-+ */ -+ -+LOCAL(void) -+expand_right_edge(JSAMPARRAY image_data, int num_rows, JDIMENSION input_cols, -+ JDIMENSION output_cols) -+{ -+ register JSAMPROW ptr; -+ register JSAMPLE pixval; -+ register int count; -+ int row; -+ int numcols = (int)(output_cols - input_cols); -+ -+ if (numcols > 0) { -+ for (row = 0; row < num_rows; row++) { -+ ptr = image_data[row] + input_cols; -+ pixval = ptr[-1]; /* don't need GETJSAMPLE() here */ -+ for (count = numcols; count > 0; count--) -+ *ptr++ = pixval; -+ } -+ } -+} -diff --git a/simd/e2k/jdcolext-e2k.c b/simd/e2k/jdcolext-e2k.c -new file mode 100644 -index 0000000..4f12aef ---- /dev/null -+++ b/simd/e2k/jdcolext-e2k.c -@@ -0,0 +1,258 @@ -+/* -+ * Elbrus optimizations for libjpeg-turbo -+ * -+ * Copyright (C) 2015, D. R. Commander. All Rights Reserved. -+ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd -+ * -+ * This software is provided 'as-is', without any express or implied -+ * warranty. In no event will the authors be held liable for any damages -+ * arising from the use of this software. -+ * -+ * Permission is granted to anyone to use this software for any purpose, -+ * including commercial applications, and to alter it and redistribute it -+ * freely, subject to the following restrictions: -+ * -+ * 1. The origin of this software must not be misrepresented; you must not -+ * claim that you wrote the original software. If you use this software -+ * in a product, an acknowledgment in the product documentation would be -+ * appreciated but is not required. -+ * 2. Altered source versions must be plainly marked as such, and must not be -+ * misrepresented as being the original software. -+ * 3. This notice may not be removed or altered from any source distribution. -+ */ -+ -+/* This file is included by jdcolor-e2k.c */ -+ -+void ycc_rgbn_convert(JDIMENSION out_width, JSAMPIMAGE input_buf, -+ JDIMENSION input_row, JSAMPARRAY output_buf, -+ int num_rows, int shuf_idx) -+{ -+ JSAMPROW outptr, inptr0, inptr1, inptr2; -+ uint8_t __attribute__((aligned(16))) tmpbuf[PIXELSIZE * 16]; -+ -+ __m128i rgb0, rgb1, rgb2, rgb3, y, cb, cr; -+ __m128i rg0, rg1, bx0, bx1, yl, yh, cbl, cbh, -+ crl, crh, rl, rh, gl, gh, bl, bh, g0w, g1w, g2w, g3w; -+ __m128i g0, g1, g2, g3; -+ -+ /* Constants -+ * NOTE: The >> 1 is to compensate for the fact that vec_madds() returns 17 -+ * high-order bits, not 16. 
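-+ * Here the scaling comes from _mm_mulhrs_epi16(), which computes
-+ * (a * b + 0x4000) >> 15; pre-shifting the Q16 FIX() constants right by
-+ * one bit turns them into the Q15 multipliers this intrinsic expects.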
-+ */ -+ __m128i pw_f0402 = _mm_set1_epi16(F_0_402 >> 1), -+ pw_mf0228 = _mm_set1_epi16(-F_0_228 >> 1), -+ pw_mf0344_f0285 = _mm_setr_epi16(__4X2(-F_0_344, F_0_285)), -+ pb_255 = _mm_set1_epi8(-1), -+ pw_cj = _mm_set1_epi16(CENTERJSAMPLE), -+ pd_onehalf = _mm_set1_epi32(ONE_HALF), -+ pb_zero = _mm_setzero_si128(); -+ RGB_SHUFFLE_INIT -+#ifdef NEED_ALIGN8 -+ ALIGN8_COMMON -+ ALIGN8_VARS(src0) -+ ALIGN8_VARS(src1) -+ ALIGN8_VARS(src2) -+#endif -+ -+ if (out_width > 0) -+ while (--num_rows >= 0) { -+ int num_cols; -+ inptr0 = input_buf[0][input_row]; -+ inptr1 = input_buf[1][input_row]; -+ inptr2 = input_buf[2][input_row]; -+ input_row++; -+ outptr = *output_buf++; -+ -+ if (out_width >= 16) { -+#ifdef NEED_ALIGN8 -+ ALIGN8_START(inptr0, src0) -+ ALIGN8_START(inptr1, src1) -+ ALIGN8_START(inptr2, src2) -+ inptr0 += out_width & -16; -+ inptr1 += out_width & -16; -+ inptr2 += out_width & -16; -+#endif -+ PRAGMA_E2K("ivdep") -+ for (num_cols = out_width; num_cols >= 16; -+ num_cols -= 16, outptr += PIXELSIZE * 16) { -+#ifdef NEED_ALIGN8 -+ ALIGN8_READ16(y, src0, 0) src0_ptr += 2; -+ ALIGN8_READ16(cb, src1, 0) src1_ptr += 2; -+ ALIGN8_READ16(cr, src2, 0) src2_ptr += 2; -+#else -+ y = VEC_LD(inptr0); inptr0 += 16; -+ cb = VEC_LD(inptr1); inptr1 += 16; -+ cr = VEC_LD(inptr2); inptr2 += 16; -+#endif -+ CALC_RGB -+ RGB_SHUFFLE -+ VEC_ST(outptr, rgb0); -+ VEC_ST(outptr + 16, rgb1); -+ VEC_ST(outptr + 32, rgb2); -+#if PIXELSIZE == 4 -+ VEC_ST(outptr + 48, rgb3); -+#endif -+ } -+ } -+ -+ num_cols = out_width & 15; -+ if (num_cols) { -+ int i; -+ for (i = 0; i < num_cols; i++) { -+ tmpbuf[i] = inptr0[i]; -+ tmpbuf[i + 16] = inptr1[i]; -+ tmpbuf[i + 32] = inptr2[i]; -+ } -+ y = VEC_LD(tmpbuf); -+ cb = VEC_LD(tmpbuf + 16); -+ cr = VEC_LD(tmpbuf + 32); -+ CALC_RGB -+ RGB_SHUFFLE -+ VEC_ST(tmpbuf, rgb0); -+ VEC_ST(tmpbuf + 16, rgb1); -+ VEC_ST(tmpbuf + 32, rgb2); -+#if PIXELSIZE == 4 -+ VEC_ST(tmpbuf + 48, rgb3); -+#endif -+ memcpy(outptr, tmpbuf, num_cols * PIXELSIZE); -+ } -+ } -+} -+ -+void ycc_rgbn_merged(JDIMENSION out_width, JSAMPIMAGE input_buf, -+ JDIMENSION in_row_group_ctr, -+ JDIMENSION in_row_group_ctr_y, -+ JSAMPARRAY output_buf, int shuf_idx) -+{ -+ JSAMPROW outptr, inptr0, inptr1, inptr2; -+ int num_cols; -+ uint8_t __attribute__((aligned(16))) tmpbuf[4 * 16]; -+ -+ __m128i rgb0, rgb1, rgb2, rgb3, y, cb, cr; -+ __m128i rg0, rg1, bx0, bx1, yl, yh, cbl, cbh, -+ crl, crh, r_yl, r_yh, g_yl, g_yh, b_yl, b_yh, g_y0w, g_y1w, g_y2w, g_y3w, -+ rl, rh, gl, gh, bl, bh; -+ __m128i g_y0, g_y1, g_y2, g_y3; -+ -+ /* Constants -+ * NOTE: The >> 1 is to compensate for the fact that vec_madds() returns 17 -+ * high-order bits, not 16. 
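-+ *
-+ * The merged path evaluates the chroma terms once per Cb/Cr vector
-+ * (CALC_MERGED1) and reuses them for two adjacent luma vectors
-+ * (CALC_MERGED2), widening each chroma word to a pixel pair with
-+ * _mm_unpacklo/hi_epi16.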
-+ */ -+ __m128i pw_f0402 = _mm_set1_epi16(F_0_402 >> 1), -+ pw_mf0228 = _mm_set1_epi16(-F_0_228 >> 1), -+ pw_mf0344_f0285 = _mm_setr_epi16(__4X2(-F_0_344, F_0_285)), -+ pb_255 = _mm_set1_epi8(-1), -+ pw_cj = _mm_set1_epi16(CENTERJSAMPLE), -+ pd_onehalf = _mm_set1_epi32(ONE_HALF), -+ pb_zero = _mm_setzero_si128(); -+ RGB_SHUFFLE_INIT -+#ifdef NEED_ALIGN8 -+ ALIGN8_COMMON -+ ALIGN8_VARS(src0) -+ ALIGN8_VARS(src1) -+ ALIGN8_VARS(src2) -+#endif -+ -+ inptr0 = input_buf[0][in_row_group_ctr_y]; -+ inptr1 = input_buf[1][in_row_group_ctr]; -+ inptr2 = input_buf[2][in_row_group_ctr]; -+ outptr = output_buf[0]; -+ -+ if (out_width >= 32) { -+#ifdef NEED_ALIGN8 -+ ALIGN8_START(inptr0, src0) -+ ALIGN8_START(inptr1, src1) -+ ALIGN8_START(inptr2, src2) -+ inptr0 += out_width & -32; -+ inptr1 += (out_width & -32) >> 1; -+ inptr2 += (out_width & -32) >> 1; -+#endif -+ PRAGMA_E2K("ivdep") -+ for (num_cols = out_width; num_cols >= 32; num_cols -= 32) { -+ -+#ifdef NEED_ALIGN8 -+ ALIGN8_READ16(cb, src1, 0) src1_ptr += 2; -+ ALIGN8_READ16(cr, src2, 0) src2_ptr += 2; -+#else -+ cb = VEC_LD(inptr1); inptr1 += 16; -+ cr = VEC_LD(inptr2); inptr2 += 16; -+#endif -+ CALC_MERGED1 -+ -+#ifdef NEED_ALIGN8 -+ ALIGN8_READ16(y, src0, 0) src0_ptr += 2; -+#else -+ y = VEC_LD(inptr0); inptr0 += 16; -+#endif -+ CALC_MERGED2(r_yl, g_yl, b_yl) -+ RGB_SHUFFLE -+ VEC_ST(outptr, rgb0); -+ VEC_ST(outptr + 16, rgb1); -+ VEC_ST(outptr + 32, rgb2); -+#if PIXELSIZE == 4 -+ VEC_ST(outptr + 48, rgb3); -+#endif -+ outptr += PIXELSIZE * 16; -+ -+#ifdef NEED_ALIGN8 -+ ALIGN8_READ16(y, src0, 0) src0_ptr += 2; -+#else -+ y = VEC_LD(inptr0); inptr0 += 16; -+#endif -+ CALC_MERGED2(r_yh, g_yh, b_yh) -+ RGB_SHUFFLE -+ VEC_ST(outptr, rgb0); -+ VEC_ST(outptr + 16, rgb1); -+ VEC_ST(outptr + 32, rgb2); -+#if PIXELSIZE == 4 -+ VEC_ST(outptr + 48, rgb3); -+#endif -+ outptr += PIXELSIZE * 16; -+ } -+ } -+ -+ num_cols = out_width & 31; -+ if (num_cols) { -+ int i; -+ for (i = 0; i < (num_cols + 1) >> 1; i++) { -+ tmpbuf[i] = inptr1[i]; -+ tmpbuf[i + 16] = inptr2[i]; -+ tmpbuf[i * 2 + 32] = inptr0[i * 2]; -+ tmpbuf[i * 2 + 32 + 1] = inptr0[i * 2 + 1]; -+ } -+ cb = VEC_LD(tmpbuf); -+ cr = VEC_LD(tmpbuf + 16); -+ CALC_MERGED1 -+ -+ y = VEC_LD(tmpbuf + 32); -+ CALC_MERGED2(r_yl, g_yl, b_yl) -+ RGB_SHUFFLE -+ if (num_cols >= 16) { -+ VEC_ST(outptr, rgb0); -+ VEC_ST(outptr + 16, rgb1); -+ VEC_ST(outptr + 32, rgb2); -+#if PIXELSIZE == 4 -+ VEC_ST(outptr + 48, rgb3); -+#endif -+ outptr += PIXELSIZE * 16; -+ -+ y = VEC_LD(tmpbuf + 48); -+ CALC_MERGED2(r_yh, g_yh, b_yh) -+ RGB_SHUFFLE -+ } -+ VEC_ST(tmpbuf, rgb0); -+ VEC_ST(tmpbuf + 16, rgb1); -+ VEC_ST(tmpbuf + 32, rgb2); -+#if PIXELSIZE == 4 -+ VEC_ST(tmpbuf + 48, rgb3); -+#endif -+ memcpy(outptr, tmpbuf, (out_width & 15) * PIXELSIZE); -+ } -+} -+ -+#undef RGB_SHUFFLE_INIT -+#undef RGB_SHUFFLE -+#undef PIXELSIZE -+#undef ycc_rgbn_convert -+#undef ycc_rgbn_merged -+ -diff --git a/simd/e2k/jdcolor-e2k.c b/simd/e2k/jdcolor-e2k.c -new file mode 100644 -index 0000000..94c80e9 ---- /dev/null -+++ b/simd/e2k/jdcolor-e2k.c -@@ -0,0 +1,289 @@ -+/* -+ * Elbrus optimizations for libjpeg-turbo -+ * -+ * Copyright (C) 2015, D. R. Commander. All Rights Reserved. -+ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd -+ * -+ * This software is provided 'as-is', without any express or implied -+ * warranty. In no event will the authors be held liable for any damages -+ * arising from the use of this software. 
-+ * -+ * Permission is granted to anyone to use this software for any purpose, -+ * including commercial applications, and to alter it and redistribute it -+ * freely, subject to the following restrictions: -+ * -+ * 1. The origin of this software must not be misrepresented; you must not -+ * claim that you wrote the original software. If you use this software -+ * in a product, an acknowledgment in the product documentation would be -+ * appreciated but is not required. -+ * 2. Altered source versions must be plainly marked as such, and must not be -+ * misrepresented as being the original software. -+ * 3. This notice may not be removed or altered from any source distribution. -+ */ -+ -+/* YCC --> RGB CONVERSION */ -+ -+#include "jsimd_e2k.h" -+ -+#define F_0_344 22554 /* FIX(0.34414) */ -+#define F_0_714 46802 /* FIX(0.71414) */ -+#define F_1_402 91881 /* FIX(1.40200) */ -+#define F_1_772 116130 /* FIX(1.77200) */ -+#define F_0_402 (F_1_402 - 65536) /* FIX(1.40200) - FIX(1) */ -+#define F_0_285 (65536 - F_0_714) /* FIX(1) - FIX(0.71414) */ -+#define F_0_228 (131072 - F_1_772) /* FIX(2) - FIX(1.77200) */ -+ -+#define SCALEBITS 16 -+#define ONE_HALF (1 << (SCALEBITS - 1)) -+ -+static const uint8_t __attribute__((aligned(16))) -+#if defined(__iset__) && __iset__ >= 5 -+ycc_rgb_shuf_const[7][48] = { -+#define SHUF_CONST3 \ -+ C0, C1, C2, \ -+ C0 + 4, C1 + 4, C2 + 4, \ -+ C0 + 8, C1 + 8, C2 + 8, \ -+ C0 + 12, C1 + 12, C2 + 12, \ -+ C0 + 16, C1 + 16, C2 + 16, \ -+ C0 + 20, C1 + 4, C2 + 4, \ -+ C0 + 8, C1 + 8, C2 + 8, \ -+ C0 + 12, C1 + 12, C2 + 12, \ -+ C0 + 16, C1 + 16, C2 + 16, \ -+ C0 + 20, C1 + 20, C2 + 20, \ -+ C0 + 24, C1 + 24, C2 + 8, \ -+ C0 + 12, C1 + 12, C2 + 12, \ -+ C0 + 16, C1 + 16, C2 + 16, \ -+ C0 + 20, C1 + 20, C2 + 20, \ -+ C0 + 24, C1 + 24, C2 + 24, \ -+ C0 + 28, C1 + 28, C2 + 28 -+#else -+ycc_rgb_shuf_const[7][24] = { -+#define SHUF_CONST3 \ -+ C0, C1, C2, \ -+ C0 + 4, C1 + 4, C2 + 4, \ -+ C0 + 8, C1 + 8, C2, \ -+ C0 + 4, C1 + 4, C2 + 4, \ -+ C0 + 8, C1 + 8, C2 + 8, \ -+ C0 + 12, C1 + 4, C2 + 4, \ -+ C0 + 8, C1 + 8, C2 + 8, \ -+ C0 + 12, C1 + 12, C2 + 12 -+#endif -+ -+#define SHUF_CONST4 C0, C1, C2, C3, C0 + 4, C1 + 4, C2 + 4, C3 + 4, \ -+ C0 + 8, C1 + 8, C2 + 8, C3 + 8, C0 + 12, C1 + 12, C2 + 12, C3 + 12 -+ -+#define TMP_RED RGB_RED -+#define TMP_GREEN RGB_GREEN -+#define TMP_BLUE RGB_BLUE -+#define PIXELSIZE RGB_PIXELSIZE -+#include "jdcoltab-e2k.c" -+ , -+#define TMP_RED EXT_RGB_RED -+#define TMP_GREEN EXT_RGB_GREEN -+#define TMP_BLUE EXT_RGB_BLUE -+#define PIXELSIZE EXT_RGB_PIXELSIZE -+#include "jdcoltab-e2k.c" -+ , -+#define TMP_RED EXT_RGBX_RED -+#define TMP_GREEN EXT_RGBX_GREEN -+#define TMP_BLUE EXT_RGBX_BLUE -+#define PIXELSIZE EXT_RGBX_PIXELSIZE -+#include "jdcoltab-e2k.c" -+ , -+#define TMP_RED EXT_BGR_RED -+#define TMP_GREEN EXT_BGR_GREEN -+#define TMP_BLUE EXT_BGR_BLUE -+#define PIXELSIZE EXT_BGR_PIXELSIZE -+#include "jdcoltab-e2k.c" -+ , -+#define TMP_RED EXT_BGRX_RED -+#define TMP_GREEN EXT_BGRX_GREEN -+#define TMP_BLUE EXT_BGRX_BLUE -+#define PIXELSIZE EXT_BGRX_PIXELSIZE -+#include "jdcoltab-e2k.c" -+ , -+#define TMP_RED EXT_XBGR_RED -+#define TMP_GREEN EXT_XBGR_GREEN -+#define TMP_BLUE EXT_XBGR_BLUE -+#define PIXELSIZE EXT_XBGR_PIXELSIZE -+#include "jdcoltab-e2k.c" -+ , -+#define TMP_RED EXT_XRGB_RED -+#define TMP_GREEN EXT_XRGB_GREEN -+#define TMP_BLUE EXT_XRGB_BLUE -+#define PIXELSIZE EXT_XRGB_PIXELSIZE -+#include "jdcoltab-e2k.c" -+}; -+ -+ /* (Original) -+ * R = Y + 1.40200 * Cr -+ * G = Y - 0.34414 * Cb - 0.71414 * Cr -+ * B = Y + 1.77200 * Cb -+ * 
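-+ * The rewrite below keeps every multiplier small enough for its scaled
-+ * fixed-point constant to fit in a signed 16-bit word; the integer parts
-+ * split off from 1.40200, 0.71414 and 1.77200 are restored with plain
-+ * additions (and subtractions) of Cr and Cb.
-+ *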
-+ * (This implementation) -+ * R = Y + 0.40200 * Cr + Cr -+ * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr -+ * B = Y - 0.22800 * Cb + Cb + Cb -+ */ -+ -+#define CALC_RGB \ -+ yl = _mm_unpacklo_epi8(y, pb_zero); \ -+ yh = _mm_unpackhi_epi8(y, pb_zero); \ -+ \ -+ cbl = _mm_unpacklo_epi8(cb, pb_zero); \ -+ cbh = _mm_unpackhi_epi8(cb, pb_zero); \ -+ cbl = _mm_sub_epi16(cbl, pw_cj); \ -+ cbh = _mm_sub_epi16(cbh, pw_cj); \ -+ \ -+ crl = _mm_unpacklo_epi8(cr, pb_zero); \ -+ crh = _mm_unpackhi_epi8(cr, pb_zero); \ -+ crl = _mm_sub_epi16(crl, pw_cj); \ -+ crh = _mm_sub_epi16(crh, pw_cj); \ -+ \ -+ bl = _mm_mulhrs_epi16(cbl, pw_mf0228); \ -+ bh = _mm_mulhrs_epi16(cbh, pw_mf0228); \ -+ bl = _mm_add_epi16(bl, _mm_add_epi16(cbl, cbl)); \ -+ bh = _mm_add_epi16(bh, _mm_add_epi16(cbh, cbh)); \ -+ bl = _mm_add_epi16(bl, yl); \ -+ bh = _mm_add_epi16(bh, yh); \ -+ \ -+ rl = _mm_mulhrs_epi16(crl, pw_f0402); \ -+ rh = _mm_mulhrs_epi16(crh, pw_f0402); \ -+ rl = _mm_add_epi16(rl, crl); \ -+ rh = _mm_add_epi16(rh, crh); \ -+ rl = _mm_add_epi16(rl, yl); \ -+ rh = _mm_add_epi16(rh, yh); \ -+ \ -+ g0w = _mm_unpacklo_epi16(cbl, crl); \ -+ g1w = _mm_unpackhi_epi16(cbl, crl); \ -+ g0 = _mm_add_epi32(_mm_madd_epi16(g0w, pw_mf0344_f0285), pd_onehalf); \ -+ g1 = _mm_add_epi32(_mm_madd_epi16(g1w, pw_mf0344_f0285), pd_onehalf); \ -+ g2w = _mm_unpacklo_epi16(cbh, crh); \ -+ g3w = _mm_unpackhi_epi16(cbh, crh); \ -+ g2 = _mm_add_epi32(_mm_madd_epi16(g2w, pw_mf0344_f0285), pd_onehalf); \ -+ g3 = _mm_add_epi32(_mm_madd_epi16(g3w, pw_mf0344_f0285), pd_onehalf); \ -+ \ -+ gl = _mm_packhi_epi32(g0, g1); \ -+ gh = _mm_packhi_epi32(g2, g3); \ -+ gl = _mm_sub_epi16(gl, crl); \ -+ gh = _mm_sub_epi16(gh, crh); \ -+ gl = _mm_add_epi16(gl, yl); \ -+ gh = _mm_add_epi16(gh, yh); \ -+ \ -+ rl = _mm_packus_epi16(rl, rh); \ -+ gl = _mm_packus_epi16(gl, gh); \ -+ bl = _mm_packus_epi16(bl, bh); \ -+ \ -+ rg0 = _mm_unpacklo_epi8(rl, gl); \ -+ rg1 = _mm_unpackhi_epi8(rl, gl); \ -+ bx0 = _mm_unpacklo_epi8(bl, pb_255); \ -+ bx1 = _mm_unpackhi_epi8(bl, pb_255); \ -+ \ -+ rgb0 = _mm_unpacklo_epi16(rg0, bx0); \ -+ rgb1 = _mm_unpackhi_epi16(rg0, bx0); \ -+ rgb2 = _mm_unpacklo_epi16(rg1, bx1); \ -+ rgb3 = _mm_unpackhi_epi16(rg1, bx1); -+ -+#define CALC_MERGED1 \ -+ cbl = _mm_unpacklo_epi8(cb, pb_zero); \ -+ cbh = _mm_unpackhi_epi8(cb, pb_zero); \ -+ cbl = _mm_sub_epi16(cbl, pw_cj); \ -+ cbh = _mm_sub_epi16(cbh, pw_cj); \ -+ \ -+ crl = _mm_unpacklo_epi8(cr, pb_zero); \ -+ crh = _mm_unpackhi_epi8(cr, pb_zero); \ -+ crl = _mm_sub_epi16(crl, pw_cj); \ -+ crh = _mm_sub_epi16(crh, pw_cj); \ -+ \ -+ b_yl = _mm_mulhrs_epi16(cbl, pw_mf0228); \ -+ b_yh = _mm_mulhrs_epi16(cbh, pw_mf0228); \ -+ b_yl = _mm_add_epi16(b_yl, _mm_add_epi16(cbl, cbl)); \ -+ b_yh = _mm_add_epi16(b_yh, _mm_add_epi16(cbh, cbh)); \ -+ \ -+ r_yl = _mm_mulhrs_epi16(crl, pw_f0402); \ -+ r_yh = _mm_mulhrs_epi16(crh, pw_f0402); \ -+ r_yl = _mm_add_epi16(r_yl, crl); \ -+ r_yh = _mm_add_epi16(r_yh, crh); \ -+ \ -+ g_y0w = _mm_unpacklo_epi16(cbl, crl); \ -+ g_y1w = _mm_unpackhi_epi16(cbl, crl); \ -+ g_y0 = _mm_add_epi32(_mm_madd_epi16(g_y0w, pw_mf0344_f0285), pd_onehalf); \ -+ g_y1 = _mm_add_epi32(_mm_madd_epi16(g_y1w, pw_mf0344_f0285), pd_onehalf); \ -+ g_y2w = _mm_unpacklo_epi16(cbh, crh); \ -+ g_y3w = _mm_unpackhi_epi16(cbh, crh); \ -+ g_y2 = _mm_add_epi32(_mm_madd_epi16(g_y2w, pw_mf0344_f0285), pd_onehalf); \ -+ g_y3 = _mm_add_epi32(_mm_madd_epi16(g_y3w, pw_mf0344_f0285), pd_onehalf); \ -+ \ -+ g_yl = _mm_packhi_epi32(g_y0, g_y1); \ -+ g_yh = _mm_packhi_epi32(g_y2, g_y3); \ -+ g_yl = 
_mm_sub_epi16(g_yl, crl); \ -+ g_yh = _mm_sub_epi16(g_yh, crh); -+ -+#define CALC_MERGED2(r_yl, g_yl, b_yl) \ -+ yl = _mm_unpacklo_epi8(y, pb_zero); \ -+ yh = _mm_unpackhi_epi8(y, pb_zero); \ -+ bl = _mm_add_epi16(_mm_unpacklo_epi16(b_yl, b_yl), yl); \ -+ bh = _mm_add_epi16(_mm_unpackhi_epi16(b_yl, b_yl), yh); \ -+ rl = _mm_add_epi16(_mm_unpacklo_epi16(r_yl, r_yl), yl); \ -+ rh = _mm_add_epi16(_mm_unpackhi_epi16(r_yl, r_yl), yh); \ -+ gl = _mm_add_epi16(_mm_unpacklo_epi16(g_yl, g_yl), yl); \ -+ gh = _mm_add_epi16(_mm_unpackhi_epi16(g_yl, g_yl), yh); \ -+ rl = _mm_packus_epi16(rl, rh); \ -+ gl = _mm_packus_epi16(gl, gh); \ -+ bl = _mm_packus_epi16(bl, bh); \ -+ \ -+ rg0 = _mm_unpacklo_epi8(rl, gl); \ -+ rg1 = _mm_unpackhi_epi8(rl, gl); \ -+ bx0 = _mm_unpacklo_epi8(bl, pb_255); \ -+ bx1 = _mm_unpackhi_epi8(bl, pb_255); \ -+ \ -+ rgb0 = _mm_unpacklo_epi16(rg0, bx0); \ -+ rgb1 = _mm_unpackhi_epi16(rg0, bx0); \ -+ rgb2 = _mm_unpacklo_epi16(rg1, bx1); \ -+ rgb3 = _mm_unpackhi_epi16(rg1, bx1); -+ -+#define PIXELSIZE 3 -+#if defined(__iset__) && __iset__ >= 5 -+#define RGB_SHUFFLE_INIT __m128i \ -+ rgb_index0 = VEC_LD(ycc_rgb_shuf_const[shuf_idx]), \ -+ rgb_index1 = VEC_LD(ycc_rgb_shuf_const[shuf_idx] + 16), \ -+ rgb_index2 = VEC_LD(ycc_rgb_shuf_const[shuf_idx] + 32); -+#define RGB_SHUFFLE \ -+ rgb0 = _mm_shuffle2_epi8(rgb0, rgb1, rgb_index0); \ -+ rgb1 = _mm_shuffle2_epi8(rgb1, rgb2, rgb_index1); \ -+ rgb2 = _mm_shuffle2_epi8(rgb2, rgb3, rgb_index2); -+#else -+#define RGB_SHUFFLE_INIT __m64 \ -+ rgb_index0 = *(__m64*)ycc_rgb_shuf_const[shuf_idx], \ -+ rgb_index1 = *(__m64*)(ycc_rgb_shuf_const[shuf_idx] + 8), \ -+ rgb_index2 = *(__m64*)(ycc_rgb_shuf_const[shuf_idx] + 16); -+#define RGB_SHUFFLE { \ -+ union { __m128i v; __m64 d[2]; } a = { rgb0 }, \ -+ b = { rgb1 }, c = { rgb2 }, d = { rgb3 }; \ -+ a.d[0] = _mm_shuffle2_pi8(a.d[0], a.d[1], rgb_index0); \ -+ a.d[1] = _mm_shuffle2_pi8(a.d[1], b.d[0], rgb_index1); \ -+ b.d[0] = _mm_shuffle2_pi8(b.d[0], b.d[1], rgb_index2); \ -+ b.d[1] = _mm_shuffle2_pi8(c.d[0], c.d[1], rgb_index0); \ -+ c.d[0] = _mm_shuffle2_pi8(c.d[1], d.d[0], rgb_index1); \ -+ c.d[1] = _mm_shuffle2_pi8(d.d[0], d.d[1], rgb_index2); \ -+ rgb0 = a.v; rgb1 = b.v; rgb2 = c.v; \ -+} -+#endif -+ -+#define ycc_rgbn_convert jsimd_ycc_rgb3_convert_e2k -+#define ycc_rgbn_merged jsimd_ycc_rgb3_merged_upsample_e2k -+#include "jdcolext-e2k.c" -+ -+#define PIXELSIZE 4 -+#define RGB_SHUFFLE_INIT __m128i \ -+ rgb_index0 = VEC_LD(ycc_rgb_shuf_const[shuf_idx]); -+#define RGB_SHUFFLE \ -+ rgb0 = _mm_shuffle_epi8(rgb0, rgb_index0); \ -+ rgb1 = _mm_shuffle_epi8(rgb1, rgb_index0); \ -+ rgb2 = _mm_shuffle_epi8(rgb2, rgb_index0); \ -+ rgb3 = _mm_shuffle_epi8(rgb3, rgb_index0); -+ -+#define ycc_rgbn_convert jsimd_ycc_rgb4_convert_e2k -+#define ycc_rgbn_merged jsimd_ycc_rgb4_merged_upsample_e2k -+#include "jdcolext-e2k.c" -+ -diff --git a/simd/e2k/jdcoltab-e2k.c b/simd/e2k/jdcoltab-e2k.c -new file mode 100644 -index 0000000..e19666d ---- /dev/null -+++ b/simd/e2k/jdcoltab-e2k.c -@@ -0,0 +1,80 @@ -+/* -+ * Elbrus optimizations for libjpeg-turbo -+ * -+ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd -+ * -+ * This software is provided 'as-is', without any express or implied -+ * warranty. In no event will the authors be held liable for any damages -+ * arising from the use of this software. 
-+ * -+ * Permission is granted to anyone to use this software for any purpose, -+ * including commercial applications, and to alter it and redistribute it -+ * freely, subject to the following restrictions: -+ * -+ * 1. The origin of this software must not be misrepresented; you must not -+ * claim that you wrote the original software. If you use this software -+ * in a product, an acknowledgment in the product documentation would be -+ * appreciated but is not required. -+ * 2. Altered source versions must be plainly marked as such, and must not be -+ * misrepresented as being the original software. -+ * 3. This notice may not be removed or altered from any source distribution. -+ */ -+ -+/* This file is included by jdcolor-e2k.c */ -+ -+#if TMP_RED == 0 -+#define C0 0 -+#elif TMP_GREEN == 0 -+#define C0 1 -+#elif TMP_BLUE == 0 -+#define C0 2 -+#else -+#define C0 3 -+#endif -+ -+#if TMP_RED == 1 -+#define C1 0 -+#elif TMP_GREEN == 1 -+#define C1 1 -+#elif TMP_BLUE == 1 -+#define C1 2 -+#else -+#define C1 3 -+#endif -+ -+#if TMP_RED == 2 -+#define C2 0 -+#elif TMP_GREEN == 2 -+#define C2 1 -+#elif TMP_BLUE == 2 -+#define C2 2 -+#else -+#define C2 3 -+#endif -+ -+#if TMP_RED == 3 -+#define C3 0 -+#elif TMP_GREEN == 3 -+#define C3 1 -+#elif TMP_BLUE == 3 -+#define C3 2 -+#else -+#define C3 3 -+#endif -+ -+#if PIXELSIZE == 3 -+{ SHUF_CONST3 } -+#else -+{ SHUF_CONST4 } -+#endif -+ -+#undef C0 -+#undef C1 -+#undef C2 -+#undef C3 -+ -+#undef TMP_RED -+#undef TMP_GREEN -+#undef TMP_BLUE -+#undef PIXELSIZE -+ -diff --git a/simd/e2k/jdsample-e2k.c b/simd/e2k/jdsample-e2k.c -new file mode 100644 -index 0000000..572b3af ---- /dev/null -+++ b/simd/e2k/jdsample-e2k.c -@@ -0,0 +1,389 @@ -+/* -+ * Elbrus optimizations for libjpeg-turbo -+ * -+ * Copyright (C) 2015, D. R. Commander. All Rights Reserved. -+ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd -+ * -+ * This software is provided 'as-is', without any express or implied -+ * warranty. In no event will the authors be held liable for any damages -+ * arising from the use of this software. -+ * -+ * Permission is granted to anyone to use this software for any purpose, -+ * including commercial applications, and to alter it and redistribute it -+ * freely, subject to the following restrictions: -+ * -+ * 1. The origin of this software must not be misrepresented; you must not -+ * claim that you wrote the original software. If you use this software -+ * in a product, an acknowledgment in the product documentation would be -+ * appreciated but is not required. -+ * 2. Altered source versions must be plainly marked as such, and must not be -+ * misrepresented as being the original software. -+ * 3. This notice may not be removed or altered from any source distribution. 
-+ */ -+ -+/* CHROMA UPSAMPLING */ -+ -+#include "jsimd_e2k.h" -+ -+ -+void jsimd_h2v1_fancy_upsample_e2k(int max_v_samp_factor, -+ JDIMENSION downsampled_width, -+ JSAMPARRAY input_data, -+ JSAMPARRAY *output_data_ptr) -+{ -+ JSAMPARRAY output_data = *output_data_ptr; -+ JSAMPROW inptr, outptr; -+ int inrow, incol; -+ -+ __m128i pb_zero = _mm_setzero_si128(); -+ __m128i this0, last0, p_last0, next0 = pb_zero, p_next0, out; -+ __m128i this0l, this0h, last0l, last0h, -+ next0l, next0h, outle, outhe, outlo, outho; -+#ifdef NEED_ALIGN8 -+ ALIGN8_COMMON -+ ALIGN8_VARS(src) -+#endif -+ -+ /* Constants */ -+ __m128i pw_three = _mm_set1_epi16(3), -+ next_index_lastcol = _mm_setr_epi8( -+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15), -+ pw_one = _mm_set1_epi16(1), pw_two = _mm_set1_epi16(2); -+ -+ if (downsampled_width > 0) -+ for (inrow = 0; inrow < max_v_samp_factor; inrow++) { -+ inptr = input_data[inrow]; -+ outptr = output_data[inrow]; -+ -+ if (downsampled_width & 15) -+ inptr[downsampled_width] = inptr[downsampled_width - 1]; -+ -+#ifdef NEED_ALIGN8 -+ ALIGN8_START(inptr, src) -+ ALIGN8_READ16(this0, src, 0) -+#else -+ this0 = VEC_LD(inptr); -+#endif -+ last0 = _mm_bslli_si128(this0, 15); -+ -+ PRAGMA_E2K("ivdep") -+ for (incol = downsampled_width; incol > 0; -+ incol -= 16, outptr += 32) { -+ -+ p_last0 = _mm_alignr_epi8(this0, last0, 15); -+ last0 = this0; -+ -+ if (__builtin_expect(incol <= 16, 0)) -+ p_next0 = _mm_shuffle_epi8(this0, next_index_lastcol); -+ else { -+#ifdef NEED_ALIGN8 -+ ALIGN8_READ16(next0, src, 1) src_ptr += 2; -+#else -+ next0 = VEC_LD(inptr + 16); inptr += 16; -+#endif -+ p_next0 = _mm_alignr_epi8(next0, this0, 1); -+ } -+ -+ this0l = _mm_mullo_epi16(_mm_unpacklo_epi8(this0, pb_zero), pw_three); -+ last0l = _mm_unpacklo_epi8(p_last0, pb_zero); -+ next0l = _mm_unpacklo_epi8(p_next0, pb_zero); -+ last0l = _mm_add_epi16(last0l, pw_one); -+ next0l = _mm_add_epi16(next0l, pw_two); -+ -+ outle = _mm_add_epi16(this0l, last0l); -+ outlo = _mm_add_epi16(this0l, next0l); -+ outle = _mm_srli_epi16(outle, 2); -+ outlo = _mm_srli_epi16(outlo, 2); -+ -+ out = _mm_or_si128(outle, _mm_slli_epi16(outlo, 8)); -+ VEC_ST(outptr, out); -+ -+ if (__builtin_expect(incol <= 8, 0)) break; -+ -+ this0h = _mm_mullo_epi16(_mm_unpackhi_epi8(this0, pb_zero), pw_three); -+ last0h = _mm_unpackhi_epi8(p_last0, pb_zero); -+ next0h = _mm_unpackhi_epi8(p_next0, pb_zero); -+ last0h = _mm_add_epi16(last0h, pw_one); -+ next0h = _mm_add_epi16(next0h, pw_two); -+ -+ outhe = _mm_add_epi16(this0h, last0h); -+ outho = _mm_add_epi16(this0h, next0h); -+ outhe = _mm_srli_epi16(outhe, 2); -+ outho = _mm_srli_epi16(outho, 2); -+ -+ out = _mm_or_si128(outhe, _mm_slli_epi16(outho, 8)); -+ VEC_ST(outptr + 16, out); -+ -+ this0 = next0; -+ } -+ } -+} -+ -+ -+void jsimd_h2v2_fancy_upsample_e2k(int max_v_samp_factor, -+ JDIMENSION downsampled_width, -+ JSAMPARRAY input_data, -+ JSAMPARRAY *output_data_ptr) -+{ -+ JSAMPARRAY output_data = *output_data_ptr; -+ JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1; -+ int inrow, outrow, incol; -+#ifdef NEED_ALIGN8 -+ ALIGN8_COMMON -+ ALIGN8_VARS(src_1) -+ ALIGN8_VARS(src0) -+ ALIGN8_VARS(src1) -+#endif -+ -+ __m128i pb_zero = _mm_setzero_si128(); -+ __m128i this_1, this0, this1, out; -+ __m128i this_1l, this_1h, this0l, this0h, this1l, this1h, -+ lastcolsum_1h, lastcolsum1h, -+ p_lastcolsum_1l, p_lastcolsum_1h, p_lastcolsum1l, p_lastcolsum1h, -+ thiscolsum_1l, thiscolsum_1h, thiscolsum1l, thiscolsum1h, -+ nextcolsum_1l = pb_zero, nextcolsum_1h = pb_zero, -+ 
nextcolsum1l = pb_zero, nextcolsum1h = pb_zero,
-+    p_nextcolsum_1l, p_nextcolsum_1h, p_nextcolsum1l, p_nextcolsum1h,
-+    tmpl, tmph, outle, outhe, outlo, outho;
-+
-+  /* Constants */
-+  __m128i pw_three = _mm_set1_epi16(3),
-+    pw_seven = _mm_set1_epi16(7), pw_eight = _mm_set1_epi16(8),
-+    next_index_lastcol = _mm_setr_epi8(
-+      2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14, 15);
-+
-+  if (downsampled_width > 0)
-+  for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
-+
-+    inptr_1 = input_data[inrow - 1];
-+    inptr0 = input_data[inrow];
-+    inptr1 = input_data[inrow + 1];
-+    outptr0 = output_data[outrow++];
-+    outptr1 = output_data[outrow++];
-+
-+    if (downsampled_width & 15) {
-+      inptr_1[downsampled_width] = inptr_1[downsampled_width - 1];
-+      inptr0[downsampled_width] = inptr0[downsampled_width - 1];
-+      inptr1[downsampled_width] = inptr1[downsampled_width - 1];
-+    }
-+
-+#ifdef NEED_ALIGN8
-+    ALIGN8_START(inptr0, src0)
-+    ALIGN8_START(inptr_1, src_1)
-+    ALIGN8_START(inptr1, src1)
-+    ALIGN8_READ16(this0, src0, 0)
-+    ALIGN8_READ16(this_1, src_1, 0)
-+    ALIGN8_READ16(this1, src1, 0)
-+#else
-+    this0 = VEC_LD(inptr0);
-+    this_1 = VEC_LD(inptr_1);
-+    this1 = VEC_LD(inptr1);
-+#endif
-+
-+    this0l = _mm_unpacklo_epi8(this0, pb_zero);
-+    this0h = _mm_unpackhi_epi8(this0, pb_zero);
-+    this0l = _mm_mullo_epi16(this0l, pw_three);
-+    this0h = _mm_mullo_epi16(this0h, pw_three);
-+
-+    this_1l = _mm_unpacklo_epi8(this_1, pb_zero);
-+    this_1h = _mm_unpackhi_epi8(this_1, pb_zero);
-+    thiscolsum_1l = _mm_add_epi16(this0l, this_1l);
-+    thiscolsum_1h = _mm_add_epi16(this0h, this_1h);
-+    lastcolsum_1h = _mm_bslli_si128(thiscolsum_1l, 14);
-+
-+    this1l = _mm_unpacklo_epi8(this1, pb_zero);
-+    this1h = _mm_unpackhi_epi8(this1, pb_zero);
-+    thiscolsum1l = _mm_add_epi16(this0l, this1l);
-+    thiscolsum1h = _mm_add_epi16(this0h, this1h);
-+    lastcolsum1h = _mm_bslli_si128(thiscolsum1l, 14);
-+
-+    PRAGMA_E2K("ivdep")
-+    for (incol = downsampled_width; incol > 0;
-+        incol -= 16, outptr0 += 32, outptr1 += 32) {
-+
-+      p_lastcolsum_1l = _mm_alignr_epi8(thiscolsum_1l, lastcolsum_1h, 14);
-+      p_lastcolsum_1h = _mm_alignr_epi8(thiscolsum_1h, thiscolsum_1l, 14);
-+      p_lastcolsum1l = _mm_alignr_epi8(thiscolsum1l, lastcolsum1h, 14);
-+      p_lastcolsum1h = _mm_alignr_epi8(thiscolsum1h, thiscolsum1l, 14);
-+      lastcolsum_1h = thiscolsum_1h;
-+      lastcolsum1h = thiscolsum1h;
-+
-+      if (__builtin_expect(incol <= 16, 0)) {
-+        p_nextcolsum_1l = _mm_alignr_epi8(thiscolsum_1h, thiscolsum_1l, 2);
-+        p_nextcolsum_1h = _mm_shuffle_epi8(thiscolsum_1h, next_index_lastcol);
-+        p_nextcolsum1l = _mm_alignr_epi8(thiscolsum1h, thiscolsum1l, 2);
-+        p_nextcolsum1h = _mm_shuffle_epi8(thiscolsum1h, next_index_lastcol);
-+      } else {
-+#ifdef NEED_ALIGN8
-+        ALIGN8_READ16(this0, src0, 1) src0_ptr += 2;
-+        ALIGN8_READ16(this_1, src_1, 1) src_1_ptr += 2;
-+        ALIGN8_READ16(this1, src1, 1) src1_ptr += 2;
-+#else
-+        this0 = VEC_LD(inptr0 + 16); inptr0 += 16;
-+        this_1 = VEC_LD(inptr_1 + 16); inptr_1 += 16;
-+        this1 = VEC_LD(inptr1 + 16); inptr1 += 16;
-+#endif
-+        this0l = _mm_unpacklo_epi8(this0, pb_zero);
-+        this0h = _mm_unpackhi_epi8(this0, pb_zero);
-+        this0l = _mm_mullo_epi16(this0l, pw_three);
-+        this0h = _mm_mullo_epi16(this0h, pw_three);
-+
-+        this_1l = _mm_unpacklo_epi8(this_1, pb_zero);
-+        this_1h = _mm_unpackhi_epi8(this_1, pb_zero);
-+        nextcolsum_1l = _mm_add_epi16(this0l, this_1l);
-+        nextcolsum_1h = _mm_add_epi16(this0h, this_1h);
-+        p_nextcolsum_1l = _mm_alignr_epi8(thiscolsum_1h, thiscolsum_1l, 2);
-+        p_nextcolsum_1h =
_mm_alignr_epi8(nextcolsum_1l, thiscolsum_1h, 2); -+ -+ this1l = _mm_unpacklo_epi8(this1, pb_zero); -+ this1h = _mm_unpackhi_epi8(this1, pb_zero); -+ nextcolsum1l = _mm_add_epi16(this0l, this1l); -+ nextcolsum1h = _mm_add_epi16(this0h, this1h); -+ p_nextcolsum1l = _mm_alignr_epi8(thiscolsum1h, thiscolsum1l, 2); -+ p_nextcolsum1h = _mm_alignr_epi8(nextcolsum1l, thiscolsum1h, 2); -+ } -+ -+ /* Process the upper row */ -+ -+ tmpl = _mm_mullo_epi16(thiscolsum_1l, pw_three); -+ outle = _mm_add_epi16(tmpl, p_lastcolsum_1l); -+ outle = _mm_add_epi16(outle, pw_eight); -+ outle = _mm_srli_epi16(outle, 4); -+ -+ outlo = _mm_add_epi16(tmpl, p_nextcolsum_1l); -+ outlo = _mm_add_epi16(outlo, pw_seven); -+ outlo = _mm_srli_epi16(outlo, 4); -+ -+ out = _mm_or_si128(outle, _mm_slli_epi16(outlo, 8)); -+ VEC_ST(outptr0, out); -+ -+ /* Process the lower row */ -+ -+ tmpl = _mm_mullo_epi16(thiscolsum1l, pw_three); -+ outle = _mm_add_epi16(tmpl, p_lastcolsum1l); -+ outle = _mm_add_epi16(outle, pw_eight); -+ outle = _mm_srli_epi16(outle, 4); -+ -+ outlo = _mm_add_epi16(tmpl, p_nextcolsum1l); -+ outlo = _mm_add_epi16(outlo, pw_seven); -+ outlo = _mm_srli_epi16(outlo, 4); -+ -+ out = _mm_or_si128(outle, _mm_slli_epi16(outlo, 8)); -+ VEC_ST(outptr1, out); -+ -+ if (__builtin_expect(incol <= 8, 0)) break; -+ -+ tmph = _mm_mullo_epi16(thiscolsum_1h, pw_three); -+ outhe = _mm_add_epi16(tmph, p_lastcolsum_1h); -+ outhe = _mm_add_epi16(outhe, pw_eight); -+ outhe = _mm_srli_epi16(outhe, 4); -+ -+ outho = _mm_add_epi16(tmph, p_nextcolsum_1h); -+ outho = _mm_add_epi16(outho, pw_seven); -+ outho = _mm_srli_epi16(outho, 4); -+ -+ out = _mm_or_si128(outhe, _mm_slli_epi16(outho, 8)); -+ VEC_ST(outptr0 + 16, out); -+ -+ tmph = _mm_mullo_epi16(thiscolsum1h, pw_three); -+ outhe = _mm_add_epi16(tmph, p_lastcolsum1h); -+ outhe = _mm_add_epi16(outhe, pw_eight); -+ outhe = _mm_srli_epi16(outhe, 4); -+ -+ outho = _mm_add_epi16(tmph, p_nextcolsum1h); -+ outho = _mm_add_epi16(outho, pw_seven); -+ outho = _mm_srli_epi16(outho, 4); -+ -+ out = _mm_or_si128(outhe, _mm_slli_epi16(outho, 8)); -+ VEC_ST(outptr1 + 16, out); -+ -+ thiscolsum_1l = nextcolsum_1l; thiscolsum_1h = nextcolsum_1h; -+ thiscolsum1l = nextcolsum1l; thiscolsum1h = nextcolsum1h; -+ } -+ } -+} -+ -+ -+/* These are rarely used (mainly just for decompressing YCCK images) */ -+ -+void jsimd_h2v1_upsample_e2k(int max_v_samp_factor, -+ JDIMENSION out_width, -+ JSAMPARRAY input_data, -+ JSAMPARRAY *output_data_ptr) -+{ -+ JSAMPARRAY output_data = *output_data_ptr; -+ JSAMPROW inptr, outptr; -+ int inrow, incol; -+ -+ __m128i in, inl, inh; -+#ifdef NEED_ALIGN8 -+ ALIGN8_COMMON -+ ALIGN8_VARS(src) -+#endif -+ -+ if (out_width > 0) -+ for (inrow = 0; inrow < max_v_samp_factor; inrow++) { -+ inptr = input_data[inrow]; -+ outptr = output_data[inrow]; -+#ifdef NEED_ALIGN8 -+ ALIGN8_START(inptr, src) -+#endif -+ PRAGMA_E2K("ivdep") -+ for (incol = out_width; incol > 0; -+ incol -= 32, outptr += 32) { -+#ifdef NEED_ALIGN8 -+ ALIGN8_READ16(in, src, 0) src_ptr += 2; -+#else -+ in = VEC_LD(inptr); inptr += 16; -+#endif -+ inl = _mm_unpacklo_epi8(in, in); -+ inh = _mm_unpackhi_epi8(in, in); -+ -+ VEC_ST(outptr, inl); -+ VEC_ST(outptr + 16, inh); -+ } -+ } -+} -+ -+ -+void jsimd_h2v2_upsample_e2k(int max_v_samp_factor, -+ JDIMENSION out_width, -+ JSAMPARRAY input_data, -+ JSAMPARRAY *output_data_ptr) -+{ -+ JSAMPARRAY output_data = *output_data_ptr; -+ JSAMPROW inptr, outptr0, outptr1; -+ int inrow, outrow, incol; -+ -+ __m128i in, inl, inh; -+#ifdef NEED_ALIGN8 -+ ALIGN8_COMMON -+ 
ALIGN8_VARS(src) -+#endif -+ -+ if (out_width > 0) -+ for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) { -+ inptr = input_data[inrow]; -+ outptr0 = output_data[outrow++]; -+ outptr1 = output_data[outrow++]; -+#ifdef NEED_ALIGN8 -+ ALIGN8_START(inptr, src) -+#endif -+ PRAGMA_E2K("ivdep") -+ for (incol = out_width; incol > 0; -+ incol -= 32, outptr0 += 32, outptr1 += 32) { -+#ifdef NEED_ALIGN8 -+ ALIGN8_READ16(in, src, 0) src_ptr += 2; -+#else -+ in = VEC_LD(inptr); inptr += 16; -+#endif -+ inl = _mm_unpacklo_epi8(in, in); -+ inh = _mm_unpackhi_epi8(in, in); -+ -+ VEC_ST(outptr0, inl); -+ VEC_ST(outptr1, inl); -+ VEC_ST(outptr0 + 16, inh); -+ VEC_ST(outptr1 + 16, inh); -+ } -+ } -+} -diff --git a/simd/e2k/jfdctflt-e2k.c b/simd/e2k/jfdctflt-e2k.c -new file mode 100644 -index 0000000..e3c4d94 ---- /dev/null -+++ b/simd/e2k/jfdctflt-e2k.c -@@ -0,0 +1,127 @@ -+/* -+ * Elbrus optimizations for libjpeg-turbo -+ * -+ * Copyright (C) 2014, D. R. Commander. All Rights Reserved. -+ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd -+ * -+ * This software is provided 'as-is', without any express or implied -+ * warranty. In no event will the authors be held liable for any damages -+ * arising from the use of this software. -+ * -+ * Permission is granted to anyone to use this software for any purpose, -+ * including commercial applications, and to alter it and redistribute it -+ * freely, subject to the following restrictions: -+ * -+ * 1. The origin of this software must not be misrepresented; you must not -+ * claim that you wrote the original software. If you use this software -+ * in a product, an acknowledgment in the product documentation would be -+ * appreciated but is not required. -+ * 2. Altered source versions must be plainly marked as such, and must not be -+ * misrepresented as being the original software. -+ * 3. This notice may not be removed or altered from any source distribution. 
-+ */ -+ -+/* FLOAT FORWARD DCT */ -+ -+#include "jsimd_e2k.h" -+ -+#define DO_FDCT(in, out) { \ -+ tmp0 = _mm_add_ps(in##0, in##7); \ -+ tmp7 = _mm_sub_ps(in##0, in##7); \ -+ tmp1 = _mm_add_ps(in##1, in##6); \ -+ tmp6 = _mm_sub_ps(in##1, in##6); \ -+ tmp2 = _mm_add_ps(in##2, in##5); \ -+ tmp5 = _mm_sub_ps(in##2, in##5); \ -+ tmp3 = _mm_add_ps(in##3, in##4); \ -+ tmp4 = _mm_sub_ps(in##3, in##4); \ -+ \ -+ /* Even part */ \ -+ \ -+ tmp10 = _mm_add_ps(tmp0, tmp3); \ -+ tmp13 = _mm_sub_ps(tmp0, tmp3); \ -+ tmp11 = _mm_add_ps(tmp1, tmp2); \ -+ tmp12 = _mm_sub_ps(tmp1, tmp2); \ -+ \ -+ out##0 = _mm_add_ps(tmp10, tmp11); \ -+ out##4 = _mm_sub_ps(tmp10, tmp11); \ -+ \ -+ z1 = _mm_mul_ps(_mm_add_ps(tmp12, tmp13), pd_f0707); \ -+ out##2 = _mm_add_ps(tmp13, z1); \ -+ out##6 = _mm_sub_ps(tmp13, z1); \ -+ \ -+ /* Odd part */ \ -+ \ -+ tmp10 = _mm_add_ps(tmp4, tmp5); \ -+ tmp11 = _mm_add_ps(tmp5, tmp6); \ -+ tmp12 = _mm_add_ps(tmp6, tmp7); \ -+ \ -+ z5 = _mm_mul_ps(_mm_sub_ps(tmp10, tmp12), pd_f0382); \ -+ z2 = _mm_add_ps(_mm_mul_ps(tmp10, pd_f0541), z5); \ -+ z4 = _mm_add_ps(_mm_mul_ps(tmp12, pd_f1306), z5); \ -+ z3 = _mm_mul_ps(tmp11, pd_f0707); \ -+ \ -+ z11 = _mm_add_ps(tmp7, z3); \ -+ z13 = _mm_sub_ps(tmp7, z3); \ -+ \ -+ out##5 = _mm_add_ps(z13, z2); \ -+ out##3 = _mm_sub_ps(z13, z2); \ -+ out##1 = _mm_add_ps(z11, z4); \ -+ out##7 = _mm_sub_ps(z11, z4); \ -+} -+ -+#define LOAD_DATA(a, b, c, d, l, i) \ -+ l##a = _mm_loadu_ps(data + a * 8 + i); \ -+ l##b = _mm_loadu_ps(data + b * 8 + i); \ -+ l##c = _mm_loadu_ps(data + c * 8 + i); \ -+ l##d = _mm_loadu_ps(data + d * 8 + i); -+ -+#define STORE_DATA(a, b, c, d, l, i) \ -+ _mm_storeu_ps(data + a * 8 + i, l##a); \ -+ _mm_storeu_ps(data + b * 8 + i, l##b); \ -+ _mm_storeu_ps(data + c * 8 + i, l##c); \ -+ _mm_storeu_ps(data + d * 8 + i, l##d); -+ -+ -+void jsimd_fdct_float_e2k(FAST_FLOAT *data) -+{ -+ __m128 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, -+ tmp10, tmp11, tmp12, tmp13, z1, z2, z3, z4, z5, z11, z13; -+ __m128 l0, l1, l2, l3, l4, l5, l6, l7; -+ __m128 h0, h1, h2, h3, h4, h5, h6, h7; -+ __m128 x0, x1, x2, x3, x4, x5, x6, x7; -+ __m128 y0, y1, y2, y3, y4, y5, y6, y7; -+ -+ /* Constants */ -+ __m128 pd_f0382 = _mm_set1_ps(0.382683433f), -+ pd_f0541 = _mm_set1_ps(0.541196100f), -+ pd_f0707 = _mm_set1_ps(0.707106781f), -+ pd_f1306 = _mm_set1_ps(1.306562965f); -+ -+ /* Pass 1: process columns */ -+ -+ LOAD_DATA(0, 1, 2, 3, x, 0) -+ LOAD_DATA(0, 1, 2, 3, y, 4) -+ TRANSPOSE_FLOAT(x0, x1, x2, x3, l0, l1, l2, l3) -+ TRANSPOSE_FLOAT(y0, y1, y2, y3, l4, l5, l6, l7) -+ DO_FDCT(l, l); -+ -+ LOAD_DATA(4, 5, 6, 7, x, 0) -+ LOAD_DATA(4, 5, 6, 7, y, 4) -+ TRANSPOSE_FLOAT(x4, x5, x6, x7, h0, h1, h2, h3) -+ TRANSPOSE_FLOAT(y4, y5, y6, y7, h4, h5, h6, h7) -+ DO_FDCT(h, h); -+ -+ /* Pass 2: process rows */ -+ -+ TRANSPOSE_FLOAT(l0, l1, l2, l3, x0, x1, x2, x3) -+ TRANSPOSE_FLOAT(h0, h1, h2, h3, x4, x5, x6, x7) -+ DO_FDCT(x, x); -+ STORE_DATA(0, 1, 2, 3, x, 0) -+ STORE_DATA(4, 5, 6, 7, x, 0) -+ -+ TRANSPOSE_FLOAT(l4, l5, l6, l7, y0, y1, y2, y3) -+ TRANSPOSE_FLOAT(h4, h5, h6, h7, y4, y5, y6, y7) -+ DO_FDCT(y, y); -+ STORE_DATA(0, 1, 2, 3, y, 4) -+ STORE_DATA(4, 5, 6, 7, y, 4) -+} -diff --git a/simd/e2k/jfdctfst-e2k.c b/simd/e2k/jfdctfst-e2k.c -new file mode 100644 -index 0000000..9e58f05 ---- /dev/null -+++ b/simd/e2k/jfdctfst-e2k.c -@@ -0,0 +1,145 @@ -+/* -+ * Elbrus optimizations for libjpeg-turbo -+ * -+ * Copyright (C) 2014, D. R. Commander. All Rights Reserved. 
-+ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd -+ * -+ * This software is provided 'as-is', without any express or implied -+ * warranty. In no event will the authors be held liable for any damages -+ * arising from the use of this software. -+ * -+ * Permission is granted to anyone to use this software for any purpose, -+ * including commercial applications, and to alter it and redistribute it -+ * freely, subject to the following restrictions: -+ * -+ * 1. The origin of this software must not be misrepresented; you must not -+ * claim that you wrote the original software. If you use this software -+ * in a product, an acknowledgment in the product documentation would be -+ * appreciated but is not required. -+ * 2. Altered source versions must be plainly marked as such, and must not be -+ * misrepresented as being the original software. -+ * 3. This notice may not be removed or altered from any source distribution. -+ */ -+ -+/* FAST INTEGER FORWARD DCT */ -+ -+#include "jsimd_e2k.h" -+ -+ -+#define F_0_382 98 /* FIX(0.382683433) */ -+#define F_0_541 139 /* FIX(0.541196100) */ -+#define F_0_707 181 /* FIX(0.707106781) */ -+#define F_1_306 334 /* FIX(1.306562965) */ -+ -+#define CONST_BITS 8 -+#define PRE_MULTIPLY_SCALE_BITS 2 -+#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) -+ -+ -+#define DO_FDCT() { \ -+ /* Even part */ \ -+ \ -+ tmp10 = _mm_add_epi16(tmp0, tmp3); \ -+ tmp13 = _mm_sub_epi16(tmp0, tmp3); \ -+ tmp11 = _mm_add_epi16(tmp1, tmp2); \ -+ tmp12 = _mm_sub_epi16(tmp1, tmp2); \ -+ \ -+ out0 = _mm_add_epi16(tmp10, tmp11); \ -+ out4 = _mm_sub_epi16(tmp10, tmp11); \ -+ \ -+ z1 = _mm_add_epi16(tmp12, tmp13); \ -+ z1 = _mm_slli_epi16(z1, PRE_MULTIPLY_SCALE_BITS); \ -+ z1 = _mm_mulhi_epi16(z1, pw_0707); \ -+ \ -+ out2 = _mm_add_epi16(tmp13, z1); \ -+ out6 = _mm_sub_epi16(tmp13, z1); \ -+ \ -+ /* Odd part */ \ -+ \ -+ tmp10 = _mm_add_epi16(tmp4, tmp5); \ -+ tmp11 = _mm_add_epi16(tmp5, tmp6); \ -+ tmp12 = _mm_add_epi16(tmp6, tmp7); \ -+ \ -+ tmp10 = _mm_slli_epi16(tmp10, PRE_MULTIPLY_SCALE_BITS); \ -+ tmp12 = _mm_slli_epi16(tmp12, PRE_MULTIPLY_SCALE_BITS); \ -+ z5 = _mm_sub_epi16(tmp10, tmp12); \ -+ z5 = _mm_mulhi_epi16(z5, pw_0382); \ -+ \ -+ z2 = _mm_add_epi16(_mm_mulhi_epi16(tmp10, pw_0541), z5); \ -+ z4 = _mm_add_epi16(_mm_mulhi_epi16(tmp12, pw_1306), z5); \ -+ \ -+ tmp11 = _mm_slli_epi16(tmp11, PRE_MULTIPLY_SCALE_BITS); \ -+ z3 = _mm_mulhi_epi16(tmp11, pw_0707); \ -+ \ -+ z11 = _mm_add_epi16(tmp7, z3); \ -+ z13 = _mm_sub_epi16(tmp7, z3); \ -+ \ -+ out5 = _mm_add_epi16(z13, z2); \ -+ out3 = _mm_sub_epi16(z13, z2); \ -+ out1 = _mm_add_epi16(z11, z4); \ -+ out7 = _mm_sub_epi16(z11, z4); \ -+} -+ -+ -+void jsimd_fdct_ifast_e2k(DCTELEM *data) -+{ -+ __m128i row0, row1, row2, row3, row4, row5, row6, row7, -+ col0, col1, col2, col3, col4, col5, col6, col7, -+ tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13, -+ z1, z2, z3, z4, z5, z11, z13, -+ out0, out1, out2, out3, out4, out5, out6, out7; -+ -+ /* Constants */ -+ __m128i pw_0382 = _mm_set1_epi16(F_0_382 << CONST_SHIFT), -+ pw_0541 = _mm_set1_epi16(F_0_541 << CONST_SHIFT), -+ pw_0707 = _mm_set1_epi16(F_0_707 << CONST_SHIFT), -+ pw_1306 = _mm_set1_epi16(F_1_306 << CONST_SHIFT); -+ -+ /* Pass 1: process rows */ -+ -+ row0 = VEC_LD(data + 0 * 8); -+ row1 = VEC_LD(data + 1 * 8); -+ row2 = VEC_LD(data + 2 * 8); -+ row3 = VEC_LD(data + 3 * 8); -+ row4 = VEC_LD(data + 4 * 8); -+ row5 = VEC_LD(data + 5 * 8); -+ row6 = VEC_LD(data + 6 * 8); -+ row7 = VEC_LD(data + 7 * 8); -+ -+ TRANSPOSE(row, 
col); -+ -+ tmp0 = _mm_add_epi16(col0, col7); -+ tmp7 = _mm_sub_epi16(col0, col7); -+ tmp1 = _mm_add_epi16(col1, col6); -+ tmp6 = _mm_sub_epi16(col1, col6); -+ tmp2 = _mm_add_epi16(col2, col5); -+ tmp5 = _mm_sub_epi16(col2, col5); -+ tmp3 = _mm_add_epi16(col3, col4); -+ tmp4 = _mm_sub_epi16(col3, col4); -+ -+ DO_FDCT(); -+ -+ /* Pass 2: process columns */ -+ -+ TRANSPOSE(out, row); -+ -+ tmp0 = _mm_add_epi16(row0, row7); -+ tmp7 = _mm_sub_epi16(row0, row7); -+ tmp1 = _mm_add_epi16(row1, row6); -+ tmp6 = _mm_sub_epi16(row1, row6); -+ tmp2 = _mm_add_epi16(row2, row5); -+ tmp5 = _mm_sub_epi16(row2, row5); -+ tmp3 = _mm_add_epi16(row3, row4); -+ tmp4 = _mm_sub_epi16(row3, row4); -+ -+ DO_FDCT(); -+ -+ VEC_ST(data + 0 * 8, out0); -+ VEC_ST(data + 1 * 8, out1); -+ VEC_ST(data + 2 * 8, out2); -+ VEC_ST(data + 3 * 8, out3); -+ VEC_ST(data + 4 * 8, out4); -+ VEC_ST(data + 5 * 8, out5); -+ VEC_ST(data + 6 * 8, out6); -+ VEC_ST(data + 7 * 8, out7); -+} -diff --git a/simd/e2k/jfdctint-e2k.c b/simd/e2k/jfdctint-e2k.c -new file mode 100644 -index 0000000..2200852 ---- /dev/null -+++ b/simd/e2k/jfdctint-e2k.c -@@ -0,0 +1,255 @@ -+/* -+ * Elbrus optimizations for libjpeg-turbo -+ * -+ * Copyright (C) 2014, 2020, D. R. Commander. All Rights Reserved. -+ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd -+ * -+ * This software is provided 'as-is', without any express or implied -+ * warranty. In no event will the authors be held liable for any damages -+ * arising from the use of this software. -+ * -+ * Permission is granted to anyone to use this software for any purpose, -+ * including commercial applications, and to alter it and redistribute it -+ * freely, subject to the following restrictions: -+ * -+ * 1. The origin of this software must not be misrepresented; you must not -+ * claim that you wrote the original software. If you use this software -+ * in a product, an acknowledgment in the product documentation would be -+ * appreciated but is not required. -+ * 2. Altered source versions must be plainly marked as such, and must not be -+ * misrepresented as being the original software. -+ * 3. This notice may not be removed or altered from any source distribution. 
-+ */ -+ -+/* ACCURATE INTEGER FORWARD DCT */ -+ -+#include "jsimd_e2k.h" -+ -+ -+#define F_0_298 2446 /* FIX(0.298631336) */ -+#define F_0_390 3196 /* FIX(0.390180644) */ -+#define F_0_541 4433 /* FIX(0.541196100) */ -+#define F_0_765 6270 /* FIX(0.765366865) */ -+#define F_0_899 7373 /* FIX(0.899976223) */ -+#define F_1_175 9633 /* FIX(1.175875602) */ -+#define F_1_501 12299 /* FIX(1.501321110) */ -+#define F_1_847 15137 /* FIX(1.847759065) */ -+#define F_1_961 16069 /* FIX(1.961570560) */ -+#define F_2_053 16819 /* FIX(2.053119869) */ -+#define F_2_562 20995 /* FIX(2.562915447) */ -+#define F_3_072 25172 /* FIX(3.072711026) */ -+ -+#define CONST_BITS 13 -+#define PASS1_BITS 2 -+#define DESCALE_P1 (CONST_BITS - PASS1_BITS) -+#define DESCALE_P2 (CONST_BITS + PASS1_BITS) -+ -+ -+#define DO_FDCT_COMMON(PASS) { \ -+ /* (Original) \ -+ * z1 = (tmp12 + tmp13) * 0.541196100; \ -+ * data2 = z1 + tmp13 * 0.765366865; \ -+ * data6 = z1 + tmp12 * -1.847759065; \ -+ * \ -+ * (This implementation) \ -+ * data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; \ -+ * data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); \ -+ */ \ -+ \ -+ tmp1312l = _mm_unpacklo_epi16(tmp13, tmp12); \ -+ tmp1312h = _mm_unpackhi_epi16(tmp13, tmp12); \ -+ \ -+ out2l = _mm_add_epi32(_mm_madd_epi16(tmp1312l, pw_f130_f054), pd_descale_p##PASS); \ -+ out2h = _mm_add_epi32(_mm_madd_epi16(tmp1312h, pw_f130_f054), pd_descale_p##PASS); \ -+ out6l = _mm_add_epi32(_mm_madd_epi16(tmp1312l, pw_f054_mf130), pd_descale_p##PASS); \ -+ out6h = _mm_add_epi32(_mm_madd_epi16(tmp1312h, pw_f054_mf130), pd_descale_p##PASS); \ -+ \ -+ out2l = _mm_srai_epi32(out2l, DESCALE_P##PASS); \ -+ out2h = _mm_srai_epi32(out2h, DESCALE_P##PASS); \ -+ out6l = _mm_srai_epi32(out6l, DESCALE_P##PASS); \ -+ out6h = _mm_srai_epi32(out6h, DESCALE_P##PASS); \ -+ \ -+ out2 = _mm_packs_epi32(out2l, out2h); \ -+ out6 = _mm_packs_epi32(out6l, out6h); \ -+ \ -+ /* Odd part */ \ -+ \ -+ z3 = _mm_add_epi16(tmp4, tmp6); \ -+ z4 = _mm_add_epi16(tmp5, tmp7); \ -+ \ -+ /* (Original) \ -+ * z5 = (z3 + z4) * 1.175875602; \ -+ * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; \ -+ * z3 += z5; z4 += z5; \ -+ * \ -+ * (This implementation) \ -+ * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \ -+ * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \ -+ */ \ -+ \ -+ z34l = _mm_unpacklo_epi16(z3, z4); \ -+ z34h = _mm_unpackhi_epi16(z3, z4); \ -+ \ -+ z3l = _mm_add_epi32(_mm_madd_epi16(z34l, pw_mf078_f117), pd_descale_p##PASS); \ -+ z3h = _mm_add_epi32(_mm_madd_epi16(z34h, pw_mf078_f117), pd_descale_p##PASS); \ -+ z4l = _mm_add_epi32(_mm_madd_epi16(z34l, pw_f117_f078), pd_descale_p##PASS); \ -+ z4h = _mm_add_epi32(_mm_madd_epi16(z34h, pw_f117_f078), pd_descale_p##PASS); \ -+ \ -+ /* (Original) \ -+ * z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; \ -+ * tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; \ -+ * tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; \ -+ * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; \ -+ * data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; \ -+ * data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; \ -+ * \ -+ * (This implementation) \ -+ * tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; \ -+ * tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; \ -+ * tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); \ -+ * tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); \ -+ * data7 = tmp4 + z3; data5 = tmp5 + z4; \ -+ * data3 = tmp6 + z3; data1 = tmp7 + z4; \ -+ */ \ -+ \ 
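-+ /* _mm_madd_epi16() multiplies interleaved 16-bit pairs and sums each \
-+  * pair into one 32-bit lane, which is why each pw_* constant above \
-+  * packs the two coefficients of a rewritten product. */ \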
-+ tmp47l = _mm_unpacklo_epi16(tmp4, tmp7); \ -+ tmp47h = _mm_unpackhi_epi16(tmp4, tmp7); \ -+ \ -+ out7l = _mm_add_epi32(_mm_madd_epi16(tmp47l, pw_mf060_mf089), z3l); \ -+ out7h = _mm_add_epi32(_mm_madd_epi16(tmp47h, pw_mf060_mf089), z3h); \ -+ out1l = _mm_add_epi32(_mm_madd_epi16(tmp47l, pw_mf089_f060), z4l); \ -+ out1h = _mm_add_epi32(_mm_madd_epi16(tmp47h, pw_mf089_f060), z4h); \ -+ \ -+ out7l = _mm_srai_epi32(out7l, DESCALE_P##PASS); \ -+ out7h = _mm_srai_epi32(out7h, DESCALE_P##PASS); \ -+ out1l = _mm_srai_epi32(out1l, DESCALE_P##PASS); \ -+ out1h = _mm_srai_epi32(out1h, DESCALE_P##PASS); \ -+ \ -+ out7 = _mm_packs_epi32(out7l, out7h); \ -+ out1 = _mm_packs_epi32(out1l, out1h); \ -+ \ -+ tmp56l = _mm_unpacklo_epi16(tmp5, tmp6); \ -+ tmp56h = _mm_unpackhi_epi16(tmp5, tmp6); \ -+ \ -+ out5l = _mm_add_epi32(_mm_madd_epi16(tmp56l, pw_mf050_mf256), z4l); \ -+ out5h = _mm_add_epi32(_mm_madd_epi16(tmp56h, pw_mf050_mf256), z4h); \ -+ out3l = _mm_add_epi32(_mm_madd_epi16(tmp56l, pw_mf256_f050), z3l); \ -+ out3h = _mm_add_epi32(_mm_madd_epi16(tmp56h, pw_mf256_f050), z3h); \ -+ \ -+ out5l = _mm_srai_epi32(out5l, DESCALE_P##PASS); \ -+ out5h = _mm_srai_epi32(out5h, DESCALE_P##PASS); \ -+ out3l = _mm_srai_epi32(out3l, DESCALE_P##PASS); \ -+ out3h = _mm_srai_epi32(out3h, DESCALE_P##PASS); \ -+ \ -+ out5 = _mm_packs_epi32(out5l, out5h); \ -+ out3 = _mm_packs_epi32(out3l, out3h); \ -+} -+ -+#define DO_FDCT_PASS1() { \ -+ /* Even part */ \ -+ \ -+ tmp10 = _mm_add_epi16(tmp0, tmp3); \ -+ tmp13 = _mm_sub_epi16(tmp0, tmp3); \ -+ tmp11 = _mm_add_epi16(tmp1, tmp2); \ -+ tmp12 = _mm_sub_epi16(tmp1, tmp2); \ -+ \ -+ out0 = _mm_add_epi16(tmp10, tmp11); \ -+ out0 = _mm_slli_epi16(out0, PASS1_BITS); \ -+ out4 = _mm_sub_epi16(tmp10, tmp11); \ -+ out4 = _mm_slli_epi16(out4, PASS1_BITS); \ -+ \ -+ DO_FDCT_COMMON(1); \ -+} -+ -+#define DO_FDCT_PASS2() { \ -+ /* Even part */ \ -+ \ -+ tmp10 = _mm_add_epi16(tmp0, tmp3); \ -+ tmp13 = _mm_sub_epi16(tmp0, tmp3); \ -+ tmp11 = _mm_add_epi16(tmp1, tmp2); \ -+ tmp12 = _mm_sub_epi16(tmp1, tmp2); \ -+ \ -+ out0 = _mm_add_epi16(tmp10, tmp11); \ -+ out0 = _mm_add_epi16(out0, pw_descale_p2x); \ -+ out0 = _mm_srai_epi16(out0, PASS1_BITS); \ -+ out4 = _mm_sub_epi16(tmp10, tmp11); \ -+ out4 = _mm_add_epi16(out4, pw_descale_p2x); \ -+ out4 = _mm_srai_epi16(out4, PASS1_BITS); \ -+ \ -+ DO_FDCT_COMMON(2); \ -+} -+ -+ -+void jsimd_fdct_islow_e2k(DCTELEM *data) -+{ -+ __m128i row0, row1, row2, row3, row4, row5, row6, row7, -+ col0, col1, col2, col3, col4, col5, col6, col7, -+ tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13, -+ tmp47l, tmp47h, tmp56l, tmp56h, tmp1312l, tmp1312h, -+ z3, z4, z34l, z34h, -+ out0, out1, out2, out3, out4, out5, out6, out7; -+ __m128i z3l, z3h, z4l, z4h, -+ out1l, out1h, out2l, out2h, out3l, out3h, out5l, out5h, out6l, out6h, -+ out7l, out7h; -+ -+ /* Constants */ -+ __m128i pw_f130_f054 = _mm_setr_epi16(__4X2(F_0_541 + F_0_765, F_0_541)), -+ pw_f054_mf130 = _mm_setr_epi16(__4X2(F_0_541, F_0_541 - F_1_847)), -+ pw_mf078_f117 = _mm_setr_epi16(__4X2(F_1_175 - F_1_961, F_1_175)), -+ pw_f117_f078 = _mm_setr_epi16(__4X2(F_1_175, F_1_175 - F_0_390)), -+ pw_mf060_mf089 = _mm_setr_epi16(__4X2(F_0_298 - F_0_899, -F_0_899)), -+ pw_mf089_f060 = _mm_setr_epi16(__4X2(-F_0_899, F_1_501 - F_0_899)), -+ pw_mf050_mf256 = _mm_setr_epi16(__4X2(F_2_053 - F_2_562, -F_2_562)), -+ pw_mf256_f050 = _mm_setr_epi16(__4X2(-F_2_562, F_3_072 - F_2_562)), -+ pw_descale_p2x = _mm_set1_epi16(1 << (PASS1_BITS - 1)), -+ pd_descale_p1 = _mm_set1_epi32(1 << (DESCALE_P1 
- 1)), -+ pd_descale_p2 = _mm_set1_epi32(1 << (DESCALE_P2 - 1)); -+ -+ /* Pass 1: process rows */ -+ -+ row0 = VEC_LD(data + 0 * 8); -+ row1 = VEC_LD(data + 1 * 8); -+ row2 = VEC_LD(data + 2 * 8); -+ row3 = VEC_LD(data + 3 * 8); -+ row4 = VEC_LD(data + 4 * 8); -+ row5 = VEC_LD(data + 5 * 8); -+ row6 = VEC_LD(data + 6 * 8); -+ row7 = VEC_LD(data + 7 * 8); -+ -+ TRANSPOSE(row, col); -+ -+ tmp0 = _mm_add_epi16(col0, col7); -+ tmp7 = _mm_sub_epi16(col0, col7); -+ tmp1 = _mm_add_epi16(col1, col6); -+ tmp6 = _mm_sub_epi16(col1, col6); -+ tmp2 = _mm_add_epi16(col2, col5); -+ tmp5 = _mm_sub_epi16(col2, col5); -+ tmp3 = _mm_add_epi16(col3, col4); -+ tmp4 = _mm_sub_epi16(col3, col4); -+ -+ DO_FDCT_PASS1(); -+ -+ /* Pass 2: process columns */ -+ -+ TRANSPOSE(out, row); -+ -+ tmp0 = _mm_add_epi16(row0, row7); -+ tmp7 = _mm_sub_epi16(row0, row7); -+ tmp1 = _mm_add_epi16(row1, row6); -+ tmp6 = _mm_sub_epi16(row1, row6); -+ tmp2 = _mm_add_epi16(row2, row5); -+ tmp5 = _mm_sub_epi16(row2, row5); -+ tmp3 = _mm_add_epi16(row3, row4); -+ tmp4 = _mm_sub_epi16(row3, row4); -+ -+ DO_FDCT_PASS2(); -+ -+ VEC_ST(data + 0 * 8, out0); -+ VEC_ST(data + 1 * 8, out1); -+ VEC_ST(data + 2 * 8, out2); -+ VEC_ST(data + 3 * 8, out3); -+ VEC_ST(data + 4 * 8, out4); -+ VEC_ST(data + 5 * 8, out5); -+ VEC_ST(data + 6 * 8, out6); -+ VEC_ST(data + 7 * 8, out7); -+} -diff --git a/simd/e2k/jidctflt-e2k.c b/simd/e2k/jidctflt-e2k.c -new file mode 100644 -index 0000000..7682965 ---- /dev/null -+++ b/simd/e2k/jidctflt-e2k.c -@@ -0,0 +1,215 @@ -+/* -+ * Elbrus optimizations for libjpeg-turbo -+ * -+ * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved. -+ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd -+ * -+ * This software is provided 'as-is', without any express or implied -+ * warranty. In no event will the authors be held liable for any damages -+ * arising from the use of this software. -+ * -+ * Permission is granted to anyone to use this software for any purpose, -+ * including commercial applications, and to alter it and redistribute it -+ * freely, subject to the following restrictions: -+ * -+ * 1. The origin of this software must not be misrepresented; you must not -+ * claim that you wrote the original software. If you use this software -+ * in a product, an acknowledgment in the product documentation would be -+ * appreciated but is not required. -+ * 2. Altered source versions must be plainly marked as such, and must not be -+ * misrepresented as being the original software. -+ * 3. This notice may not be removed or altered from any source distribution. 
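The fixed-point kernels above lean on one core idiom: interleave two 16-bit vectors with _mm_unpacklo/hi_epi16, then feed the result to _mm_madd_epi16 against a constant vector of interleaved coefficient pairs (pw_f130_f054 and friends), which yields a[i]*c1 + b[i]*c2 as full 32-bit lanes with no intermediate overflow. A standalone sketch of that idiom with illustrative values (compiles with any SSE2-capable compiler; on e2k the same intrinsics come from the SSE emulation headers this patch includes):

    #include <stdio.h>
    #include <emmintrin.h>

    int main(void)
    {
        __m128i a = _mm_set1_epi16(1000);
        __m128i b = _mm_set1_epi16(-2000);
        /* interleaved coefficient pair, like pw_f130_f054 above */
        __m128i c = _mm_setr_epi16(4433, 6270, 4433, 6270,
                                   4433, 6270, 4433, 6270);
        __m128i lo = _mm_unpacklo_epi16(a, b);  /* a0,b0,a1,b1,... */
        __m128i r = _mm_madd_epi16(lo, c);      /* a[i]*4433 + b[i]*6270 */
        int out[4];
        _mm_storeu_si128((__m128i *)out, r);
        printf("%d\n", out[0]);  /* 1000*4433 - 2000*6270 = -8107000 */
        return 0;
    }

The 32-bit accumulators are then rounded and narrowed back to 16 bits by the _mm_srai_epi32/_mm_packs_epi32 sequences seen in DO_FDCT_COMMON.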
-+ */ -+ -+/* FLOAT INVERSE DCT */ -+ -+#include "jsimd_e2k.h" -+ -+#define DO_IDCT(in, out) { \ -+ /* Even part */ \ -+ \ -+ tmp10 = _mm_add_ps(in##0, in##4); \ -+ tmp11 = _mm_sub_ps(in##0, in##4); \ -+ \ -+ tmp13 = _mm_add_ps(in##2, in##6); \ -+ tmp12 = _mm_sub_ps(in##2, in##6); \ -+ tmp12 = _mm_sub_ps(_mm_mul_ps(tmp12, pd_f1414), tmp13); \ -+ \ -+ tmp0 = _mm_add_ps(tmp10, tmp13); \ -+ tmp3 = _mm_sub_ps(tmp10, tmp13); \ -+ tmp1 = _mm_add_ps(tmp11, tmp12); \ -+ tmp2 = _mm_sub_ps(tmp11, tmp12); \ -+ \ -+ /* Odd part */ \ -+ \ -+ z13 = _mm_add_ps(in##5, in##3); \ -+ z10 = _mm_sub_ps(in##5, in##3); \ -+ z11 = _mm_add_ps(in##1, in##7); \ -+ z12 = _mm_sub_ps(in##1, in##7); \ -+ \ -+ tmp7 = _mm_add_ps(z11, z13); \ -+ tmp11 = _mm_sub_ps(z11, z13); \ -+ tmp11 = _mm_mul_ps(tmp11, pd_f1414); \ -+ \ -+ z5 = _mm_mul_ps(_mm_add_ps(z10, z12), pd_f1847); \ -+ tmp10 = _mm_sub_ps(z5, _mm_mul_ps(z12, pd_f1082)); \ -+ tmp12 = _mm_sub_ps(z5, _mm_mul_ps(z10, pd_f2613)); \ -+ \ -+ tmp6 = _mm_sub_ps(tmp12, tmp7); \ -+ tmp5 = _mm_sub_ps(tmp11, tmp6); \ -+ tmp4 = _mm_sub_ps(tmp10, tmp5); \ -+ \ -+ out##0 = _mm_add_ps(tmp0, tmp7); \ -+ out##7 = _mm_sub_ps(tmp0, tmp7); \ -+ out##1 = _mm_add_ps(tmp1, tmp6); \ -+ out##6 = _mm_sub_ps(tmp1, tmp6); \ -+ out##2 = _mm_add_ps(tmp2, tmp5); \ -+ out##5 = _mm_sub_ps(tmp2, tmp5); \ -+ out##3 = _mm_add_ps(tmp3, tmp4); \ -+ out##4 = _mm_sub_ps(tmp3, tmp4); \ -+} -+ -+#define QUANT_MUL(a, b, c, d, l, lo, i) \ -+ out0 = _mm_srai_epi32(_mm_unpack##lo##_epi16(col##a, col##a), 16); \ -+ out1 = _mm_srai_epi32(_mm_unpack##lo##_epi16(col##b, col##b), 16); \ -+ out2 = _mm_srai_epi32(_mm_unpack##lo##_epi16(col##c, col##c), 16); \ -+ out3 = _mm_srai_epi32(_mm_unpack##lo##_epi16(col##d, col##d), 16); \ -+ l##a = _mm_cvtepi32_ps(out0); \ -+ l##b = _mm_cvtepi32_ps(out1); \ -+ l##c = _mm_cvtepi32_ps(out2); \ -+ l##d = _mm_cvtepi32_ps(out3); \ -+ l##a = _mm_mul_ps(l##a, _mm_load_ps(dct_table + a * 8 + i)); \ -+ l##b = _mm_mul_ps(l##b, _mm_load_ps(dct_table + b * 8 + i)); \ -+ l##c = _mm_mul_ps(l##c, _mm_load_ps(dct_table + c * 8 + i)); \ -+ l##d = _mm_mul_ps(l##d, _mm_load_ps(dct_table + d * 8 + i)); -+ -+ -+void jsimd_idct_float_e2k(void *dct_table_, JCOEFPTR coef_block, -+ JSAMPARRAY output_buf, JDIMENSION output_col) -+{ -+ float *dct_table = (float *)dct_table_; -+ -+ __m128i col0, col1, col2, col3, col4, col5, col6, col7, -+ out0, out1, out2, out3, out4, out5, out6, out7, row0, row1, row2, row3; -+ __m128 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, -+ tmp10, tmp11, tmp12, tmp13, z5, z10, z11, z12, z13; -+ __m128 l0, l1, l2, l3, l4, l5, l6, l7; -+ __m128 h0, h1, h2, h3, h4, h5, h6, h7; -+ __m128 x0, x1, x2, x3, x4, x5, x6, x7; -+ __m128 y0, y1, y2, y3, y4, y5, y6, y7; -+ -+ /* Constants */ -+ __m128 pd_f1414 = _mm_set1_ps(1.414213562f), -+ pd_f1847 = _mm_set1_ps(1.847759065f), -+ pd_f1082 = _mm_set1_ps(1.082392200f), -+ pd_f2613 = _mm_set1_ps(2.613125930f); -+ -+ /* Pass 1: process columns */ -+ -+ col0 = VEC_LD(coef_block + 0 * 8); -+ col1 = VEC_LD(coef_block + 1 * 8); -+ col2 = VEC_LD(coef_block + 2 * 8); -+ col3 = VEC_LD(coef_block + 3 * 8); -+ col4 = VEC_LD(coef_block + 4 * 8); -+ col5 = VEC_LD(coef_block + 5 * 8); -+ col6 = VEC_LD(coef_block + 6 * 8); -+ col7 = VEC_LD(coef_block + 7 * 8); -+ -+ out1 = _mm_or_si128(col1, col2); -+ out2 = _mm_or_si128(col3, col4); -+ out1 = _mm_or_si128(out1, out2); -+ out3 = _mm_or_si128(col5, col6); -+ out3 = _mm_or_si128(out3, col7); -+ out1 = _mm_or_si128(out1, out3); -+ -+ if (VEC_ISZERO(out1)) { -+ /* AC terms all zero */ -+ -+ out0 = 
_mm_srai_epi32(_mm_unpacklo_epi16(col0, col0), 16);
-+  out1 = _mm_srai_epi32(_mm_unpackhi_epi16(col0, col0), 16);
-+  tmp0 = _mm_cvtepi32_ps(out0);
-+  tmp1 = _mm_cvtepi32_ps(out1);
-+  tmp0 = _mm_mul_ps(tmp0, _mm_load_ps(dct_table));
-+  tmp1 = _mm_mul_ps(tmp1, _mm_load_ps(dct_table + 4));
-+
-+  l0 = h0 = _mm_shuffle_ps(tmp0, tmp0, 0x00);
-+  l1 = h1 = _mm_shuffle_ps(tmp0, tmp0, 0x55);
-+  l2 = h2 = _mm_shuffle_ps(tmp0, tmp0, 0xaa);
-+  l3 = h3 = _mm_shuffle_ps(tmp0, tmp0, 0xff);
-+  l4 = h4 = _mm_shuffle_ps(tmp1, tmp1, 0x00);
-+  l5 = h5 = _mm_shuffle_ps(tmp1, tmp1, 0x55);
-+  l6 = h6 = _mm_shuffle_ps(tmp1, tmp1, 0xaa);
-+  l7 = h7 = _mm_shuffle_ps(tmp1, tmp1, 0xff);
-+
-+  } else {
-+
-+  QUANT_MUL(0, 2, 4, 6, l, lo, 0)
-+  QUANT_MUL(1, 3, 5, 7, l, lo, 0)
-+  DO_IDCT(l, x);
-+
-+  QUANT_MUL(0, 2, 4, 6, h, hi, 4)
-+  QUANT_MUL(1, 3, 5, 7, h, hi, 4)
-+  DO_IDCT(h, y);
-+
-+  TRANSPOSE_FLOAT(x0, x1, x2, x3, l0, l1, l2, l3)
-+  TRANSPOSE_FLOAT(x4, x5, x6, x7, h0, h1, h2, h3)
-+  TRANSPOSE_FLOAT(y0, y1, y2, y3, l4, l5, l6, l7)
-+  TRANSPOSE_FLOAT(y4, y5, y6, y7, h4, h5, h6, h7)
-+  }
-+
-+  /* Pass 2: process rows */
-+
-+  DO_IDCT(l, x);
-+  DO_IDCT(h, y);
-+
-+#ifdef JSIMD_SAME_ROUNDING
-+#define OUT_ROUND(i) \
-+  tmp0 = _mm_add_ps(_mm_mul_ps(x##i, pd_f0125), pd_cj_rnd); \
-+  tmp1 = _mm_add_ps(_mm_mul_ps(y##i, pd_f0125), pd_cj_rnd); \
-+  out##i = _mm_packs_epi32(_mm_cvttps_epi32(tmp0), _mm_cvttps_epi32(tmp1));
-+
-+  {
-+  __m128 pd_cj_rnd = _mm_set1_ps(0.5f + CENTERJSAMPLE),
-+  pd_f0125 = _mm_set1_ps(0.125f);
-+
-+  OUT_ROUND(0) OUT_ROUND(1)
-+  OUT_ROUND(2) OUT_ROUND(3)
-+  OUT_ROUND(4) OUT_ROUND(5)
-+  OUT_ROUND(6) OUT_ROUND(7)
-+  }
-+  row0 = _mm_packus_epi16(out0, out1);
-+  row1 = _mm_packus_epi16(out2, out3);
-+  row2 = _mm_packus_epi16(out4, out5);
-+  row3 = _mm_packus_epi16(out6, out7);
-+
-+  TRANSPOSE8(row, col) TRANSPOSE8(col, row) TRANSPOSE8(row, col)
-+#else /* faster, but rounding differs slightly */
-+#define OUT_ROUND(i, a, b) out##i = _mm_blendv_epi8( \
-+  _mm_slli_epi32(_mm_castps_si128(_mm_add_ps(b, pd_round)), 16), \
-+  _mm_castps_si128(_mm_add_ps(a, pd_round)), pd_mask);
-+
-+  {
-+  __m128i pd_mask = _mm_set1_epi32(0xffff);
-+  __m128 pd_round = _mm_set1_ps((3 << 22 | CENTERJSAMPLE) * 8);
-+
-+  OUT_ROUND(0, x0, x4) OUT_ROUND(1, y0, y4)
-+  OUT_ROUND(2, x1, x5) OUT_ROUND(3, y1, y5)
-+  OUT_ROUND(4, x2, x6) OUT_ROUND(5, y2, y6)
-+  OUT_ROUND(6, x3, x7) OUT_ROUND(7, y3, y7)
-+  }
-+  row0 = _mm_packus_epi16(out0, out1);
-+  row1 = _mm_packus_epi16(out2, out3);
-+  row2 = _mm_packus_epi16(out4, out5);
-+  row3 = _mm_packus_epi16(out6, out7);
-+
-+  TRANSPOSE8(row, out) TRANSPOSE8(out, col)
-+#endif
-+  VEC_STL(output_buf[0] + output_col, col0);
-+  VEC_STH(output_buf[1] + output_col, col0);
-+  VEC_STL(output_buf[2] + output_col, col1);
-+  VEC_STH(output_buf[3] + output_col, col1);
-+  VEC_STL(output_buf[4] + output_col, col2);
-+  VEC_STH(output_buf[5] + output_col, col2);
-+  VEC_STL(output_buf[6] + output_col, col3);
-+  VEC_STH(output_buf[7] + output_col, col3);
-+}
-diff --git a/simd/e2k/jidctfst-e2k.c b/simd/e2k/jidctfst-e2k.c
-new file mode 100644
-index 0000000..18bc425
---- /dev/null
-+++ b/simd/e2k/jidctfst-e2k.c
-@@ -0,0 +1,187 @@
-+/*
-+ * Elbrus optimizations for libjpeg-turbo
-+ *
-+ * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved.
-+ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd
-+ *
-+ * This software is provided 'as-is', without any express or implied
-+ * warranty. In no event will the authors be held liable for any damages
-+ * arising from the use of this software.
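The non-JSIMD_SAME_ROUNDING branch of OUT_ROUND above relies on the classic magic-bias conversion: adding a large constant to a float forces the rounded integer result into the low mantissa bits, so no convert instruction is needed. A scalar sketch of the core step only (the SIMD constant above additionally folds in CENTERJSAMPLE and the final divide-by-8 scaling, and the packing via _mm_slli_epi32/_mm_blendv_epi8 is specific to that code):

    #include <stdint.h>
    #include <stdio.h>

    /* For |x| < 2^21: add 1.5 * 2^23 so the float's ULP becomes 1.0;
       the nearest integer then sits in the low mantissa bits. */
    static int32_t round_via_bias(float x)
    {
        union { float f; int32_t i; } u;
        u.f = x + 12582912.0f;    /* 1.5 * 2^23 */
        return u.i - 0x4B400000;  /* bit pattern of 12582912.0f */
    }

    int main(void)
    {
        /* rounds to nearest, ties to even: prints 1 -4 128 */
        printf("%d %d %d\n", round_via_bias(1.25f),
               round_via_bias(-3.75f), round_via_bias(127.5f));
        return 0;
    }

This is also why the branch is commented as rounding slightly differently: the bias trick rounds ties to even, while the JSIMD_SAME_ROUNDING path truncates after adding 0.5.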
-+ * -+ * Permission is granted to anyone to use this software for any purpose, -+ * including commercial applications, and to alter it and redistribute it -+ * freely, subject to the following restrictions: -+ * -+ * 1. The origin of this software must not be misrepresented; you must not -+ * claim that you wrote the original software. If you use this software -+ * in a product, an acknowledgment in the product documentation would be -+ * appreciated but is not required. -+ * 2. Altered source versions must be plainly marked as such, and must not be -+ * misrepresented as being the original software. -+ * 3. This notice may not be removed or altered from any source distribution. -+ */ -+ -+/* FAST INTEGER INVERSE DCT */ -+ -+#include "jsimd_e2k.h" -+ -+ -+#define F_1_082 277 /* FIX(1.082392200) */ -+#define F_1_414 362 /* FIX(1.414213562) */ -+#define F_1_847 473 /* FIX(1.847759065) */ -+#define F_2_613 669 /* FIX(2.613125930) */ -+#define F_1_613 (F_2_613 - 256) /* FIX(2.613125930) - FIX(1) */ -+ -+#define CONST_BITS 8 -+#define PASS1_BITS 2 -+#define PRE_MULTIPLY_SCALE_BITS 2 -+#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) -+ -+ -+#define DO_IDCT(in) { \ -+ /* Even part */ \ -+ \ -+ tmp10 = _mm_add_epi16(in##0, in##4); \ -+ tmp11 = _mm_sub_epi16(in##0, in##4); \ -+ tmp13 = _mm_add_epi16(in##2, in##6); \ -+ \ -+ tmp12 = _mm_sub_epi16(in##2, in##6); \ -+ tmp12 = _mm_slli_epi16(tmp12, PRE_MULTIPLY_SCALE_BITS); \ -+ tmp12 = _mm_mulhi_epi16(tmp12, pw_F1414); \ -+ tmp12 = _mm_sub_epi16(tmp12, tmp13); \ -+ \ -+ tmp0 = _mm_add_epi16(tmp10, tmp13); \ -+ tmp3 = _mm_sub_epi16(tmp10, tmp13); \ -+ tmp1 = _mm_add_epi16(tmp11, tmp12); \ -+ tmp2 = _mm_sub_epi16(tmp11, tmp12); \ -+ \ -+ /* Odd part */ \ -+ \ -+ z13 = _mm_add_epi16(in##5, in##3); \ -+ z10 = _mm_sub_epi16(in##5, in##3); \ -+ z10s = _mm_slli_epi16(z10, PRE_MULTIPLY_SCALE_BITS); \ -+ z11 = _mm_add_epi16(in##1, in##7); \ -+ z12s = _mm_sub_epi16(in##1, in##7); \ -+ z12s = _mm_slli_epi16(z12s, PRE_MULTIPLY_SCALE_BITS); \ -+ \ -+ tmp11 = _mm_sub_epi16(z11, z13); \ -+ tmp11 = _mm_slli_epi16(tmp11, PRE_MULTIPLY_SCALE_BITS); \ -+ tmp11 = _mm_mulhi_epi16(tmp11, pw_F1414); \ -+ \ -+ tmp7 = _mm_add_epi16(z11, z13); \ -+ \ -+ /* To avoid overflow... 
\ -+ * \ -+ * (Original) \ -+ * tmp12 = -2.613125930 * z10 + z5; \ -+ * \ -+ * (This implementation) \ -+ * tmp12 = (-1.613125930 - 1) * z10 + z5; \ -+ * = -1.613125930 * z10 - z10 + z5; \ -+ */ \ -+ \ -+ z5 = _mm_add_epi16(z10s, z12s); \ -+ z5 = _mm_mulhi_epi16(z5, pw_F1847); \ -+ \ -+ tmp10 = _mm_mulhi_epi16(z12s, pw_F1082); \ -+ tmp10 = _mm_sub_epi16(tmp10, z5); \ -+ tmp12 = _mm_add_epi16(_mm_mulhi_epi16(z10s, pw_MF1613), z5); \ -+ tmp12 = _mm_sub_epi16(tmp12, z10); \ -+ \ -+ tmp6 = _mm_sub_epi16(tmp12, tmp7); \ -+ tmp5 = _mm_sub_epi16(tmp11, tmp6); \ -+ tmp4 = _mm_add_epi16(tmp10, tmp5); \ -+ \ -+ out0 = _mm_add_epi16(tmp0, tmp7); \ -+ out1 = _mm_add_epi16(tmp1, tmp6); \ -+ out2 = _mm_add_epi16(tmp2, tmp5); \ -+ out3 = _mm_sub_epi16(tmp3, tmp4); \ -+ out4 = _mm_add_epi16(tmp3, tmp4); \ -+ out5 = _mm_sub_epi16(tmp2, tmp5); \ -+ out6 = _mm_sub_epi16(tmp1, tmp6); \ -+ out7 = _mm_sub_epi16(tmp0, tmp7); \ -+} -+ -+ -+void jsimd_idct_ifast_e2k(void *dct_table_, JCOEFPTR coef_block, -+ JSAMPARRAY output_buf, JDIMENSION output_col) -+{ -+ short *dct_table = (short *)dct_table_; -+ -+ __m128i row0, row1, row2, row3, row4, row5, row6, row7, -+ col0, col1, col2, col3, col4, col5, col6, col7, -+ quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7, -+ tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13, -+ z5, z10, z10s, z11, z12s, z13, -+ out0, out1, out2, out3, out4, out5, out6, out7; -+ -+ /* Constants */ -+ __m128i pw_F1414 = _mm_set1_epi16(F_1_414 << CONST_SHIFT), -+ pw_F1847 = _mm_set1_epi16(F_1_847 << CONST_SHIFT), -+ pw_MF1613 = _mm_set1_epi16(-F_1_613 << CONST_SHIFT), -+ pw_F1082 = _mm_set1_epi16(F_1_082 << CONST_SHIFT); -+ -+ /* Pass 1: process columns */ -+ -+ col0 = VEC_LD(coef_block + 0 * 8); -+ col1 = VEC_LD(coef_block + 1 * 8); -+ col2 = VEC_LD(coef_block + 2 * 8); -+ col3 = VEC_LD(coef_block + 3 * 8); -+ col4 = VEC_LD(coef_block + 4 * 8); -+ col5 = VEC_LD(coef_block + 5 * 8); -+ col6 = VEC_LD(coef_block + 6 * 8); -+ col7 = VEC_LD(coef_block + 7 * 8); -+ -+ tmp1 = _mm_or_si128(col1, col2); -+ tmp2 = _mm_or_si128(col3, col4); -+ tmp1 = _mm_or_si128(tmp1, tmp2); -+ tmp3 = _mm_or_si128(col5, col6); -+ tmp3 = _mm_or_si128(tmp3, col7); -+ tmp1 = _mm_or_si128(tmp1, tmp3); -+ -+ quant0 = VEC_LD(dct_table); -+ col0 = _mm_mullo_epi16(col0, quant0); -+ -+ if (VEC_ISZERO(tmp1)) { -+ /* AC terms all zero */ -+ -+ IDCT_SPLAT8(col0); -+ -+ } else { -+ -+ quant1 = VEC_LD(dct_table + 1 * 8); -+ quant2 = VEC_LD(dct_table + 2 * 8); -+ quant3 = VEC_LD(dct_table + 3 * 8); -+ quant4 = VEC_LD(dct_table + 4 * 8); -+ quant5 = VEC_LD(dct_table + 5 * 8); -+ quant6 = VEC_LD(dct_table + 6 * 8); -+ quant7 = VEC_LD(dct_table + 7 * 8); -+ -+ col1 = _mm_mullo_epi16(col1, quant1); -+ col2 = _mm_mullo_epi16(col2, quant2); -+ col3 = _mm_mullo_epi16(col3, quant3); -+ col4 = _mm_mullo_epi16(col4, quant4); -+ col5 = _mm_mullo_epi16(col5, quant5); -+ col6 = _mm_mullo_epi16(col6, quant6); -+ col7 = _mm_mullo_epi16(col7, quant7); -+ -+ DO_IDCT(col); -+ -+ TRANSPOSE(out, row); -+ } -+ -+ /* Pass 2: process rows */ -+ -+ DO_IDCT(row); -+ -+ out0 = _mm_srai_epi16(out0, PASS1_BITS + 3); -+ out1 = _mm_srai_epi16(out1, PASS1_BITS + 3); -+ out2 = _mm_srai_epi16(out2, PASS1_BITS + 3); -+ out3 = _mm_srai_epi16(out3, PASS1_BITS + 3); -+ out4 = _mm_srai_epi16(out4, PASS1_BITS + 3); -+ out5 = _mm_srai_epi16(out5, PASS1_BITS + 3); -+ out6 = _mm_srai_epi16(out6, PASS1_BITS + 3); -+ out7 = _mm_srai_epi16(out7, PASS1_BITS + 3); -+ -+ IDCT_SAVE(); -+} -diff --git a/simd/e2k/jidctint-e2k.c 
b/simd/e2k/jidctint-e2k.c -new file mode 100644 -index 0000000..7bb79c0 ---- /dev/null -+++ b/simd/e2k/jidctint-e2k.c -@@ -0,0 +1,294 @@ -+/* -+ * Elbrus optimizations for libjpeg-turbo -+ * -+ * Copyright (C) 2014-2015, 2020, D. R. Commander. All Rights Reserved. -+ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd -+ * -+ * This software is provided 'as-is', without any express or implied -+ * warranty. In no event will the authors be held liable for any damages -+ * arising from the use of this software. -+ * -+ * Permission is granted to anyone to use this software for any purpose, -+ * including commercial applications, and to alter it and redistribute it -+ * freely, subject to the following restrictions: -+ * -+ * 1. The origin of this software must not be misrepresented; you must not -+ * claim that you wrote the original software. If you use this software -+ * in a product, an acknowledgment in the product documentation would be -+ * appreciated but is not required. -+ * 2. Altered source versions must be plainly marked as such, and must not be -+ * misrepresented as being the original software. -+ * 3. This notice may not be removed or altered from any source distribution. -+ */ -+ -+/* ACCURATE INTEGER INVERSE DCT */ -+ -+#include "jsimd_e2k.h" -+ -+ -+#define F_0_298 2446 /* FIX(0.298631336) */ -+#define F_0_390 3196 /* FIX(0.390180644) */ -+#define F_0_541 4433 /* FIX(0.541196100) */ -+#define F_0_765 6270 /* FIX(0.765366865) */ -+#define F_0_899 7373 /* FIX(0.899976223) */ -+#define F_1_175 9633 /* FIX(1.175875602) */ -+#define F_1_501 12299 /* FIX(1.501321110) */ -+#define F_1_847 15137 /* FIX(1.847759065) */ -+#define F_1_961 16069 /* FIX(1.961570560) */ -+#define F_2_053 16819 /* FIX(2.053119869) */ -+#define F_2_562 20995 /* FIX(2.562915447) */ -+#define F_3_072 25172 /* FIX(3.072711026) */ -+ -+#define CONST_BITS 13 -+#define PASS1_BITS 2 -+#define DESCALE_P1 (CONST_BITS - PASS1_BITS) -+#define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3) -+ -+ -+#define DO_IDCT(in, PASS) { \ -+ /* Even part \ -+ * \ -+ * (Original) \ -+ * z1 = (z2 + z3) * 0.541196100; \ -+ * tmp2 = z1 + z3 * -1.847759065; \ -+ * tmp3 = z1 + z2 * 0.765366865; \ -+ * \ -+ * (This implementation) \ -+ * tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); \ -+ * tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; \ -+ */ \ -+ \ -+ in##26l = _mm_unpacklo_epi16(in##2, in##6); \ -+ in##26h = _mm_unpackhi_epi16(in##2, in##6); \ -+ \ -+ tmp3l = _mm_madd_epi16(in##26l, pw_f130_f054); \ -+ tmp3h = _mm_madd_epi16(in##26h, pw_f130_f054); \ -+ tmp2l = _mm_madd_epi16(in##26l, pw_f054_mf130); \ -+ tmp2h = _mm_madd_epi16(in##26h, pw_f054_mf130); \ -+ \ -+ tmp0 = _mm_add_epi16(in##0, in##4); \ -+ tmp1 = _mm_sub_epi16(in##0, in##4); \ -+ \ -+ tmp0l = _mm_unpacklo_epi16(pw_zero, tmp0); \ -+ tmp0h = _mm_unpackhi_epi16(pw_zero, tmp0); \ -+ tmp0l = _mm_srai_epi32(tmp0l, 16 - CONST_BITS); \ -+ tmp0h = _mm_srai_epi32(tmp0h, 16 - CONST_BITS); \ -+ tmp0l = _mm_add_epi32(tmp0l, pd_descale_p##PASS); \ -+ tmp0h = _mm_add_epi32(tmp0h, pd_descale_p##PASS); \ -+ \ -+ tmp10l = _mm_add_epi32(tmp0l, tmp3l); \ -+ tmp10h = _mm_add_epi32(tmp0h, tmp3h); \ -+ tmp13l = _mm_sub_epi32(tmp0l, tmp3l); \ -+ tmp13h = _mm_sub_epi32(tmp0h, tmp3h); \ -+ \ -+ tmp1l = _mm_unpacklo_epi16(pw_zero, tmp1); \ -+ tmp1h = _mm_unpackhi_epi16(pw_zero, tmp1); \ -+ tmp1l = _mm_srai_epi32(tmp1l, 16 - CONST_BITS); \ -+ tmp1h = _mm_srai_epi32(tmp1h, 16 - CONST_BITS); \ -+ tmp1l = _mm_add_epi32(tmp1l, pd_descale_p##PASS); \ -+ tmp1h = 
_mm_add_epi32(tmp1h, pd_descale_p##PASS); \ -+ \ -+ tmp11l = _mm_add_epi32(tmp1l, tmp2l); \ -+ tmp11h = _mm_add_epi32(tmp1h, tmp2h); \ -+ tmp12l = _mm_sub_epi32(tmp1l, tmp2l); \ -+ tmp12h = _mm_sub_epi32(tmp1h, tmp2h); \ -+ \ -+ /* Odd part */ \ -+ \ -+ z3 = _mm_add_epi16(in##3, in##7); \ -+ z4 = _mm_add_epi16(in##1, in##5); \ -+ \ -+ /* (Original) \ -+ * z5 = (z3 + z4) * 1.175875602; \ -+ * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; \ -+ * z3 += z5; z4 += z5; \ -+ * \ -+ * (This implementation) \ -+ * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \ -+ * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \ -+ */ \ -+ \ -+ z34l = _mm_unpacklo_epi16(z3, z4); \ -+ z34h = _mm_unpackhi_epi16(z3, z4); \ -+ \ -+ z3l = _mm_madd_epi16(z34l, pw_mf078_f117); \ -+ z3h = _mm_madd_epi16(z34h, pw_mf078_f117); \ -+ z4l = _mm_madd_epi16(z34l, pw_f117_f078); \ -+ z4h = _mm_madd_epi16(z34h, pw_f117_f078); \ -+ \ -+ /* (Original) \ -+ * z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; \ -+ * tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; \ -+ * tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; \ -+ * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; \ -+ * tmp0 += z1 + z3; tmp1 += z2 + z4; \ -+ * tmp2 += z2 + z3; tmp3 += z1 + z4; \ -+ * \ -+ * (This implementation) \ -+ * tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; \ -+ * tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; \ -+ * tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); \ -+ * tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); \ -+ * tmp0 += z3; tmp1 += z4; \ -+ * tmp2 += z3; tmp3 += z4; \ -+ */ \ -+ \ -+ in##71l = _mm_unpacklo_epi16(in##7, in##1); \ -+ in##71h = _mm_unpackhi_epi16(in##7, in##1); \ -+ \ -+ tmp0l = _mm_add_epi32(_mm_madd_epi16(in##71l, pw_mf060_mf089), z3l); \ -+ tmp0h = _mm_add_epi32(_mm_madd_epi16(in##71h, pw_mf060_mf089), z3h); \ -+ tmp3l = _mm_add_epi32(_mm_madd_epi16(in##71l, pw_mf089_f060), z4l); \ -+ tmp3h = _mm_add_epi32(_mm_madd_epi16(in##71h, pw_mf089_f060), z4h); \ -+ \ -+ in##53l = _mm_unpacklo_epi16(in##5, in##3); \ -+ in##53h = _mm_unpackhi_epi16(in##5, in##3); \ -+ \ -+ tmp1l = _mm_add_epi32(_mm_madd_epi16(in##53l, pw_mf050_mf256), z4l); \ -+ tmp1h = _mm_add_epi32(_mm_madd_epi16(in##53h, pw_mf050_mf256), z4h); \ -+ tmp2l = _mm_add_epi32(_mm_madd_epi16(in##53l, pw_mf256_f050), z3l); \ -+ tmp2h = _mm_add_epi32(_mm_madd_epi16(in##53h, pw_mf256_f050), z3h); \ -+ \ -+ /* Final output stage */ \ -+ \ -+ out0l = _mm_add_epi32(tmp10l, tmp3l); \ -+ out0h = _mm_add_epi32(tmp10h, tmp3h); \ -+ out7l = _mm_sub_epi32(tmp10l, tmp3l); \ -+ out7h = _mm_sub_epi32(tmp10h, tmp3h); \ -+ \ -+ out0l = _mm_srai_epi32(out0l, DESCALE_P##PASS); \ -+ out0h = _mm_srai_epi32(out0h, DESCALE_P##PASS); \ -+ out7l = _mm_srai_epi32(out7l, DESCALE_P##PASS); \ -+ out7h = _mm_srai_epi32(out7h, DESCALE_P##PASS); \ -+ \ -+ out0 = _mm_packs_epi32(out0l, out0h); \ -+ out7 = _mm_packs_epi32(out7l, out7h); \ -+ \ -+ out1l = _mm_add_epi32(tmp11l, tmp2l); \ -+ out1h = _mm_add_epi32(tmp11h, tmp2h); \ -+ out6l = _mm_sub_epi32(tmp11l, tmp2l); \ -+ out6h = _mm_sub_epi32(tmp11h, tmp2h); \ -+ \ -+ out1l = _mm_srai_epi32(out1l, DESCALE_P##PASS); \ -+ out1h = _mm_srai_epi32(out1h, DESCALE_P##PASS); \ -+ out6l = _mm_srai_epi32(out6l, DESCALE_P##PASS); \ -+ out6h = _mm_srai_epi32(out6h, DESCALE_P##PASS); \ -+ \ -+ out1 = _mm_packs_epi32(out1l, out1h); \ -+ out6 = _mm_packs_epi32(out6l, out6h); \ -+ \ -+ out2l = _mm_add_epi32(tmp12l, tmp1l); \ -+ out2h = _mm_add_epi32(tmp12h, tmp1h); \ -+ out5l 
= _mm_sub_epi32(tmp12l, tmp1l); \ -+ out5h = _mm_sub_epi32(tmp12h, tmp1h); \ -+ \ -+ out2l = _mm_srai_epi32(out2l, DESCALE_P##PASS); \ -+ out2h = _mm_srai_epi32(out2h, DESCALE_P##PASS); \ -+ out5l = _mm_srai_epi32(out5l, DESCALE_P##PASS); \ -+ out5h = _mm_srai_epi32(out5h, DESCALE_P##PASS); \ -+ \ -+ out2 = _mm_packs_epi32(out2l, out2h); \ -+ out5 = _mm_packs_epi32(out5l, out5h); \ -+ \ -+ out3l = _mm_add_epi32(tmp13l, tmp0l); \ -+ out3h = _mm_add_epi32(tmp13h, tmp0h); \ -+ out4l = _mm_sub_epi32(tmp13l, tmp0l); \ -+ out4h = _mm_sub_epi32(tmp13h, tmp0h); \ -+ \ -+ out3l = _mm_srai_epi32(out3l, DESCALE_P##PASS); \ -+ out3h = _mm_srai_epi32(out3h, DESCALE_P##PASS); \ -+ out4l = _mm_srai_epi32(out4l, DESCALE_P##PASS); \ -+ out4h = _mm_srai_epi32(out4h, DESCALE_P##PASS); \ -+ \ -+ out3 = _mm_packs_epi32(out3l, out3h); \ -+ out4 = _mm_packs_epi32(out4l, out4h); \ -+} -+ -+ -+void jsimd_idct_islow_e2k(void *dct_table_, JCOEFPTR coef_block, -+ JSAMPARRAY output_buf, JDIMENSION output_col) -+{ -+ short *dct_table = (short *)dct_table_; -+ -+ __m128i row0, row1, row2, row3, row4, row5, row6, row7, -+ col0, col1, col2, col3, col4, col5, col6, col7, -+ quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7, -+ tmp0, tmp1, tmp2, tmp3, z3, z4, -+ z34l, z34h, col71l, col71h, col26l, col26h, col53l, col53h, -+ row71l, row71h, row26l, row26h, row53l, row53h, -+ out0, out1, out2, out3, out4, out5, out6, out7; -+ __m128i tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h, -+ tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h, -+ z3l, z3h, z4l, z4h, -+ out0l, out0h, out1l, out1h, out2l, out2h, out3l, out3h, out4l, out4h, -+ out5l, out5h, out6l, out6h, out7l, out7h; -+ -+ /* Constants */ -+ __m128i pw_zero = _mm_setzero_si128(), -+ pw_f130_f054 = _mm_setr_epi16(__4X2(F_0_541 + F_0_765, F_0_541)), -+ pw_f054_mf130 = _mm_setr_epi16(__4X2(F_0_541, F_0_541 - F_1_847)), -+ pw_mf078_f117 = _mm_setr_epi16(__4X2(F_1_175 - F_1_961, F_1_175)), -+ pw_f117_f078 = _mm_setr_epi16(__4X2(F_1_175, F_1_175 - F_0_390)), -+ pw_mf060_mf089 = _mm_setr_epi16(__4X2(F_0_298 - F_0_899, -F_0_899)), -+ pw_mf089_f060 = _mm_setr_epi16(__4X2(-F_0_899, F_1_501 - F_0_899)), -+ pw_mf050_mf256 = _mm_setr_epi16(__4X2(F_2_053 - F_2_562, -F_2_562)), -+ pw_mf256_f050 = _mm_setr_epi16(__4X2(-F_2_562, F_3_072 - F_2_562)), -+ pd_descale_p1 = _mm_set1_epi32(1 << (DESCALE_P1 - 1)), -+ pd_descale_p2 = _mm_set1_epi32(1 << (DESCALE_P2 - 1)); -+ -+ /* Pass 1: process columns */ -+ -+ col0 = VEC_LD(coef_block + 0 * 8); -+ col1 = VEC_LD(coef_block + 1 * 8); -+ col2 = VEC_LD(coef_block + 2 * 8); -+ col3 = VEC_LD(coef_block + 3 * 8); -+ col4 = VEC_LD(coef_block + 4 * 8); -+ col5 = VEC_LD(coef_block + 5 * 8); -+ col6 = VEC_LD(coef_block + 6 * 8); -+ col7 = VEC_LD(coef_block + 7 * 8); -+ -+ tmp1 = _mm_or_si128(col1, col2); -+ tmp2 = _mm_or_si128(col3, col4); -+ tmp1 = _mm_or_si128(tmp1, tmp2); -+ tmp3 = _mm_or_si128(col5, col6); -+ tmp3 = _mm_or_si128(tmp3, col7); -+ tmp1 = _mm_or_si128(tmp1, tmp3); -+ -+ quant0 = VEC_LD(dct_table); -+ col0 = _mm_mullo_epi16(col0, quant0); -+ -+ if (VEC_ISZERO(tmp1)) { -+ /* AC terms all zero */ -+ -+ col0 = _mm_slli_epi16(col0, PASS1_BITS); -+ IDCT_SPLAT8(col0); -+ -+ } else { -+ -+ quant1 = VEC_LD(dct_table + 1 * 8); -+ quant2 = VEC_LD(dct_table + 2 * 8); -+ quant3 = VEC_LD(dct_table + 3 * 8); -+ quant4 = VEC_LD(dct_table + 4 * 8); -+ quant5 = VEC_LD(dct_table + 5 * 8); -+ quant6 = VEC_LD(dct_table + 6 * 8); -+ quant7 = VEC_LD(dct_table + 7 * 8); -+ -+ col1 = _mm_mullo_epi16(col1, quant1); -+ col2 = 
_mm_mullo_epi16(col2, quant2); -+ col3 = _mm_mullo_epi16(col3, quant3); -+ col4 = _mm_mullo_epi16(col4, quant4); -+ col5 = _mm_mullo_epi16(col5, quant5); -+ col6 = _mm_mullo_epi16(col6, quant6); -+ col7 = _mm_mullo_epi16(col7, quant7); -+ -+ DO_IDCT(col, 1); -+ -+ TRANSPOSE(out, row); -+ } -+ -+ /* Pass 2: process rows */ -+ -+ DO_IDCT(row, 2); -+ -+ IDCT_SAVE(); -+} -diff --git a/simd/e2k/jquantf-e2k.c b/simd/e2k/jquantf-e2k.c -new file mode 100644 -index 0000000..106e99a ---- /dev/null -+++ b/simd/e2k/jquantf-e2k.c -@@ -0,0 +1,121 @@ -+/* -+ * Elbrus optimizations for libjpeg-turbo -+ * -+ * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved. -+ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd -+ * -+ * This software is provided 'as-is', without any express or implied -+ * warranty. In no event will the authors be held liable for any damages -+ * arising from the use of this software. -+ * -+ * Permission is granted to anyone to use this software for any purpose, -+ * including commercial applications, and to alter it and redistribute it -+ * freely, subject to the following restrictions: -+ * -+ * 1. The origin of this software must not be misrepresented; you must not -+ * claim that you wrote the original software. If you use this software -+ * in a product, an acknowledgment in the product documentation would be -+ * appreciated but is not required. -+ * 2. Altered source versions must be plainly marked as such, and must not be -+ * misrepresented as being the original software. -+ * 3. This notice may not be removed or altered from any source distribution. -+ */ -+ -+/* FLOAT QUANTIZATION AND SAMPLE CONVERSION */ -+ -+#include "jsimd_e2k.h" -+ -+#define LOAD_ROW(row) in##row = VEC_LD8(sample_data[row] + start_col) -+#define STORE_ROW(i) \ -+ in0 = _mm_unpacklo_epi16(out##i, pb_zero); \ -+ in1 = _mm_unpackhi_epi16(out##i, pb_zero); \ -+ in0 = _mm_sub_epi32(in0, pd_cj); \ -+ in1 = _mm_sub_epi32(in1, pd_cj); \ -+ _mm_storeu_ps(workspace + i * 8, _mm_cvtepi32_ps(in0)); \ -+ _mm_storeu_ps(workspace + i * 8 + 4, _mm_cvtepi32_ps(in1)); -+ -+void jsimd_convsamp_float_e2k(JSAMPARRAY sample_data, JDIMENSION start_col, -+ FAST_FLOAT *workspace) -+{ -+ __m128i in0, in1, in2, in3, in4, in5, in6, in7; -+ __m128i out0, out1, out2, out3, out4, out5, out6, out7; -+ -+ /* Constants */ -+ __m128i pd_cj = _mm_set1_epi32(CENTERJSAMPLE), -+ pb_zero = _mm_setzero_si128(); -+ -+ LOAD_ROW(0); -+ LOAD_ROW(1); -+ LOAD_ROW(2); -+ LOAD_ROW(3); -+ LOAD_ROW(4); -+ LOAD_ROW(5); -+ LOAD_ROW(6); -+ LOAD_ROW(7); -+ -+ out0 = _mm_unpacklo_epi8(in0, pb_zero); -+ out1 = _mm_unpacklo_epi8(in1, pb_zero); -+ out2 = _mm_unpacklo_epi8(in2, pb_zero); -+ out3 = _mm_unpacklo_epi8(in3, pb_zero); -+ out4 = _mm_unpacklo_epi8(in4, pb_zero); -+ out5 = _mm_unpacklo_epi8(in5, pb_zero); -+ out6 = _mm_unpacklo_epi8(in6, pb_zero); -+ out7 = _mm_unpacklo_epi8(in7, pb_zero); -+ -+ STORE_ROW(0) -+ STORE_ROW(1) -+ STORE_ROW(2) -+ STORE_ROW(3) -+ STORE_ROW(4) -+ STORE_ROW(5) -+ STORE_ROW(6) -+ STORE_ROW(7) -+} -+ -+void jsimd_quantize_float_e2k(JCOEFPTR coef_block, FAST_FLOAT *divisors, -+ FAST_FLOAT *workspace) -+{ -+ int i = 0; -+ __m128 row0, row1, row2, row3, recip0, recip1, recip2, recip3; -+ __m128i out0, out1; -+#ifdef JSIMD_SAME_ROUNDING -+ __m128 pd_f16k5 = _mm_set1_ps(16384.5f); -+ __m128i pw_m16k = _mm_set1_epi16(-16384); -+#endif -+ -+ PRAGMA_E2K("ivdep") -+ for (; i < 4; i++, workspace += 16, divisors += 16, coef_block += 16) { -+ row0 = _mm_loadu_ps(workspace + 0 * 4); -+ row1 = _mm_loadu_ps(workspace + 1 * 4); -+ 
row2 = _mm_loadu_ps(workspace + 2 * 4); -+ row3 = _mm_loadu_ps(workspace + 3 * 4); -+ -+ recip0 = _mm_loadu_ps(divisors + 0 * 4); -+ recip1 = _mm_loadu_ps(divisors + 1 * 4); -+ recip2 = _mm_loadu_ps(divisors + 2 * 4); -+ recip3 = _mm_loadu_ps(divisors + 3 * 4); -+ -+ row0 = _mm_mul_ps(row0, recip0); -+ row1 = _mm_mul_ps(row1, recip1); -+ row2 = _mm_mul_ps(row2, recip2); -+ row3 = _mm_mul_ps(row3, recip3); -+ -+#ifdef JSIMD_SAME_ROUNDING -+ row0 = _mm_add_ps(row0, pd_f16k5); -+ row1 = _mm_add_ps(row1, pd_f16k5); -+ row2 = _mm_add_ps(row2, pd_f16k5); -+ row3 = _mm_add_ps(row3, pd_f16k5); -+ -+ out0 = _mm_packs_epi32(_mm_cvttps_epi32(row0), _mm_cvttps_epi32(row1)); -+ out1 = _mm_packs_epi32(_mm_cvttps_epi32(row2), _mm_cvttps_epi32(row3)); -+ -+ out0 = _mm_add_epi16(out0, pw_m16k); -+ out1 = _mm_add_epi16(out1, pw_m16k); -+#else -+ out0 = _mm_packs_epi32(_mm_cvtps_epi32(row0), _mm_cvtps_epi32(row1)); -+ out1 = _mm_packs_epi32(_mm_cvtps_epi32(row2), _mm_cvtps_epi32(row3)); -+#endif -+ VEC_ST(coef_block, out0); -+ VEC_ST(coef_block + 8, out1); -+ } -+} -diff --git a/simd/e2k/jquanti-e2k.c b/simd/e2k/jquanti-e2k.c -new file mode 100644 -index 0000000..a3e1ff1 ---- /dev/null -+++ b/simd/e2k/jquanti-e2k.c -@@ -0,0 +1,178 @@ -+/* -+ * Elbrus optimizations for libjpeg-turbo -+ * -+ * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved. -+ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd -+ * -+ * This software is provided 'as-is', without any express or implied -+ * warranty. In no event will the authors be held liable for any damages -+ * arising from the use of this software. -+ * -+ * Permission is granted to anyone to use this software for any purpose, -+ * including commercial applications, and to alter it and redistribute it -+ * freely, subject to the following restrictions: -+ * -+ * 1. The origin of this software must not be misrepresented; you must not -+ * claim that you wrote the original software. If you use this software -+ * in a product, an acknowledgment in the product documentation would be -+ * appreciated but is not required. -+ * 2. Altered source versions must be plainly marked as such, and must not be -+ * misrepresented as being the original software. -+ * 3. This notice may not be removed or altered from any source distribution. 
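The JSIMD_SAME_ROUNDING branch of jsimd_quantize_float_e2k above gets rounding-mode-independent behaviour from a bias as well: truncating v + 16384.5 and then subtracting 16384 equals floor(v + 0.5) for any |v| < 16384 (the pw_m16k add after _mm_packs_epi32 performs the same subtraction in the vector domain). A scalar model:

    #include <stdio.h>

    /* floor(v + 0.5) via truncating conversion, valid for |v| < 16384 */
    static short quantize_one(float sample, float recip)
    {
        float v = sample * recip;
        return (short)((int)(v + 16384.5f) - 16384);
    }

    int main(void)
    {
        printf("%d %d %d\n",
               quantize_one(10.0f, 0.25f),    /*  2.5 ->  3 */
               quantize_one(-10.0f, 0.25f),   /* -2.5 -> -2 */
               quantize_one(7.0f, 0.5f));     /*  3.5 ->  4 */
        return 0;
    }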
-+ */ -+ -+/* INTEGER QUANTIZATION AND SAMPLE CONVERSION */ -+ -+#include "jsimd_e2k.h" -+ -+#define LOAD_ROW(row) in##row = VEC_LD8(sample_data[row] + start_col) -+ -+void jsimd_convsamp_e2k(JSAMPARRAY sample_data, JDIMENSION start_col, -+ DCTELEM *workspace) -+{ -+ __m128i in0, in1, in2, in3, in4, in5, in6, in7; -+ __m128i out0, out1, out2, out3, out4, out5, out6, out7; -+ -+ /* Constants */ -+ __m128i pw_cj = _mm_set1_epi16(CENTERJSAMPLE), -+ pb_zero = _mm_setzero_si128(); -+ -+ LOAD_ROW(0); -+ LOAD_ROW(1); -+ LOAD_ROW(2); -+ LOAD_ROW(3); -+ LOAD_ROW(4); -+ LOAD_ROW(5); -+ LOAD_ROW(6); -+ LOAD_ROW(7); -+ -+ out0 = _mm_unpacklo_epi8(in0, pb_zero); -+ out1 = _mm_unpacklo_epi8(in1, pb_zero); -+ out2 = _mm_unpacklo_epi8(in2, pb_zero); -+ out3 = _mm_unpacklo_epi8(in3, pb_zero); -+ out4 = _mm_unpacklo_epi8(in4, pb_zero); -+ out5 = _mm_unpacklo_epi8(in5, pb_zero); -+ out6 = _mm_unpacklo_epi8(in6, pb_zero); -+ out7 = _mm_unpacklo_epi8(in7, pb_zero); -+ -+ out0 = _mm_sub_epi16(out0, pw_cj); -+ out1 = _mm_sub_epi16(out1, pw_cj); -+ out2 = _mm_sub_epi16(out2, pw_cj); -+ out3 = _mm_sub_epi16(out3, pw_cj); -+ out4 = _mm_sub_epi16(out4, pw_cj); -+ out5 = _mm_sub_epi16(out5, pw_cj); -+ out6 = _mm_sub_epi16(out6, pw_cj); -+ out7 = _mm_sub_epi16(out7, pw_cj); -+ -+ VEC_ST(workspace + 0 * 8, out0); -+ VEC_ST(workspace + 1 * 8, out1); -+ VEC_ST(workspace + 2 * 8, out2); -+ VEC_ST(workspace + 3 * 8, out3); -+ VEC_ST(workspace + 4 * 8, out4); -+ VEC_ST(workspace + 5 * 8, out5); -+ VEC_ST(workspace + 6 * 8, out6); -+ VEC_ST(workspace + 7 * 8, out7); -+} -+ -+ -+#define WORD_BIT 16 -+#define MULTIPLY(vs0, vs1, out) out = _mm_mulhi_epu16(vs0, vs1) -+ -+void jsimd_quantize_e2k(JCOEFPTR coef_block, DCTELEM *divisors, -+ DCTELEM *workspace) -+{ -+ __m128i row0, row1, row2, row3, row4, row5, row6, row7, -+ row0s, row1s, row2s, row3s, row4s, row5s, row6s, row7s, -+ corr0, corr1, corr2, corr3, corr4, corr5, corr6, corr7, -+ recip0, recip1, recip2, recip3, recip4, recip5, recip6, recip7, -+ scale0, scale1, scale2, scale3, scale4, scale5, scale6, scale7; -+ -+ row0s = VEC_LD(workspace + 0 * 8); -+ row1s = VEC_LD(workspace + 1 * 8); -+ row2s = VEC_LD(workspace + 2 * 8); -+ row3s = VEC_LD(workspace + 3 * 8); -+ row4s = VEC_LD(workspace + 4 * 8); -+ row5s = VEC_LD(workspace + 5 * 8); -+ row6s = VEC_LD(workspace + 6 * 8); -+ row7s = VEC_LD(workspace + 7 * 8); -+ row0 = _mm_abs_epi16(row0s); -+ row1 = _mm_abs_epi16(row1s); -+ row2 = _mm_abs_epi16(row2s); -+ row3 = _mm_abs_epi16(row3s); -+ row4 = _mm_abs_epi16(row4s); -+ row5 = _mm_abs_epi16(row5s); -+ row6 = _mm_abs_epi16(row6s); -+ row7 = _mm_abs_epi16(row7s); -+ -+ corr0 = VEC_LD(divisors + DCTSIZE2 + 0 * 8); -+ corr1 = VEC_LD(divisors + DCTSIZE2 + 1 * 8); -+ corr2 = VEC_LD(divisors + DCTSIZE2 + 2 * 8); -+ corr3 = VEC_LD(divisors + DCTSIZE2 + 3 * 8); -+ corr4 = VEC_LD(divisors + DCTSIZE2 + 4 * 8); -+ corr5 = VEC_LD(divisors + DCTSIZE2 + 5 * 8); -+ corr6 = VEC_LD(divisors + DCTSIZE2 + 6 * 8); -+ corr7 = VEC_LD(divisors + DCTSIZE2 + 7 * 8); -+ -+ row0 = _mm_add_epi16(row0, corr0); -+ row1 = _mm_add_epi16(row1, corr1); -+ row2 = _mm_add_epi16(row2, corr2); -+ row3 = _mm_add_epi16(row3, corr3); -+ row4 = _mm_add_epi16(row4, corr4); -+ row5 = _mm_add_epi16(row5, corr5); -+ row6 = _mm_add_epi16(row6, corr6); -+ row7 = _mm_add_epi16(row7, corr7); -+ -+ recip0 = VEC_LD(divisors + 0 * 8); -+ recip1 = VEC_LD(divisors + 1 * 8); -+ recip2 = VEC_LD(divisors + 2 * 8); -+ recip3 = VEC_LD(divisors + 3 * 8); -+ recip4 = VEC_LD(divisors + 4 * 8); -+ recip5 = VEC_LD(divisors + 5 * 8); -+ 
recip6 = VEC_LD(divisors + 6 * 8);
-+  recip7 = VEC_LD(divisors + 7 * 8);
-+
-+  MULTIPLY(row0, recip0, row0);
-+  MULTIPLY(row1, recip1, row1);
-+  MULTIPLY(row2, recip2, row2);
-+  MULTIPLY(row3, recip3, row3);
-+  MULTIPLY(row4, recip4, row4);
-+  MULTIPLY(row5, recip5, row5);
-+  MULTIPLY(row6, recip6, row6);
-+  MULTIPLY(row7, recip7, row7);
-+
-+  scale0 = VEC_LD(divisors + DCTSIZE2 * 2 + 0 * 8);
-+  scale1 = VEC_LD(divisors + DCTSIZE2 * 2 + 1 * 8);
-+  scale2 = VEC_LD(divisors + DCTSIZE2 * 2 + 2 * 8);
-+  scale3 = VEC_LD(divisors + DCTSIZE2 * 2 + 3 * 8);
-+  scale4 = VEC_LD(divisors + DCTSIZE2 * 2 + 4 * 8);
-+  scale5 = VEC_LD(divisors + DCTSIZE2 * 2 + 5 * 8);
-+  scale6 = VEC_LD(divisors + DCTSIZE2 * 2 + 6 * 8);
-+  scale7 = VEC_LD(divisors + DCTSIZE2 * 2 + 7 * 8);
-+
-+  MULTIPLY(row0, scale0, row0);
-+  MULTIPLY(row1, scale1, row1);
-+  MULTIPLY(row2, scale2, row2);
-+  MULTIPLY(row3, scale3, row3);
-+  MULTIPLY(row4, scale4, row4);
-+  MULTIPLY(row5, scale5, row5);
-+  MULTIPLY(row6, scale6, row6);
-+  MULTIPLY(row7, scale7, row7);
-+
-+  row0 = _mm_sign_epi16(row0, row0s);
-+  row1 = _mm_sign_epi16(row1, row1s);
-+  row2 = _mm_sign_epi16(row2, row2s);
-+  row3 = _mm_sign_epi16(row3, row3s);
-+  row4 = _mm_sign_epi16(row4, row4s);
-+  row5 = _mm_sign_epi16(row5, row5s);
-+  row6 = _mm_sign_epi16(row6, row6s);
-+  row7 = _mm_sign_epi16(row7, row7s);
-+
-+  VEC_ST(coef_block + 0 * 8, row0);
-+  VEC_ST(coef_block + 1 * 8, row1);
-+  VEC_ST(coef_block + 2 * 8, row2);
-+  VEC_ST(coef_block + 3 * 8, row3);
-+  VEC_ST(coef_block + 4 * 8, row4);
-+  VEC_ST(coef_block + 5 * 8, row5);
-+  VEC_ST(coef_block + 6 * 8, row6);
-+  VEC_ST(coef_block + 7 * 8, row7);
-+}
-diff --git a/simd/e2k/jsimd.c b/simd/e2k/jsimd.c
-new file mode 100644
-index 0000000..f8c0465
---- /dev/null
-+++ b/simd/e2k/jsimd.c
-@@ -0,0 +1,761 @@
-+/*
-+ * jsimd_e2k.c
-+ *
-+ * Copyright 2009 Pierre Ossman for Cendio AB
-+ * Copyright (C) 2009-2011, 2014-2016, 2018, D. R. Commander.
-+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
-+ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd
-+ *
-+ * Based on the x86 SIMD extension for IJG JPEG library,
-+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
-+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
-+ *
-+ * This file contains the interface between the "normal" portions
-+ * of the library and the SIMD implementations when running on an
-+ * Elbrus (e2k) architecture.
-+ */
-+
-+#define JPEG_INTERNALS
-+#include "../../jinclude.h"
-+#include "../../jpeglib.h"
-+#include "../../jsimd.h"
-+#include "../../jdct.h"
-+#include "../../jsimddct.h"
-+#include "../jsimd.h"
-+#include "jsimd_api_e2k.h"
-+
-+static unsigned int simd_support = ~0;
-+static unsigned int simd_huffman = 1;
-+
-+/*
-+ * Check what SIMD accelerations are supported.
-+ *
-+ * FIXME: This code is racy under a multi-threaded environment.
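The initialization pattern below (probe once, cache the result in simd_support) carries the same benign data race as the other jsimd back ends, hence the FIXME. A hedged sketch of one race-free alternative using C11 once-initialization (not part of this patch; the flag value is a stand-in):

    #include <threads.h>   /* C11; provided by glibc since 2.28 */

    #define JSIMD_SSE2 1   /* hypothetical stand-in for the real flag */

    static unsigned int simd_support;
    static once_flag simd_once = ONCE_FLAG_INIT;

    static void do_init_simd(void)
    {
        simd_support = JSIMD_SSE2;  /* getenv() overrides would go here */
    }

    static void init_simd(void)
    {
        call_once(&simd_once, do_init_simd);  /* runs exactly once */
    }

    int main(void)
    {
        init_simd();
        return (int)simd_support;
    }

In practice the race is harmless here because every thread computes and stores the same value.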
-+ */ -+LOCAL(void) -+init_simd(void) -+{ -+#ifndef NO_GETENV -+ char *env = NULL; -+#endif -+ -+ if (simd_support != ~0U) -+ return; -+ -+ simd_support = JSIMD_SSE2; -+ -+#ifndef NO_GETENV -+ /* Force different settings through environment variables */ -+ env = getenv("JSIMD_FORCENONE"); -+ if ((env != NULL) && (strcmp(env, "1") == 0)) -+ simd_support = 0; -+ env = getenv("JSIMD_NOHUFFENC"); -+ if ((env != NULL) && (strcmp(env, "1") == 0)) -+ simd_huffman = 0; -+#endif -+} -+ -+static inline int color_space_idx(J_COLOR_SPACE color_space) { -+ switch (color_space) { -+ case JCS_EXT_RGB: -+ return 1 + (EXT_RGB_PIXELSIZE != 3) * 16; -+ case JCS_EXT_RGBX: -+ case JCS_EXT_RGBA: -+ return 2 + (EXT_RGBX_PIXELSIZE != 3) * 16; -+ case JCS_EXT_BGR: -+ return 3 + (EXT_BGR_PIXELSIZE != 3) * 16; -+ case JCS_EXT_BGRX: -+ case JCS_EXT_BGRA: -+ return 4 + (EXT_BGRX_PIXELSIZE != 3) * 16; -+ case JCS_EXT_XBGR: -+ case JCS_EXT_ABGR: -+ return 5 + (EXT_XBGR_PIXELSIZE != 3) * 16; -+ case JCS_EXT_XRGB: -+ case JCS_EXT_ARGB: -+ return 6 + (EXT_XRGB_PIXELSIZE != 3) * 16; -+ default: -+ break; -+ } -+ return 0 + (RGB_PIXELSIZE != 3) * 16; -+} -+ -+GLOBAL(int) -+jsimd_can_rgb_ycc(void) -+{ -+ init_simd(); -+ -+ /* The code is optimised for these values only */ -+ if (BITS_IN_JSAMPLE != 8) -+ return 0; -+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) -+ return 0; -+ -+ if (simd_support & JSIMD_SSE2) -+ return 1; -+ -+ return 0; -+} -+ -+GLOBAL(int) -+jsimd_can_rgb_gray(void) -+{ -+ init_simd(); -+ -+ /* The code is optimised for these values only */ -+ if (BITS_IN_JSAMPLE != 8) -+ return 0; -+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) -+ return 0; -+ -+ if (simd_support & JSIMD_SSE2) -+ return 1; -+ -+ return 0; -+} -+ -+GLOBAL(int) -+jsimd_can_ycc_rgb(void) -+{ -+ init_simd(); -+ -+ /* The code is optimised for these values only */ -+ if (BITS_IN_JSAMPLE != 8) -+ return 0; -+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) -+ return 0; -+ -+ if (simd_support & JSIMD_SSE2) -+ return 1; -+ -+ return 0; -+} -+ -+GLOBAL(int) -+jsimd_can_ycc_rgb565(void) -+{ -+ return 0; -+} -+ -+GLOBAL(void) -+jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, -+ JSAMPIMAGE output_buf, JDIMENSION output_row, -+ int num_rows) -+{ -+ void (*e2kfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int, int); -+ int idx = color_space_idx(cinfo->in_color_space); -+ -+ e2kfct = idx < 16 ? jsimd_rgb3_ycc_convert_e2k : -+ jsimd_rgb4_ycc_convert_e2k; -+ idx &= 15; -+ -+ e2kfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows, idx); -+} -+ -+GLOBAL(void) -+jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, -+ JSAMPIMAGE output_buf, JDIMENSION output_row, -+ int num_rows) -+{ -+ void (*e2kfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int, int); -+ int idx = color_space_idx(cinfo->in_color_space); -+ -+ e2kfct = idx < 16 ? jsimd_rgb3_gray_convert_e2k : -+ jsimd_rgb4_gray_convert_e2k; -+ idx &= 15; -+ -+ e2kfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows, idx); -+} -+ -+GLOBAL(void) -+jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, -+ JDIMENSION input_row, JSAMPARRAY output_buf, -+ int num_rows) -+{ -+ void (*e2kfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int, int); -+ int idx = color_space_idx(cinfo->out_color_space); -+ -+ e2kfct = idx < 16 ? 
jsimd_ycc_rgb3_convert_e2k : -+ jsimd_ycc_rgb4_convert_e2k; -+ idx &= 15; -+ -+ e2kfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows, idx); -+} -+ -+GLOBAL(void) -+jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, -+ JDIMENSION input_row, JSAMPARRAY output_buf, -+ int num_rows) -+{ -+} -+ -+GLOBAL(int) -+jsimd_can_h2v2_downsample(void) -+{ -+ init_simd(); -+ -+ /* The code is optimised for these values only */ -+ if (BITS_IN_JSAMPLE != 8) -+ return 0; -+ -+ if (simd_support & JSIMD_SSE2) -+ return 1; -+ -+ return 0; -+} -+ -+GLOBAL(int) -+jsimd_can_h2v1_downsample(void) -+{ -+ init_simd(); -+ -+ /* The code is optimised for these values only */ -+ if (BITS_IN_JSAMPLE != 8) -+ return 0; -+ -+ if (simd_support & JSIMD_SSE2) -+ return 1; -+ -+ return 0; -+} -+ -+GLOBAL(void) -+jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr, -+ JSAMPARRAY input_data, JSAMPARRAY output_data) -+{ -+ jsimd_h2v2_downsample_e2k(cinfo->image_width, cinfo->max_v_samp_factor, -+ compptr->v_samp_factor, -+ compptr->width_in_blocks, input_data, -+ output_data); -+} -+ -+GLOBAL(void) -+jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr, -+ JSAMPARRAY input_data, JSAMPARRAY output_data) -+{ -+ jsimd_h2v1_downsample_e2k(cinfo->image_width, cinfo->max_v_samp_factor, -+ compptr->v_samp_factor, -+ compptr->width_in_blocks, input_data, -+ output_data); -+} -+ -+GLOBAL(int) -+jsimd_can_h2v2_upsample(void) -+{ -+ init_simd(); -+ -+ /* The code is optimised for these values only */ -+ if (BITS_IN_JSAMPLE != 8) -+ return 0; -+ -+ if (simd_support & JSIMD_SSE2) -+ return 1; -+ -+ return 0; -+} -+ -+GLOBAL(int) -+jsimd_can_h2v1_upsample(void) -+{ -+ init_simd(); -+ -+ /* The code is optimised for these values only */ -+ if (BITS_IN_JSAMPLE != 8) -+ return 0; -+ -+ if (simd_support & JSIMD_SSE2) -+ return 1; -+ -+ return 0; -+} -+ -+GLOBAL(void) -+jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, -+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) -+{ -+ jsimd_h2v2_upsample_e2k(cinfo->max_v_samp_factor, cinfo->output_width, -+ input_data, output_data_ptr); -+} -+ -+GLOBAL(void) -+jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, -+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) -+{ -+ jsimd_h2v1_upsample_e2k(cinfo->max_v_samp_factor, cinfo->output_width, -+ input_data, output_data_ptr); -+} -+ -+GLOBAL(int) -+jsimd_can_h2v2_fancy_upsample(void) -+{ -+ init_simd(); -+ -+ /* The code is optimised for these values only */ -+ if (BITS_IN_JSAMPLE != 8) -+ return 0; -+ -+ if (simd_support & JSIMD_SSE2) -+ return 1; -+ -+ return 0; -+} -+ -+GLOBAL(int) -+jsimd_can_h2v1_fancy_upsample(void) -+{ -+ init_simd(); -+ -+ /* The code is optimised for these values only */ -+ if (BITS_IN_JSAMPLE != 8) -+ return 0; -+ -+ if (simd_support & JSIMD_SSE2) -+ return 1; -+ -+ return 0; -+} -+ -+GLOBAL(void) -+jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, -+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) -+{ -+ jsimd_h2v2_fancy_upsample_e2k(cinfo->max_v_samp_factor, -+ compptr->downsampled_width, input_data, -+ output_data_ptr); -+} -+ -+GLOBAL(void) -+jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, -+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) -+{ -+ jsimd_h2v1_fancy_upsample_e2k(cinfo->max_v_samp_factor, -+ compptr->downsampled_width, input_data, -+ output_data_ptr); -+} -+ -+GLOBAL(int) -+jsimd_can_h2v2_merged_upsample(void) -+{ -+ 
init_simd(); -+ -+ /* The code is optimised for these values only */ -+ if (BITS_IN_JSAMPLE != 8) -+ return 0; -+ -+ if (simd_support & JSIMD_SSE2) -+ return 1; -+ -+ return 0; -+} -+ -+GLOBAL(int) -+jsimd_can_h2v1_merged_upsample(void) -+{ -+ init_simd(); -+ -+ /* The code is optimised for these values only */ -+ if (BITS_IN_JSAMPLE != 8) -+ return 0; -+ -+ if (simd_support & JSIMD_SSE2) -+ return 1; -+ -+ return 0; -+} -+ -+GLOBAL(void) -+jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, -+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf) -+{ -+ void (*e2kfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JDIMENSION, JSAMPARRAY, int); -+ int idx = color_space_idx(cinfo->out_color_space); -+ -+ e2kfct = idx < 16 ? jsimd_ycc_rgb3_merged_upsample_e2k : -+ jsimd_ycc_rgb4_merged_upsample_e2k; -+ idx &= 15; -+ -+ e2kfct(cinfo->output_width, input_buf, in_row_group_ctr, -+ in_row_group_ctr * 2, output_buf, idx); -+ e2kfct(cinfo->output_width, input_buf, in_row_group_ctr, -+ in_row_group_ctr * 2 + 1, output_buf + 1, idx); -+} -+ -+GLOBAL(void) -+jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, -+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf) -+{ -+ void (*e2kfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JDIMENSION, JSAMPARRAY, int); -+ int idx = color_space_idx(cinfo->out_color_space); -+ -+ e2kfct = idx < 16 ? jsimd_ycc_rgb3_merged_upsample_e2k : -+ jsimd_ycc_rgb4_merged_upsample_e2k; -+ idx &= 15; -+ -+ e2kfct(cinfo->output_width, input_buf, in_row_group_ctr, -+ in_row_group_ctr, output_buf, idx); -+} -+ -+GLOBAL(int) -+jsimd_can_convsamp(void) -+{ -+ init_simd(); -+ -+ /* The code is optimised for these values only */ -+ if (DCTSIZE != 8) -+ return 0; -+ if (BITS_IN_JSAMPLE != 8) -+ return 0; -+ if (sizeof(DCTELEM) != 2) -+ return 0; -+ -+ if (simd_support & JSIMD_SSE2) -+ return 1; -+ -+ return 0; -+} -+ -+GLOBAL(int) -+jsimd_can_convsamp_float(void) -+{ -+ init_simd(); -+ -+ /* The code is optimised for these values only */ -+ if (DCTSIZE != 8) -+ return 0; -+ if (BITS_IN_JSAMPLE != 8) -+ return 0; -+ if (sizeof(FAST_FLOAT) != 4) -+ return 0; -+ -+ if (simd_support & JSIMD_SSE2) -+ return 1; -+ -+ return 0; -+} -+ -+GLOBAL(void) -+jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col, -+ DCTELEM *workspace) -+{ -+ jsimd_convsamp_e2k(sample_data, start_col, workspace); -+} -+ -+GLOBAL(void) -+jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col, -+ FAST_FLOAT *workspace) -+{ -+ jsimd_convsamp_float_e2k(sample_data, start_col, workspace); -+} -+ -+GLOBAL(int) -+jsimd_can_fdct_islow(void) -+{ -+ init_simd(); -+ -+ /* The code is optimised for these values only */ -+ if (DCTSIZE != 8) -+ return 0; -+ if (sizeof(DCTELEM) != 2) -+ return 0; -+ -+ if (simd_support & JSIMD_SSE2) -+ return 1; -+ -+ return 0; -+} -+ -+GLOBAL(int) -+jsimd_can_fdct_ifast(void) -+{ -+ init_simd(); -+ -+ /* The code is optimised for these values only */ -+ if (DCTSIZE != 8) -+ return 0; -+ if (sizeof(DCTELEM) != 2) -+ return 0; -+ -+ if (simd_support & JSIMD_SSE2) -+ return 1; -+ -+ return 0; -+} -+ -+GLOBAL(int) -+jsimd_can_fdct_float(void) -+{ -+ init_simd(); -+ -+ /* The code is optimised for these values only */ -+ if (DCTSIZE != 8) -+ return 0; -+ if (sizeof(FAST_FLOAT) != 4) -+ return 0; -+ -+ if (simd_support & JSIMD_SSE2) -+ return 1; -+ -+ return 0; -+} -+ -+GLOBAL(void) -+jsimd_fdct_islow(DCTELEM *data) -+{ -+ jsimd_fdct_islow_e2k(data); -+} -+ -+GLOBAL(void) -+jsimd_fdct_ifast(DCTELEM *data) -+{ -+ jsimd_fdct_ifast_e2k(data); -+} -+ -+GLOBAL(void) 
-+jsimd_fdct_float(FAST_FLOAT *data) -+{ -+ jsimd_fdct_float_e2k(data); -+} -+ -+GLOBAL(int) -+jsimd_can_quantize(void) -+{ -+ init_simd(); -+ -+ /* The code is optimised for these values only */ -+ if (DCTSIZE != 8) -+ return 0; -+ if (sizeof(JCOEF) != 2) -+ return 0; -+ if (sizeof(DCTELEM) != 2) -+ return 0; -+ -+ if (simd_support & JSIMD_SSE2) -+ return 1; -+ -+ return 0; -+} -+ -+GLOBAL(int) -+jsimd_can_quantize_float(void) -+{ -+ init_simd(); -+ -+ /* The code is optimised for these values only */ -+ if (DCTSIZE != 8) -+ return 0; -+ if (sizeof(JCOEF) != 2) -+ return 0; -+ if (sizeof(FAST_FLOAT) != 4) -+ return 0; -+ -+ if (simd_support & JSIMD_SSE2) -+ return 1; -+ -+ return 0; -+} -+ -+GLOBAL(void) -+jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace) -+{ -+ jsimd_quantize_e2k(coef_block, divisors, workspace); -+} -+ -+GLOBAL(void) -+jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors, -+ FAST_FLOAT *workspace) -+{ -+ jsimd_quantize_float_e2k(coef_block, divisors, workspace); -+} -+ -+GLOBAL(int) -+jsimd_can_idct_2x2(void) -+{ -+ return 0; -+} -+ -+GLOBAL(int) -+jsimd_can_idct_4x4(void) -+{ -+ return 0; -+} -+ -+GLOBAL(void) -+jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr, -+ JCOEFPTR coef_block, JSAMPARRAY output_buf, -+ JDIMENSION output_col) -+{ -+} -+ -+GLOBAL(void) -+jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr, -+ JCOEFPTR coef_block, JSAMPARRAY output_buf, -+ JDIMENSION output_col) -+{ -+} -+ -+GLOBAL(int) -+jsimd_can_idct_islow(void) -+{ -+ init_simd(); -+ -+ /* The code is optimised for these values only */ -+ if (DCTSIZE != 8) -+ return 0; -+ if (sizeof(JCOEF) != 2) -+ return 0; -+ if (BITS_IN_JSAMPLE != 8) -+ return 0; -+ if (sizeof(ISLOW_MULT_TYPE) != 2) -+ return 0; -+ -+ if (simd_support & JSIMD_SSE2) -+ return 1; -+ -+ return 0; -+} -+ -+GLOBAL(int) -+jsimd_can_idct_ifast(void) -+{ -+ init_simd(); -+ -+ /* The code is optimised for these values only */ -+ if (DCTSIZE != 8) -+ return 0; -+ if (sizeof(JCOEF) != 2) -+ return 0; -+ if (BITS_IN_JSAMPLE != 8) -+ return 0; -+ if (sizeof(IFAST_MULT_TYPE) != 2) -+ return 0; -+ if (IFAST_SCALE_BITS != 2) -+ return 0; -+ -+ if (simd_support & JSIMD_SSE2) -+ return 1; -+ -+ return 0; -+} -+ -+GLOBAL(int) -+jsimd_can_idct_float(void) -+{ -+ init_simd(); -+ -+ /* The code is optimised for these values only */ -+ if (DCTSIZE != 8) -+ return 0; -+ if (sizeof(JCOEF) != 2) -+ return 0; -+ if (BITS_IN_JSAMPLE != 8) -+ return 0; -+ if (sizeof(FAST_FLOAT) != 4) -+ return 0; -+ if (sizeof(FLOAT_MULT_TYPE) != 4) -+ return 0; -+ -+ if (simd_support & JSIMD_SSE2) -+ return 1; -+ -+ return 0; -+} -+ -+GLOBAL(void) -+jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr, -+ JCOEFPTR coef_block, JSAMPARRAY output_buf, -+ JDIMENSION output_col) -+{ -+ jsimd_idct_islow_e2k(compptr->dct_table, coef_block, output_buf, -+ output_col); -+} -+ -+GLOBAL(void) -+jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr, -+ JCOEFPTR coef_block, JSAMPARRAY output_buf, -+ JDIMENSION output_col) -+{ -+ jsimd_idct_ifast_e2k(compptr->dct_table, coef_block, output_buf, -+ output_col); -+} -+ -+GLOBAL(void) -+jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr, -+ JCOEFPTR coef_block, JSAMPARRAY output_buf, -+ JDIMENSION output_col) -+{ -+ jsimd_idct_float_e2k(compptr->dct_table, coef_block, output_buf, -+ output_col); -+} -+ -+GLOBAL(int) -+jsimd_can_huff_encode_one_block(void) -+{ -+ init_simd(); -+ -+ if (DCTSIZE != 8) -+ return 0; -+ 
if (sizeof(JCOEF) != 2) -+ return 0; -+ -+ if ((simd_support & JSIMD_SSE2) && simd_huffman) -+ return 1; -+ -+ return 0; -+} -+ -+GLOBAL(JOCTET *) -+jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block, -+ int last_dc_val, c_derived_tbl *dctbl, -+ c_derived_tbl *actbl) -+{ -+ return jsimd_huff_encode_one_block_e2k(state, buffer, block, last_dc_val, -+ dctbl, actbl); -+} -+ -+GLOBAL(int) -+jsimd_can_encode_mcu_AC_first_prepare(void) -+{ -+ init_simd(); -+ -+ if (DCTSIZE != 8) -+ return 0; -+ if (sizeof(JCOEF) != 2) -+ return 0; -+ if ((simd_support & JSIMD_SSE2) && simd_huffman) -+ return 1; -+ -+ return 0; -+} -+ -+GLOBAL(void) -+jsimd_encode_mcu_AC_first_prepare(const JCOEF *block, -+ const int *jpeg_natural_order_start, int Sl, -+ int Al, UJCOEF *values, size_t *zerobits) -+{ -+ jsimd_encode_mcu_AC_first_prepare_e2k(block, jpeg_natural_order_start, -+ Sl, Al, (JCOEF*)values, zerobits); -+} -+ -+GLOBAL(int) -+jsimd_can_encode_mcu_AC_refine_prepare(void) -+{ -+ init_simd(); -+ -+ if (DCTSIZE != 8) -+ return 0; -+ if (sizeof(JCOEF) != 2) -+ return 0; -+ if ((simd_support & JSIMD_SSE2) && simd_huffman) -+ return 1; -+ -+ return 0; -+} -+ -+GLOBAL(int) -+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block, -+ const int *jpeg_natural_order_start, int Sl, -+ int Al, UJCOEF *absvalues, size_t *bits) -+{ -+ return jsimd_encode_mcu_AC_refine_prepare_e2k(block, -+ jpeg_natural_order_start, -+ Sl, Al, (JCOEF*)absvalues, bits); -+} -diff --git a/simd/e2k/jsimd_api_e2k.h b/simd/e2k/jsimd_api_e2k.h -new file mode 100644 -index 0000000..d857203 ---- /dev/null -+++ b/simd/e2k/jsimd_api_e2k.h -@@ -0,0 +1,94 @@ -+/* -+ * Elbrus optimizations for libjpeg-turbo -+ * -+ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd -+ * -+ * This software is provided 'as-is', without any express or implied -+ * warranty. In no event will the authors be held liable for any damages -+ * arising from the use of this software. -+ * -+ * Permission is granted to anyone to use this software for any purpose, -+ * including commercial applications, and to alter it and redistribute it -+ * freely, subject to the following restrictions: -+ * -+ * 1. The origin of this software must not be misrepresented; you must not -+ * claim that you wrote the original software. If you use this software -+ * in a product, an acknowledgment in the product documentation would be -+ * appreciated but is not required. -+ * 2. Altered source versions must be plainly marked as such, and must not be -+ * misrepresented as being the original software. -+ * 3. This notice may not be removed or altered from any source distribution. 
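color_space_idx() in jsimd.c above packs two facts into one integer: bits 0-3 select the channel-order shuffle passed down to the kernel, and bit 4 records whether pixels are 4 bytes wide, which the wrappers test with idx < 16 before masking it off with idx &= 15. A standalone model of that encoding (names and values illustrative):

    #include <stdio.h>

    static int encode_idx(int shuf, int pixel_size)
    {
        return shuf + (pixel_size != 3) * 16;  /* bit 4 = "4-byte pixels" */
    }

    int main(void)
    {
        int idx = encode_idx(2, 4);  /* e.g. an RGBX-style layout */
        printf("pixel width %d, shuffle index %d\n",
               idx < 16 ? 3 : 4, idx & 15);  /* -> width 4, shuffle 2 */
        return 0;
    }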
-+ */ -+ -+/* Function declarations */ -+ -+#define CONVERT_DECL(n) \ -+EXTERN(void) jsimd_rgb##n##_ycc_convert_e2k(JDIMENSION img_width, \ -+ JSAMPARRAY input_buf, JSAMPIMAGE output_buf, \ -+ JDIMENSION output_row, int num_rows, int shuf_idx); \ -+EXTERN(void) jsimd_rgb##n##_gray_convert_e2k(JDIMENSION img_width, \ -+ JSAMPARRAY input_buf, JSAMPIMAGE output_buf, \ -+ JDIMENSION output_row, int num_rows, int shuf_idx); \ -+EXTERN(void) jsimd_ycc_rgb##n##_convert_e2k(JDIMENSION out_width, \ -+ JSAMPIMAGE input_buf, JDIMENSION input_row, \ -+ JSAMPARRAY output_buf, int num_rows, int shuf_idx); \ -+EXTERN(void) jsimd_ycc_rgb##n##_convert_e2k(JDIMENSION out_width, \ -+ JSAMPIMAGE input_buf, JDIMENSION input_row, JSAMPARRAY output_buf, \ -+ int num_rows, int shuf_idx); \ -+EXTERN(void) jsimd_ycc_rgb##n##_merged_upsample_e2k(JDIMENSION out_width, \ -+ JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, \ -+ JDIMENSION in_row_group_ctr_y, JSAMPARRAY output_buf, int shuf_idx); \ -+ -+CONVERT_DECL(3) -+CONVERT_DECL(4) -+ -+EXTERN(void) jsimd_h2v1_downsample_e2k -+ (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor, -+ JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data); -+EXTERN(void) jsimd_h2v2_downsample_e2k -+ (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor, -+ JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data); -+ -+#define UPSAMPLE_DECL(name) \ -+EXTERN(void) jsimd_##name##_upsample_e2k \ -+ (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data, \ -+ JSAMPARRAY *output_data_ptr); -+ -+UPSAMPLE_DECL(h2v1) -+UPSAMPLE_DECL(h2v2) -+UPSAMPLE_DECL(h2v1_fancy) -+UPSAMPLE_DECL(h2v2_fancy) -+ -+EXTERN(void) jsimd_convsamp_e2k -+ (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace); -+EXTERN(void) jsimd_convsamp_float_e2k -+ (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace); -+ -+EXTERN(void) jsimd_fdct_islow_e2k(DCTELEM *data); -+EXTERN(void) jsimd_fdct_ifast_e2k(DCTELEM *data); -+EXTERN(void) jsimd_fdct_float_e2k(FAST_FLOAT *data); -+EXTERN(void) jsimd_quantize_e2k -+ (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace); -+EXTERN(void) jsimd_quantize_float_e2k -+ (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace); -+EXTERN(void) jsimd_idct_islow_e2k -+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, -+ JDIMENSION output_col); -+EXTERN(void) jsimd_idct_ifast_e2k -+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, -+ JDIMENSION output_col); -+EXTERN(void) jsimd_idct_float_e2k -+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, -+ JDIMENSION output_col); -+ -+EXTERN(JOCTET *) jsimd_huff_encode_one_block_e2k -+ (void *state, JOCTET *buffer, JCOEFPTR block, -+ int last_dc_val, c_derived_tbl *dctbl, c_derived_tbl *actbl); -+ -+EXTERN(void) jsimd_encode_mcu_AC_first_prepare_e2k -+ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al, -+ JCOEF *values, size_t *zerobits); -+ -+EXTERN(int) jsimd_encode_mcu_AC_refine_prepare_e2k -+ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al, -+ JCOEF *absvalues, size_t *bits); -diff --git a/simd/e2k/jsimd_e2k.h b/simd/e2k/jsimd_e2k.h -new file mode 100644 -index 0000000..15d6262 ---- /dev/null -+++ b/simd/e2k/jsimd_e2k.h -@@ -0,0 +1,207 @@ -+/* -+ * Elbrus optimizations for libjpeg-turbo -+ * -+ * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved. 
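The declaration blocks in jsimd_api_e2k.h above are stamped out by token-pasting macros (CONVERT_DECL, UPSAMPLE_DECL) so the 3-byte and 4-byte pixel variants cannot drift apart. A minimal model of the pattern with illustrative names:

    #include <stdio.h>

    #define CONVERT_DEF(n) \
    static void rgb##n##_to_ycc(const unsigned char *pix, int width) \
    { (void)pix; printf("rgb%d_to_ycc: width %d\n", n, width); }

    CONVERT_DEF(3)  /* defines rgb3_to_ycc */
    CONVERT_DEF(4)  /* defines rgb4_to_ycc */

    int main(void)
    {
        unsigned char pix[4] = { 0, 0, 0, 0 };
        rgb3_to_ycc(pix, 1);
        rgb4_to_ycc(pix, 1);
        return 0;
    }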
-+ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd
-+ *
-+ * This software is provided 'as-is', without any express or implied
-+ * warranty. In no event will the authors be held liable for any damages
-+ * arising from the use of this software.
-+ *
-+ * Permission is granted to anyone to use this software for any purpose,
-+ * including commercial applications, and to alter it and redistribute it
-+ * freely, subject to the following restrictions:
-+ *
-+ * 1. The origin of this software must not be misrepresented; you must not
-+ * claim that you wrote the original software. If you use this software
-+ * in a product, an acknowledgment in the product documentation would be
-+ * appreciated but is not required.
-+ * 2. Altered source versions must be plainly marked as such, and must not be
-+ * misrepresented as being the original software.
-+ * 3. This notice may not be removed or altered from any source distribution.
-+ */
-+
-+#define JPEG_INTERNALS
-+#include "../../jinclude.h"
-+#include "../../jpeglib.h"
-+#include "../../jsimd.h"
-+#include "../../jdct.h"
-+#include "../../jsimddct.h"
-+#include "../jsimd.h"
-+#include "jsimd_api_e2k.h"
-+#include <emmintrin.h>
-+#include <smmintrin.h> /* SSE4.1 */
-+
-+
-+/* Common code */
-+
-+#define __4X2(a, b) a, b, a, b, a, b, a, b
-+#define ALWAYS_INLINE __attribute__((__always_inline__)) inline
-+
-+#ifdef __e2k__
-+#define PRAGMA_E2K _Pragma
-+#define _mm_shuffle2_pi8(a, b, c) \
-+ ((__m64)__builtin_e2k_pshufb((uint64_t)(b), (uint64_t)(a), (uint64_t)(c)))
-+#define _mm_shuffle2_epi8(a, b, c) \
-+ ((__m128i)__builtin_e2k_qppermb((__v2di)(b), (__v2di)(a), (__v2di)(c)))
-+#define _mm_blendv_pi8(a, b, c) \
-+ ((__m64)__builtin_e2k_pmerge((uint64_t)(a), (uint64_t)(b), (uint64_t)(c)))
-+#else
-+#define PRAGMA_E2K(x)
-+#define _mm_shuffle2_pi8(a, b, c) \
-+ _mm_movepi64_pi64(_mm_shuffle_epi8(_mm_unpacklo_epi64( \
-+ _mm_movpi64_epi64(a), _mm_movpi64_epi64(b)), _mm_movpi64_epi64(c)))
-+#define _mm_shuffle2_epi8(a, b, c) \
-+ _mm_blendv_epi8(_mm_shuffle_epi8(a, c), _mm_shuffle_epi8(b, c), \
-+ _mm_slli_epi16(c, 3))
-+#define _mm_blendv_pi8(a, b, c) \
-+ _mm_movepi64_pi64(_mm_blendv_epi8(_mm_movpi64_epi64(a), \
-+ _mm_movpi64_epi64(b), _mm_movpi64_epi64(c)))
-+
-+#define BITREV_ROUND(c, i) a = (a & c) << i | (a >> i & c);
-+static ALWAYS_INLINE uint64_t __builtin_e2k_bitrevd(uint64_t a) {
-+ BITREV_ROUND(0x5555555555555555ll, 1)
-+ BITREV_ROUND(0x3333333333333333ll, 2)
-+ BITREV_ROUND(0x0F0F0F0F0F0F0F0Fll, 4)
-+ BITREV_ROUND(0x00FF00FF00FF00FFll, 8)
-+ BITREV_ROUND(0x0000FFFF0000FFFFll, 16)
-+ return a >> 32 | a << 32;
-+}
-+
-+static ALWAYS_INLINE uint64_t __builtin_e2k_insfd(uint64_t a, uint64_t b, uint64_t c) {
-+ int n = b & 63;
-+ a = a >> n | a << (64 - n);
-+ return c ^ ((a ^ c) & (~0ll << (b >> 6 & 63)));
-+}
-+#endif
-+
-+#if defined(__iset__) && __iset__ >= 5
-+static ALWAYS_INLINE __m128i _mm_packhi_epi32(__m128i a, __m128i b) {
-+ __m128i index = _mm_setr_epi8(
-+ 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31);
-+ return _mm_shuffle2_epi8(a, b, index);
-+}
-+
-+#define VEC_ISZERO(a) !_mm_cvtsi128_si64(_mm_packs_epi16(a, a))
-+#else
-+static ALWAYS_INLINE __m128i _mm_packhi_epi32(__m128i a, __m128i b) {
-+ union { __m128i v; __m64 d[2]; } l = { a }, h = { b }, x;
-+ __m64 index = _mm_setr_pi8(2, 3, 6, 7, 10, 11, 14, 15);
-+ x.d[0] = _mm_shuffle2_pi8(l.d[0], l.d[1], index);
-+ x.d[1] = _mm_shuffle2_pi8(h.d[0], h.d[1], index);
-+ return x.v;
-+}
-+
-+static ALWAYS_INLINE uint64_t vec_isnonzero(__m128i a) {
-+ __v2di x = (__v2di)a;
-+ return x[0] |
x[1]; -+} -+ -+#define VEC_ISZERO(a) !vec_isnonzero(a) -+#endif -+ -+#define VEC_ALIGNR8(a, b) _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(b), _mm_castsi128_pd(a), 1)) -+ -+#define TRANSPOSE_FLOAT(a, b, c, d, e, f, g, h) \ -+ tmp0 = _mm_unpacklo_ps(a, b); \ -+ tmp1 = _mm_unpackhi_ps(a, b); \ -+ tmp2 = _mm_unpacklo_ps(c, d); \ -+ tmp3 = _mm_unpackhi_ps(c, d); \ -+ e = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); \ -+ f = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); \ -+ g = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3))); \ -+ h = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3))); -+ -+#define TRANSPOSE8(a, b) \ -+ b##0 = _mm_unpacklo_epi8(a##0, a##2); \ -+ b##1 = _mm_unpackhi_epi8(a##0, a##2); \ -+ b##2 = _mm_unpacklo_epi8(a##1, a##3); \ -+ b##3 = _mm_unpackhi_epi8(a##1, a##3); -+ -+#define TRANSPOSE16(a, b) \ -+ b##0 = _mm_unpacklo_epi16(a##0, a##2); \ -+ b##1 = _mm_unpackhi_epi16(a##0, a##2); \ -+ b##2 = _mm_unpacklo_epi16(a##1, a##3); \ -+ b##3 = _mm_unpackhi_epi16(a##1, a##3); \ -+ b##4 = _mm_unpacklo_epi16(a##4, a##6); \ -+ b##5 = _mm_unpackhi_epi16(a##4, a##6); \ -+ b##6 = _mm_unpacklo_epi16(a##5, a##7); \ -+ b##7 = _mm_unpackhi_epi16(a##5, a##7); -+ -+#define TRANSPOSE(a, b) \ -+ TRANSPOSE16(a, b) TRANSPOSE16(b, a) \ -+ b##0 = _mm_unpacklo_epi64(a##0, a##4); \ -+ b##1 = _mm_unpackhi_epi64(a##0, a##4); \ -+ b##2 = _mm_unpacklo_epi64(a##1, a##5); \ -+ b##3 = _mm_unpackhi_epi64(a##1, a##5); \ -+ b##4 = _mm_unpacklo_epi64(a##2, a##6); \ -+ b##5 = _mm_unpackhi_epi64(a##2, a##6); \ -+ b##6 = _mm_unpacklo_epi64(a##3, a##7); \ -+ b##7 = _mm_unpackhi_epi64(a##3, a##7); -+ -+#define IDCT_SAVE() { \ -+ __m128i pb_cj = _mm_set1_epi8((int8_t)CENTERJSAMPLE); \ -+ \ -+ row0 = _mm_xor_si128(_mm_packs_epi16(out0, out1), pb_cj); \ -+ row1 = _mm_xor_si128(_mm_packs_epi16(out2, out3), pb_cj); \ -+ row2 = _mm_xor_si128(_mm_packs_epi16(out4, out5), pb_cj); \ -+ row3 = _mm_xor_si128(_mm_packs_epi16(out6, out7), pb_cj); \ -+ \ -+ TRANSPOSE8(row, col) TRANSPOSE8(col, row) TRANSPOSE8(row, col) \ -+ \ -+ VEC_STL(output_buf[0] + output_col, col0); \ -+ VEC_STH(output_buf[1] + output_col, col0); \ -+ VEC_STL(output_buf[2] + output_col, col1); \ -+ VEC_STH(output_buf[3] + output_col, col1); \ -+ VEC_STL(output_buf[4] + output_col, col2); \ -+ VEC_STH(output_buf[5] + output_col, col2); \ -+ VEC_STL(output_buf[6] + output_col, col3); \ -+ VEC_STH(output_buf[7] + output_col, col3); \ -+} -+ -+#define IDCT_SPLAT8(col0) { \ -+ row3 = _mm_unpacklo_epi16(col0, col0); \ -+ row7 = _mm_unpackhi_epi16(col0, col0); \ -+ row1 = _mm_unpacklo_epi16(row3, row3); \ -+ row3 = _mm_unpackhi_epi16(row3, row3); \ -+ row5 = _mm_unpacklo_epi16(row7, row7); \ -+ row7 = _mm_unpackhi_epi16(row7, row7); \ -+ row0 = _mm_unpacklo_epi64(row1, row1); \ -+ row1 = _mm_unpackhi_epi64(row1, row1); \ -+ row2 = _mm_unpacklo_epi64(row3, row3); \ -+ row3 = _mm_unpackhi_epi64(row3, row3); \ -+ row4 = _mm_unpacklo_epi64(row5, row5); \ -+ row5 = _mm_unpackhi_epi64(row5, row5); \ -+ row6 = _mm_unpacklo_epi64(row7, row7); \ -+ row7 = _mm_unpackhi_epi64(row7, row7); \ -+} -+ -+#ifndef min -+#define min(a, b) ((a) < (b) ? 
(a) : (b)) -+#endif -+ -+#define VEC_LD(a) _mm_loadu_si128((const __m128i*)(a)) -+#define VEC_ST(a, b) _mm_storeu_si128((__m128i*)(a), b) -+#define VEC_LD8(a) _mm_loadl_epi64((const __m128i*)(a)) -+#define VEC_STL(a, b) _mm_storel_epi64((__m128i*)(a), b) -+#define VEC_STH(a, b) _mm_storeh_pd((double*)(a), _mm_castsi128_pd(b)); -+#define VEC_SPLAT(v, i) _mm_shuffle_epi8(v, _mm_set1_epi16((i) * 2 | ((i) * 2 + 1) << 8)) -+ -+#if !defined(__iset__) || __iset__ < 5 -+#define NEED_ALIGN8 -+#define ALIGN8_COMMON uint64_t src_shr; __m64 src_tmp0, src_tmp1; -+#define ALIGN8_VARS(src) __m64 *src##_ptr, src##_next, src##_index; -+#define ALIGN8_START(ptr, src) \ -+ src_shr = (intptr_t)(ptr - 1) & 7; \ -+ src##_ptr = (__m64*)((intptr_t)(ptr - 1) & -8); \ -+ src##_next = src##_ptr[src_shr == 7]; \ -+ src##_index = _mm_add_pi8(_mm_set1_pi8(src_shr), \ -+ _mm_setr_pi8(1, 2, 3, 4, 5, 6, 7, 8)); -+#define ALIGN8_READ16(v0, src, i) \ -+ src_tmp1 = src##_ptr[i * 2 + 1]; \ -+ src_tmp0 = _mm_shuffle2_pi8(src##_next, src_tmp1, src##_index); \ -+ src##_next = src##_ptr[i * 2 + 2]; \ -+ src_tmp1 = _mm_shuffle2_pi8(src_tmp1, src##_next, src##_index); \ -+ v0 = _mm_setr_epi64(src_tmp0, src_tmp1); -+#endif -+ --- -2.34.1 - diff --git a/libjpeg-turbo.spec b/libjpeg-turbo.spec index 751c31c..ebcd6c1 100644 --- a/libjpeg-turbo.spec +++ b/libjpeg-turbo.spec @@ -12,12 +12,12 @@ Summary: A MMX/SSE2 accelerated library for manipulating JPEG image files Name: libjpeg-turbo Epoch: 1 -Version: 3.0.2 +Version: 3.0.3 Release: 1 License: wxWindows Library License Group: Graphics Url: http://www.libjpeg-turbo.org -Source0: https://sourceforge.net/projects/libjpeg-turbo/files/%{version}/%{name}-%{version}.tar.gz +Source0: https://github.com/libjpeg-turbo/libjpeg-turbo/archive/refs/tags/%{version}.tar.gz # These two allow automatic lossless rotation of JPEG images from a digital # camera which have orientation markings in the EXIF data. 
After rotation # the orientation markings are reset to avoid duplicate rotation when From e8720c9ebacda09eeb63d7c8b84cfa658ca4d029 Mon Sep 17 00:00:00 2001 From: Alexander Stefanov Date: Sat, 18 May 2024 13:05:28 +0000 Subject: [PATCH 29/32] fix e2k patch --- libjpeg-turbo-3.0.2-e2k.patch | 4840 +++++++++++++++++++++++++++++++++ 1 file changed, 4840 insertions(+) diff --git a/libjpeg-turbo-3.0.2-e2k.patch b/libjpeg-turbo-3.0.2-e2k.patch index 888624d..dd7670b 100644 --- a/libjpeg-turbo-3.0.2-e2k.patch +++ b/libjpeg-turbo-3.0.2-e2k.patch @@ -1,3 +1,55 @@ +From f922bcd2542146dfce9bf3a5e49892c69fe1b0a4 Mon Sep 17 00:00:00 2001 +From: Alexander Stefanov +Date: Sat, 18 May 2024 13:04:48 +0000 +Subject: [PATCH] add e2k port + +--- + CMakeLists.txt | 5 + + simd/CMakeLists.txt | 23 ++ + simd/e2k/jccolext-e2k.c | 213 +++++++++++ + simd/e2k/jccolor-e2k.c | 163 +++++++++ + simd/e2k/jchuff-e2k.c | 307 ++++++++++++++++ + simd/e2k/jcphuff-e2k.c | 145 ++++++++ + simd/e2k/jcsample-e2k.c | 203 +++++++++++ + simd/e2k/jcsample.h | 28 ++ + simd/e2k/jdcolext-e2k.c | 258 +++++++++++++ + simd/e2k/jdcolor-e2k.c | 289 +++++++++++++++ + simd/e2k/jdcoltab-e2k.c | 80 ++++ + simd/e2k/jdsample-e2k.c | 389 ++++++++++++++++++++ + simd/e2k/jfdctflt-e2k.c | 127 +++++++ + simd/e2k/jfdctfst-e2k.c | 145 ++++++++ + simd/e2k/jfdctint-e2k.c | 255 +++++++++++++ + simd/e2k/jidctflt-e2k.c | 215 +++++++++++ + simd/e2k/jidctfst-e2k.c | 187 ++++++++++ + simd/e2k/jidctint-e2k.c | 294 +++++++++++++++ + simd/e2k/jquantf-e2k.c | 121 +++++++ + simd/e2k/jquanti-e2k.c | 178 +++++++++ + simd/e2k/jsimd.c | 761 +++++++++++++++++++++++++++++++++++++++ + simd/e2k/jsimd_api_e2k.h | 94 +++++ + simd/e2k/jsimd_e2k.h | 207 +++++++++++ + 23 files changed, 4687 insertions(+) + create mode 100644 simd/e2k/jccolext-e2k.c + create mode 100644 simd/e2k/jccolor-e2k.c + create mode 100644 simd/e2k/jchuff-e2k.c + create mode 100644 simd/e2k/jcphuff-e2k.c + create mode 100644 simd/e2k/jcsample-e2k.c + create mode 100644 simd/e2k/jcsample.h + create mode 100644 simd/e2k/jdcolext-e2k.c + create mode 100644 simd/e2k/jdcolor-e2k.c + create mode 100644 simd/e2k/jdcoltab-e2k.c + create mode 100644 simd/e2k/jdsample-e2k.c + create mode 100644 simd/e2k/jfdctflt-e2k.c + create mode 100644 simd/e2k/jfdctfst-e2k.c + create mode 100644 simd/e2k/jfdctint-e2k.c + create mode 100644 simd/e2k/jidctflt-e2k.c + create mode 100644 simd/e2k/jidctfst-e2k.c + create mode 100644 simd/e2k/jidctint-e2k.c + create mode 100644 simd/e2k/jquantf-e2k.c + create mode 100644 simd/e2k/jquanti-e2k.c + create mode 100644 simd/e2k/jsimd.c + create mode 100644 simd/e2k/jsimd_api_e2k.h + create mode 100644 simd/e2k/jsimd_e2k.h + diff --git a/CMakeLists.txt b/CMakeLists.txt index ff9c9c2..fa0364c 100644 --- a/CMakeLists.txt @@ -55,3 +107,4791 @@ index 0237955..e61baea 100644 ############################################################################### # None +diff --git a/simd/e2k/jccolext-e2k.c b/simd/e2k/jccolext-e2k.c +new file mode 100644 +index 0000000..49abdb4 +--- /dev/null ++++ b/simd/e2k/jccolext-e2k.c +@@ -0,0 +1,213 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved. ++ * Copyright (C) 2014, Jay Foad. All Rights Reserved. ++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. 
++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. ++ */ ++ ++/* This file is included by jccolor-e2k.c */ ++ ++void rgbn_ycc_convert(JDIMENSION img_width, JSAMPARRAY input_buf, ++ JSAMPIMAGE output_buf, JDIMENSION output_row, ++ int num_rows, int shuf_idx) ++{ ++ JSAMPROW inptr, outptr0, outptr1, outptr2; ++ unsigned char __attribute__((aligned(16))) tmpbuf[PIXELSIZE * 16]; ++ ++ __m128i pb_zero = _mm_setzero_si128(); ++ __m128i pb_shuf0 = VEC_LD(rgb_ycc_shuf_const[shuf_idx]); ++#if PIXELSIZE == 4 ++ __m128i rgb3 = pb_zero; ++#else ++ __m128i pb_shuf4 = VEC_LD(rgb_ycc_shuf_const[shuf_idx] + 16); ++#endif ++ __m128i rgb0, rgb1 = pb_zero, rgb2 = pb_zero, ++ rgbg0, rgbg1, rgbg2, rgbg3, rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3; ++ __m128i y, yl, yh, y0, y1, y2, y3; ++ __m128i cb, cr, crl, crh, cbl, cbh; ++ __m128i cr0, cr1, cr2, cr3, cb0, cb1, cb2, cb3; ++ ++ /* Constants */ ++ __m128i pw_f0299_f0337 = _mm_setr_epi16(__4X2(F_0_299, F_0_337)), ++ pw_f0114_f0250 = _mm_setr_epi16(__4X2(F_0_114, F_0_250)), ++ pw_mf016_mf033 = _mm_setr_epi16(__4X2(-F_0_168, -F_0_331)), ++ pw_mf008_mf041 = _mm_setr_epi16(__4X2(-F_0_081, -F_0_418)), ++ pw_mf050_f000 = _mm_setr_epi16(__4X2(-F_0_500, 0)), ++ pd_onehalf = _mm_set1_epi32(ONE_HALF), ++ pd_onehalfm1_cj = _mm_set1_epi32(ONE_HALF - 1 + (CENTERJSAMPLE << SCALEBITS)); ++#ifdef NEED_ALIGN8 ++ ALIGN8_COMMON ++ ALIGN8_VARS(src) ++#endif ++ ++ if (img_width > 0) ++ while (--num_rows >= 0) { ++ int num_cols; ++ inptr = *input_buf++; ++ outptr0 = output_buf[0][output_row]; ++ outptr1 = output_buf[1][output_row]; ++ outptr2 = output_buf[2][output_row]; ++ output_row++; ++ ++ if (img_width >= 16) { ++#ifdef NEED_ALIGN8 ++ ALIGN8_START(inptr, src) ++ inptr += (img_width & -16) * PIXELSIZE; ++#endif ++ ++ PRAGMA_E2K("ivdep") ++ for (num_cols = img_width; num_cols >= 16; num_cols -= 16, ++ outptr0 += 16, outptr1 += 16, outptr2 += 16) { ++#ifdef NEED_ALIGN8 ++ ALIGN8_READ16(rgb0, src, 0) ++ ALIGN8_READ16(rgb1, src, 1) ++ ALIGN8_READ16(rgb2, src, 2) ++#if PIXELSIZE == 4 ++ ALIGN8_READ16(rgb3, src, 3) ++#endif ++ src_ptr += PIXELSIZE * 2; ++#else ++ rgb0 = VEC_LD(inptr); ++ rgb1 = VEC_LD(inptr + 16); ++ rgb2 = VEC_LD(inptr + 32); ++#if PIXELSIZE == 4 ++ rgb3 = VEC_LD(inptr + 48); ++#endif ++ inptr += PIXELSIZE * 16; ++#endif ++ RGB_SHUFFLE ++ CALC_Y(outptr0) ++ CALC_CC(outptr1, outptr2) ++ } ++ } ++ ++ num_cols = img_width & 15; ++ if (num_cols) { ++ int i; ++ memcpy(tmpbuf, inptr, num_cols * PIXELSIZE); ++ rgb0 = VEC_LD(tmpbuf); ++ rgb1 = VEC_LD(tmpbuf + 16); ++ rgb2 = VEC_LD(tmpbuf + 32); ++#if PIXELSIZE == 4 ++ rgb3 = VEC_LD(tmpbuf + 48); ++#endif ++ RGB_SHUFFLE ++ CALC_Y(tmpbuf) ++ CALC_CC(tmpbuf + 16, tmpbuf + 32) ++ ++ for (i = 0; i < num_cols; i++) { ++ outptr0[i] = tmpbuf[i]; ++ outptr1[i] = tmpbuf[i + 16]; ++ outptr2[i] = tmpbuf[i + 32]; ++ } ++ } ++ } ++} ++ ++void rgbn_gray_convert(JDIMENSION img_width, JSAMPARRAY input_buf, ++ JSAMPIMAGE 
output_buf, JDIMENSION output_row, ++ int num_rows, int shuf_idx) ++{ ++ JSAMPROW inptr, outptr; ++ uint8_t __attribute__((aligned(16))) tmpbuf[PIXELSIZE * 16]; ++ ++ __m128i pb_zero = _mm_setzero_si128(); ++ __m128i pb_shuf0 = VEC_LD(rgb_ycc_shuf_const[shuf_idx]); ++#if PIXELSIZE == 4 ++ __m128i rgb3 = pb_zero; ++#else ++ __m128i pb_shuf4 = VEC_LD(rgb_ycc_shuf_const[shuf_idx] + 16); ++#endif ++ __m128i rgb0, rgb1 = pb_zero, rgb2 = pb_zero, ++ rgbg0, rgbg1, rgbg2, rgbg3, rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3; ++ __m128i y, yl, yh, y0, y1, y2, y3; ++ ++ /* Constants */ ++ __m128i pw_f0299_f0337 = _mm_setr_epi16(__4X2(F_0_299, F_0_337)), ++ pw_f0114_f0250 = _mm_setr_epi16(__4X2(F_0_114, F_0_250)), ++ pd_onehalf = _mm_set1_epi32(ONE_HALF); ++#ifdef NEED_ALIGN8 ++ ALIGN8_COMMON ++ ALIGN8_VARS(src) ++#endif ++ ++ if (img_width > 0) ++ while (--num_rows >= 0) { ++ int num_cols; ++ inptr = *input_buf++; ++ outptr = output_buf[0][output_row]; ++ output_row++; ++ ++ if (img_width >= 16) { ++#ifdef NEED_ALIGN8 ++ ALIGN8_START(inptr, src) ++ inptr += (img_width & -16) * PIXELSIZE; ++#endif ++ ++ PRAGMA_E2K("ivdep") ++ for (num_cols = img_width; num_cols >= 16; num_cols -= 16, ++ outptr += 16) { ++#ifdef NEED_ALIGN8 ++ ALIGN8_READ16(rgb0, src, 0) ++ ALIGN8_READ16(rgb1, src, 1) ++ ALIGN8_READ16(rgb2, src, 2) ++#if PIXELSIZE == 4 ++ ALIGN8_READ16(rgb3, src, 3) ++#endif ++ src_ptr += PIXELSIZE * 2; ++#else ++ rgb0 = VEC_LD(inptr); ++ rgb1 = VEC_LD(inptr + 16); ++ rgb2 = VEC_LD(inptr + 32); ++#if PIXELSIZE == 4 ++ rgb3 = VEC_LD(inptr + 48); ++#endif ++ inptr += PIXELSIZE * 16; ++#endif ++ RGB_SHUFFLE ++ CALC_Y(outptr) ++ } ++ } ++ ++ num_cols = img_width & 15; ++ if (num_cols) { ++ int i; ++ memcpy(tmpbuf, inptr, num_cols * PIXELSIZE); ++ rgb0 = VEC_LD(tmpbuf); ++ rgb1 = VEC_LD(tmpbuf + 16); ++ rgb2 = VEC_LD(tmpbuf + 32); ++#if PIXELSIZE == 4 ++ rgb3 = VEC_LD(tmpbuf + 48); ++#endif ++ RGB_SHUFFLE ++ CALC_Y(tmpbuf) ++ ++ for (i = 0; i < num_cols; i++) { ++ outptr[i] = tmpbuf[i]; ++ } ++ } ++ } ++} ++ ++#undef RGB_SHUFFLE ++#undef PIXELSIZE ++#undef rgbn_ycc_convert ++#undef rgbn_gray_convert ++ +diff --git a/simd/e2k/jccolor-e2k.c b/simd/e2k/jccolor-e2k.c +new file mode 100644 +index 0000000..0af2626 +--- /dev/null ++++ b/simd/e2k/jccolor-e2k.c +@@ -0,0 +1,163 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2014, D. R. Commander. All Rights Reserved. ++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. ++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. 
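++ *
++ * NOTE: The F_x_xxx constants below are Q16 fixed point,
++ * FIX(x) = x * 65536 rounded to the nearest integer
++ * (e.g. FIX(0.29900) = 19595).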
++ */ ++ ++/* RGB --> YCC CONVERSION */ ++ ++#include "jsimd_e2k.h" ++ ++#define F_0_081 5329 /* FIX(0.08131) */ ++#define F_0_114 7471 /* FIX(0.11400) */ ++#define F_0_168 11059 /* FIX(0.16874) */ ++#define F_0_250 16384 /* FIX(0.25000) */ ++#define F_0_299 19595 /* FIX(0.29900) */ ++#define F_0_331 21709 /* FIX(0.33126) */ ++#define F_0_418 27439 /* FIX(0.41869) */ ++#define F_0_500 32768 /* FIX(0.50000) */ ++#define F_0_587 38470 /* FIX(0.58700) */ ++#define F_0_337 (F_0_587 - F_0_250) /* FIX(0.58700) - FIX(0.25000) */ ++#define F_0_413 (65536 - F_0_587) /* FIX(1.00000) - FIX(0.58700) */ ++ ++#define SCALEBITS 16 ++#define ONE_HALF (1 << (SCALEBITS - 1)) ++ ++#define RGBG_INDEX_(name, color, i, x) \ ++ name##_##color + i * name##_PIXELSIZE + x, \ ++ name##_GREEN + i * name##_PIXELSIZE + x ++#define RGBG_INDEX(name, x) \ ++ RGBG_INDEX_(name, RED, 0, x), RGBG_INDEX_(name, RED, 1, x), \ ++ RGBG_INDEX_(name, RED, 2, x), RGBG_INDEX_(name, RED, 3, x), \ ++ RGBG_INDEX_(name, BLUE, 0, x), RGBG_INDEX_(name, BLUE, 1, x), \ ++ RGBG_INDEX_(name, BLUE, 2, x), RGBG_INDEX_(name, BLUE, 3, x) ++ ++static const uint8_t __attribute__((aligned(16))) ++rgb_ycc_shuf_const[7][32] = { ++ { RGBG_INDEX(RGB, 0), RGBG_INDEX(RGB, 4) }, ++ { RGBG_INDEX(EXT_RGB, 0), RGBG_INDEX(EXT_RGB, 4) }, ++ { RGBG_INDEX(EXT_RGBX, 0), RGBG_INDEX(EXT_RGBX, 4) }, ++ { RGBG_INDEX(EXT_BGR, 0), RGBG_INDEX(EXT_BGR, 4) }, ++ { RGBG_INDEX(EXT_BGRX, 0), RGBG_INDEX(EXT_BGRX, 4) }, ++ { RGBG_INDEX(EXT_XBGR, 0), RGBG_INDEX(EXT_XBGR, 4) }, ++ { RGBG_INDEX(EXT_XRGB, 0), RGBG_INDEX(EXT_XRGB, 4) } ++}; ++ ++ /* rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3 ++ * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7 ++ * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb ++ * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf ++ * ++ * rg0 = R0 G0 R1 G1 R2 G2 R3 G3 ++ * bg0 = B0 G0 B1 G1 B2 G2 B3 G3 ++ * ... 
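++ *
++ * Pairing each R (or B) sample with the G sample of the same pixel lets
++ * _mm_madd_epi16 form R*c0 + G*c1 and B*c0 + G*c1 in one multiply-add;
++ * that is why the green weight is split below as
++ * 0.58700 = 0.33700 + 0.25000.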
++ */ ++ ++ /* (Original) ++ * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B ++ * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE ++ * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE ++ * ++ * (This implementation) ++ * Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G ++ * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE ++ * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE ++ */ ++ ++#define CALC_Y(outptr0) \ ++ rg0 = _mm_unpacklo_epi8(rgbg0, pb_zero); \ ++ bg0 = _mm_unpackhi_epi8(rgbg0, pb_zero); \ ++ rg1 = _mm_unpacklo_epi8(rgbg1, pb_zero); \ ++ bg1 = _mm_unpackhi_epi8(rgbg1, pb_zero); \ ++ rg2 = _mm_unpacklo_epi8(rgbg2, pb_zero); \ ++ bg2 = _mm_unpackhi_epi8(rgbg2, pb_zero); \ ++ rg3 = _mm_unpacklo_epi8(rgbg3, pb_zero); \ ++ bg3 = _mm_unpackhi_epi8(rgbg3, pb_zero); \ ++ \ ++ /* Calculate Y values */ \ ++ y0 = _mm_add_epi32(_mm_madd_epi16(rg0, pw_f0299_f0337), pd_onehalf); \ ++ y1 = _mm_add_epi32(_mm_madd_epi16(rg1, pw_f0299_f0337), pd_onehalf); \ ++ y2 = _mm_add_epi32(_mm_madd_epi16(rg2, pw_f0299_f0337), pd_onehalf); \ ++ y3 = _mm_add_epi32(_mm_madd_epi16(rg3, pw_f0299_f0337), pd_onehalf); \ ++ y0 = _mm_add_epi32(_mm_madd_epi16(bg0, pw_f0114_f0250), y0); \ ++ y1 = _mm_add_epi32(_mm_madd_epi16(bg1, pw_f0114_f0250), y1); \ ++ y2 = _mm_add_epi32(_mm_madd_epi16(bg2, pw_f0114_f0250), y2); \ ++ y3 = _mm_add_epi32(_mm_madd_epi16(bg3, pw_f0114_f0250), y3); \ ++ \ ++ yl = _mm_packhi_epi32(y0, y1); \ ++ yh = _mm_packhi_epi32(y2, y3); \ ++ y = _mm_packus_epi16(yl, yh); \ ++ VEC_ST(outptr0, y); ++ ++#define CALC_CC(outptr1, outptr2) \ ++ /* Calculate Cb values */ \ ++ cb0 = _mm_add_epi32(_mm_madd_epi16(rg0, pw_mf016_mf033), pd_onehalfm1_cj); \ ++ cb1 = _mm_add_epi32(_mm_madd_epi16(rg1, pw_mf016_mf033), pd_onehalfm1_cj); \ ++ cb2 = _mm_add_epi32(_mm_madd_epi16(rg2, pw_mf016_mf033), pd_onehalfm1_cj); \ ++ cb3 = _mm_add_epi32(_mm_madd_epi16(rg3, pw_mf016_mf033), pd_onehalfm1_cj); \ ++ cb0 = _mm_sub_epi32(cb0, _mm_madd_epi16(bg0, pw_mf050_f000)); \ ++ cb1 = _mm_sub_epi32(cb1, _mm_madd_epi16(bg1, pw_mf050_f000)); \ ++ cb2 = _mm_sub_epi32(cb2, _mm_madd_epi16(bg2, pw_mf050_f000)); \ ++ cb3 = _mm_sub_epi32(cb3, _mm_madd_epi16(bg3, pw_mf050_f000)); \ ++ \ ++ cbl = _mm_packhi_epi32(cb0, cb1); \ ++ cbh = _mm_packhi_epi32(cb2, cb3); \ ++ cb = _mm_packus_epi16(cbl, cbh); \ ++ VEC_ST(outptr1, cb); \ ++ \ ++ /* Calculate Cr values */ \ ++ cr0 = _mm_add_epi32(_mm_madd_epi16(bg0, pw_mf008_mf041), pd_onehalfm1_cj); \ ++ cr1 = _mm_add_epi32(_mm_madd_epi16(bg1, pw_mf008_mf041), pd_onehalfm1_cj); \ ++ cr2 = _mm_add_epi32(_mm_madd_epi16(bg2, pw_mf008_mf041), pd_onehalfm1_cj); \ ++ cr3 = _mm_add_epi32(_mm_madd_epi16(bg3, pw_mf008_mf041), pd_onehalfm1_cj); \ ++ cr0 = _mm_sub_epi32(cr0, _mm_madd_epi16(rg0, pw_mf050_f000)); \ ++ cr1 = _mm_sub_epi32(cr1, _mm_madd_epi16(rg1, pw_mf050_f000)); \ ++ cr2 = _mm_sub_epi32(cr2, _mm_madd_epi16(rg2, pw_mf050_f000)); \ ++ cr3 = _mm_sub_epi32(cr3, _mm_madd_epi16(rg3, pw_mf050_f000)); \ ++ \ ++ crl = _mm_packhi_epi32(cr0, cr1); \ ++ crh = _mm_packhi_epi32(cr2, cr3); \ ++ cr = _mm_packus_epi16(crl, crh); \ ++ VEC_ST(outptr2, cr); ++ ++ ++#define PIXELSIZE 3 ++#define RGB_SHUFFLE \ ++ rgbg0 = _mm_shuffle_epi8(rgb0, pb_shuf0); \ ++ rgbg1 = _mm_shuffle_epi8(VEC_ALIGNR8(rgb1, rgb0), pb_shuf4); \ ++ rgbg2 = _mm_shuffle_epi8(VEC_ALIGNR8(rgb2, rgb1), pb_shuf0); \ ++ rgbg3 = _mm_shuffle_epi8(rgb2, pb_shuf4); ++ ++#define rgbn_ycc_convert jsimd_rgb3_ycc_convert_e2k ++#define rgbn_gray_convert jsimd_rgb3_gray_convert_e2k ++#include 
"jccolext-e2k.c" ++ ++ ++#define PIXELSIZE 4 ++#define RGB_SHUFFLE \ ++ rgbg0 = _mm_shuffle_epi8(rgb0, pb_shuf0); \ ++ rgbg1 = _mm_shuffle_epi8(rgb1, pb_shuf0); \ ++ rgbg2 = _mm_shuffle_epi8(rgb2, pb_shuf0); \ ++ rgbg3 = _mm_shuffle_epi8(rgb3, pb_shuf0); ++ ++#define rgbn_ycc_convert jsimd_rgb4_ycc_convert_e2k ++#define rgbn_gray_convert jsimd_rgb4_gray_convert_e2k ++#include "jccolext-e2k.c" ++ +diff --git a/simd/e2k/jchuff-e2k.c b/simd/e2k/jchuff-e2k.c +new file mode 100644 +index 0000000..ec4329e +--- /dev/null ++++ b/simd/e2k/jchuff-e2k.c +@@ -0,0 +1,307 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2022, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. ++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. ++ * ++ * NOTE: All referenced figures are from ++ * Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994. ++ */ ++ ++/* Encode a single block's worth of coefficients */ ++ ++#include "jsimd_e2k.h" ++ ++#if __SIZEOF_SIZE_T__ != 8 ++#error ++#endif ++ ++typedef unsigned long long bit_buf_type; ++#define BIT_BUF_SIZE 64 ++ ++typedef struct { ++ bit_buf_type put_buffer; /* current bit accumulation buffer */ ++ int free_bits; /* # of bits available in it */ ++ int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */ ++} savable_state; ++ ++typedef struct { ++ JOCTET *next_output_byte; /* => next byte to write in buffer */ ++ size_t free_in_buffer; /* # of byte spaces remaining in buffer */ ++ savable_state cur; /* Current bit buffer & DC state */ ++ j_compress_ptr cinfo; /* dump_buffer needs access to this */ ++ int simd; ++} working_state; ++ ++#define EMIT_BYTE(b) { \ ++ buffer[0] = (JOCTET)(b); \ ++ buffer[1] = 0; \ ++ buffer -= -2 + ((JOCTET)(b) < 0xFF); \ ++} ++ ++#define FLUSH() { \ ++ if (put_buffer & 0x8080808080808080 & ~(put_buffer + 0x0101010101010101)) { \ ++ EMIT_BYTE(put_buffer >> 56) \ ++ EMIT_BYTE(put_buffer >> 48) \ ++ EMIT_BYTE(put_buffer >> 40) \ ++ EMIT_BYTE(put_buffer >> 32) \ ++ EMIT_BYTE(put_buffer >> 24) \ ++ EMIT_BYTE(put_buffer >> 16) \ ++ EMIT_BYTE(put_buffer >> 8) \ ++ EMIT_BYTE(put_buffer ) \ ++ } else { \ ++ *(uint64_t*)buffer = __builtin_bswap64(put_buffer); \ ++ buffer += 8; \ ++ } \ ++} ++ ++#define PUT_AND_FLUSH(code, size) { \ ++ put_buffer = (put_buffer << (size + free_bits)) | (code >> -free_bits); \ ++ FLUSH() \ ++ free_bits += BIT_BUF_SIZE; \ ++ put_buffer = code; \ ++} ++ ++#define PUT_BITS(code, size) { \ ++ free_bits -= size; \ ++ if (free_bits < 0) \ ++ PUT_AND_FLUSH(code, size) \ ++ else \ ++ put_buffer = (put_buffer << size) | code; \ ++} ++ ++#define PUT_CODE(code, size) { \ ++ /* temp &= (((JLONG)1) << nbits) - 1; */ \ ++ /* temp |= code << nbits; */ \ ++ temp = __builtin_e2k_insfd(code, 
__builtin_e2k_insfd(nbits, 6 * 63 + 64, -nbits), temp); \ ++ nbits += size; \ ++ PUT_BITS(temp, nbits) \ ++} ++ ++#define KLOOP_PREPARE(mask, i) \ ++ t0 = _mm_cmpeq_epi8(_mm_packs_epi16(v0, v1), zero); \ ++ t1 = _mm_cmpeq_epi8(_mm_packs_epi16(v2, v3), zero); \ ++ mask = (uint32_t)(_mm_movemask_epi8(t0) | _mm_movemask_epi8(t1) << 16); \ ++ t0 = _mm_add_epi16(_mm_srai_epi16(v0, 15), v0); \ ++ t1 = _mm_add_epi16(_mm_srai_epi16(v1, 15), v1); \ ++ t2 = _mm_add_epi16(_mm_srai_epi16(v2, 15), v2); \ ++ t3 = _mm_add_epi16(_mm_srai_epi16(v3, 15), v3); \ ++ v0 = _mm_abs_epi16(v0); \ ++ v1 = _mm_abs_epi16(v1); \ ++ v2 = _mm_abs_epi16(v2); \ ++ v3 = _mm_abs_epi16(v3); \ ++ VEC_ST(block_nbits + i, v0); \ ++ VEC_ST(block_nbits + i + 8, v1); \ ++ VEC_ST(block_nbits + i + 16, v2); \ ++ VEC_ST(block_nbits + i + 24, v3); \ ++ VEC_ST(block_diff + i, t0); \ ++ VEC_ST(block_diff + i + 8, t1); \ ++ VEC_ST(block_diff + i + 16, t2); \ ++ VEC_ST(block_diff + i + 24, t3); ++ ++#define SHUF16X4(a, b, c, d) _mm_setr_pi8( \ ++ a * 2, a * 2 + 1, b * 2, b * 2 + 1, c * 2, c * 2 + 1, d * 2, d * 2 + 1) ++#define VEC_COMBINE(h0, h1) _mm_unpacklo_epi64( \ ++ _mm_movpi64_epi64(h0), _mm_movpi64_epi64(h1)) ++#define INSFI_M64(a, b, c, d) _mm_cvtsi64_m64(__builtin_e2k_insfd( \ ++ _mm_cvtm64_si64(a), (b & 63) | (d & 63) << 6, _mm_cvtm64_si64(c))) ++ ++GLOBAL(JOCTET *) ++jsimd_huff_encode_one_block_e2k(void *state, JOCTET *buffer, ++ JCOEFPTR block, int last_dc_val, ++ c_derived_tbl *dctbl, c_derived_tbl *actbl) { ++ uint64_t temp, nbits; ++ uint64_t i, r, code, size; ++ uint64_t code_0xf0 = actbl->ehufco[0xf0]; ++ uint64_t size_0xf0 = actbl->ehufsi[0xf0]; ++ ++ working_state *state_ptr = (working_state*)state; ++ bit_buf_type put_buffer = state_ptr->cur.put_buffer; ++ int64_t free_bits = state_ptr->cur.free_bits; ++ ++ __m128i zero = _mm_setzero_si128(); ++ __m128i v0, v1, v2, v3, t0, t1, t2, t3; ++ int64_t mask, mask1; ++ uint16_t __attribute__((aligned(16))) block_nbits[DCTSIZE2]; ++ int16_t __attribute__((aligned(16))) block_diff[DCTSIZE2]; ++ ++#if 1 /* faster this way */ ++ { ++ __m64 d0l, d0h, d1l, d1h, d2l, d2h, d3l, d3h; ++ __m64 d4l, d4h, d5l, d5h, d6l, d6h, d7l, d7h; ++ __m64 h0, h1, h2, h3, r0, r1, c1256 = SHUF16X4(1, 2, 5, 6); ++ ++ d0l = *(__m64*)(block + 8 * 0); d0h = *(__m64*)(block + 8 * 0 + 4); // 0 4 ++ d1l = *(__m64*)(block + 8 * 1); d1h = *(__m64*)(block + 8 * 1 + 4); // 8 12 ++ d2l = *(__m64*)(block + 8 * 2); d2h = *(__m64*)(block + 8 * 2 + 4); // 16 20 ++ d3l = *(__m64*)(block + 8 * 3); d3h = *(__m64*)(block + 8 * 3 + 4); // 24 28 ++ d4l = *(__m64*)(block + 8 * 4); d4h = *(__m64*)(block + 8 * 4 + 4); // 32 36 ++ d5l = *(__m64*)(block + 8 * 5); d5h = *(__m64*)(block + 8 * 5 + 4); // 40 44 ++ d6l = *(__m64*)(block + 8 * 6); d6h = *(__m64*)(block + 8 * 6 + 4); // 48 52 ++ d7l = *(__m64*)(block + 8 * 7); d7h = *(__m64*)(block + 8 * 7 + 4); // 56 60 ++ ++ // d0l[0] d0l[1] d1l[0] d2l[0] ++ // d1l[1] d0l[2] d0l[3] d1l[2] ++ h0 = _mm_unpacklo_pi16(d1l, d2l); ++ r0 = _mm_unpacklo_pi32(d0l, h0); ++ r1 = _mm_shuffle2_pi8(d1l, d0l, SHUF16X4(1, 6, 7, 2)); ++ r0 = _mm_sub_pi16(r0, _mm_cvtsi64_m64((uint16_t)last_dc_val)); ++ v0 = VEC_COMBINE(r0, r1); ++ ++ // d2l[1] d3l[0] d4l[0] d3l[1] ++ // d2l[2] d1l[3] d0h[0] d0h[1] ++ h0 = _mm_srli_si64(_mm_unpacklo_pi32(d2l, d4l), 16); ++ h2 = INSFI_M64(d1l, 0, d2l, 48); ++ r0 = _mm_unpacklo_pi16(h0, d3l); ++ r1 = _mm_alignr_pi8(d0h, h2, 4); ++ v1 = VEC_COMBINE(r0, r1); ++ ++ // d1h[0] d2l[3] d3l[2] d4l[1] ++ // d5l[0] d6l[0] d5l[1] d4l[2] ++ h0 = INSFI_M64(d2l, 32, d1h, 16); ++ h1 = 
INSFI_M64(d4l, -32, d3l, 48); ++ h2 = INSFI_M64(d4l, 16, d6l, 16); ++ r0 = INSFI_M64(h1, 0, h0, 32); ++ r1 = _mm_unpacklo_pi16(d5l, h2); ++ v2 = VEC_COMBINE(r0, r1); ++ ++ // d3l[3] d2h[0] d1h[1] d0h[2] ++ // d0h[3] d1h[2] d2h[1] d3h[0] ++ h0 = _mm_alignr_pi8(d2h, d3l, 6); ++ h1 = INSFI_M64(d0h, 0, d1h, 32); ++ h2 = _mm_unpackhi_pi32(d0h, d1h); ++ h3 = _mm_unpacklo_pi32(d2h, d3h); ++ r0 = INSFI_M64(h1, -16, h0, 32); ++ r1 = _mm_shuffle2_pi8(h2, h3, c1256); ++ v3 = VEC_COMBINE(r0, r1); ++ ++ KLOOP_PREPARE(mask, 0) ++ ++ // d4l[3] d5l[2] d6l[1] d7l[0] ++ // d7l[1] d6l[2] d5l[3] d4h[0] ++ h0 = _mm_unpackhi_pi32(d4l, d5l); ++ h1 = _mm_unpacklo_pi32(d6l, d7l); ++ h2 = INSFI_M64(d6l, 0, d7l, 32); ++ h2 = INSFI_M64(d5l, 0, h2, 48); ++ r0 = _mm_shuffle2_pi8(h0, h1, c1256); ++ r1 = _mm_alignr_pi8(d4h, h2, 2); ++ v0 = VEC_COMBINE(r0, r1); ++ ++ // d3h[1] d2h[2] d1h[3] d2h[3] ++ // d3h[2] d4h[1] d5h[0] d6l[3] ++ h0 = _mm_slli_si64(INSFI_M64(d1h, 16, d3h, 32), 16); ++ h2 = INSFI_M64(d4h, -32, d3h, 48); ++ h3 = INSFI_M64(d6l, 32, d5h, 16); ++ r0 = _mm_unpackhi_pi16(h0, d2h); ++ r1 = _mm_alignr_pi8(h3, h2, 4); ++ v1 = VEC_COMBINE(r0, r1); ++ ++ // d7l[2] d7l[3] d6h[0] d5h[1] ++ // d4h[2] d3h[3] d4h[3] d5h[2] ++ h0 = INSFI_M64(d5h, 0, d6h, 16); ++ h2 = _mm_slli_si64(_mm_unpackhi_pi32(d3h, d5h), 16); ++ r0 = _mm_alignr_pi8(h0, d7l, 4); ++ r1 = _mm_unpackhi_pi16(d4h, h2); ++ v2 = VEC_COMBINE(r0, r1); ++ ++ // d6h[1] d7h[0] d7h[1] d6h[2] ++ // d5h[3] d6h[3] d7h[2] d7h[3] ++ h0 = INSFI_M64(d6h, -16, d7h, 32); ++ h2 = _mm_unpackhi_pi16(d5h, d6h); ++ r0 = _mm_shuffle_pi16(h0, 0xd2); ++ r1 = _mm_unpackhi_pi32(h2, d7h); ++ v3 = VEC_COMBINE(r0, r1); ++ } ++#else ++ v0 = _mm_setr_epi16( ++ block[0] - last_dc_val, block[1], block[8], block[16], ++ block[9], block[2], block[3], block[10]); ++ v1 = _mm_setr_epi16( ++ block[17], block[24], block[32], block[25], ++ block[18], block[11], block[4], block[5]); ++ v2 = _mm_setr_epi16( ++ block[12], block[19], block[26], block[33], ++ block[40], block[48], block[41], block[34]); ++ v3 = _mm_setr_epi16( ++ block[27], block[20], block[13], block[6], ++ block[7], block[14], block[21], block[28]); ++ ++ KLOOP_PREPARE(mask, 0) ++ ++ v0 = _mm_setr_epi16( ++ block[35], block[42], block[49], block[56], ++ block[57], block[50], block[43], block[36]); ++ v1 = _mm_setr_epi16( ++ block[29], block[22], block[15], block[23], ++ block[30], block[37], block[44], block[51]); ++ v2 = _mm_setr_epi16( ++ block[58], block[59], block[52], block[45], ++ block[38], block[31], block[39], block[46]); ++ v3 = _mm_setr_epi16( ++ block[53], block[60], block[61], block[54], ++ block[47], block[55], block[62], block[63]); ++#endif ++ ++ KLOOP_PREPARE(mask1, 32) ++ mask |= mask1 << 32; ++ mask = ~mask; ++ ++ /* Encode the DC coefficient difference per section F.1.2.1 */ ++ ++ nbits = block_nbits[0]; ++ temp = block_diff[0]; ++ nbits = nbits ? 
32 - __builtin_clz(nbits) : 0; ++ ++ /* Emit the Huffman-coded symbol for the number of bits */ ++ code = dctbl->ehufco[nbits]; ++ size = dctbl->ehufsi[nbits]; ++ PUT_CODE(code, size) ++ ++ /* Encode the AC coefficients per section F.1.2.2 */ ++ ++ /* e2k doesn't have a tzcnt instruction */ ++ mask = __builtin_e2k_bitrevd(mask) << 1; ++ ++ for (i = 1; mask; i++, mask <<= 1) { ++ r = __builtin_clzll(mask); ++ mask <<= r; ++ i += r; ++ /* if run length > 15, must emit special run-length-16 codes (0xF0) */ ++ while (r > 15) { ++ PUT_BITS(code_0xf0, size_0xf0) ++ r -= 16; ++ } ++ nbits = block_nbits[i]; ++ temp = block_diff[i]; ++ nbits = 32 - __builtin_clz(nbits); ++ /* Emit Huffman symbol for run length / number of bits */ ++ /* r = r << 4 | nbits; */ ++ r = __builtin_e2k_insfd(r, 4 * 63 + 64, nbits); ++ code = actbl->ehufco[r]; ++ size = actbl->ehufsi[r]; ++ PUT_CODE(code, size) ++ } ++ ++ if (i != 64) { ++ PUT_BITS(actbl->ehufco[0], actbl->ehufsi[0]) ++ } ++ ++ state_ptr->cur.put_buffer = put_buffer; ++ state_ptr->cur.free_bits = free_bits; ++ return buffer; ++} +diff --git a/simd/e2k/jcphuff-e2k.c b/simd/e2k/jcphuff-e2k.c +new file mode 100644 +index 0000000..f69afeb +--- /dev/null ++++ b/simd/e2k/jcphuff-e2k.c +@@ -0,0 +1,145 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2022, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. ++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. ++ */ ++ ++#include "jsimd_e2k.h" ++ ++#define X(i) coefs[i] = block[jpeg_natural_order_start[i]]; ++#define Y(i) coefs[i] = i < rem ? 
block[jpeg_natural_order_start[i]] : 0; ++ ++#define LOOP \ ++ for (i = 0; i < Sl >> 4; i++) { \ ++ X(0) X(1) X(2) X(3) X(4) X(5) X(6) X(7) \ ++ X(8) X(9) X(10) X(11) X(12) X(13) X(14) X(15) \ ++ BLOCK16 \ ++ jpeg_natural_order_start += 16; \ ++ } \ ++ rem = Sl & 15; \ ++ if (Sl & 8) { \ ++ X(0) X(1) X(2) X(3) X(4) X(5) X(6) X(7) \ ++ Y(8) Y(9) Y(10) Y(11) Y(12) Y(13) Y(14) \ ++ coefs[15] = 0; \ ++ BLOCK16 \ ++ } else if (rem > 0) { \ ++ Y(0) Y(1) Y(2) Y(3) Y(4) Y(5) Y(6) Y(7) \ ++ BLOCK8 \ ++ } ++ ++void jsimd_encode_mcu_AC_first_prepare_e2k ++ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al, ++ JCOEF *values, size_t *zerobits) ++{ ++ JCOEF *diff = values + DCTSIZE2; ++ int16_t __attribute__((aligned(16))) coefs[16]; ++ __m128i v0, v1, v2, v3; ++ __m128i c0 = _mm_setzero_si128(), shr = _mm_cvtsi32_si128(Al); ++ int i, rem; ++ ++#define BLOCK16 \ ++ v0 = _mm_load_si128((__m128i*)coefs); \ ++ v1 = _mm_load_si128((__m128i*)coefs + 1); \ ++ v2 = _mm_srai_epi16(v0, 15); \ ++ v3 = _mm_srai_epi16(v1, 15); \ ++ v0 = _mm_sra_epi16(_mm_abs_epi16(v0), shr); \ ++ v1 = _mm_sra_epi16(_mm_abs_epi16(v1), shr); \ ++ v2 = _mm_xor_si128(v0, v2); \ ++ v3 = _mm_xor_si128(v1, v3); \ ++ _mm_store_si128((__m128i*)values, v0); \ ++ _mm_store_si128((__m128i*)values + 1, v1); \ ++ _mm_store_si128((__m128i*)diff, v2); \ ++ _mm_store_si128((__m128i*)diff + 1, v3); \ ++ v2 = _mm_cmpeq_epi8(_mm_packus_epi16(v0, v1), c0); \ ++ ((uint16_t*)zerobits)[i] = ~_mm_movemask_epi8(v2); \ ++ values += 16; diff += 16; ++ ++#define BLOCK8 \ ++ v0 = _mm_load_si128((__m128i*)coefs); \ ++ v2 = _mm_srai_epi16(v0, 15); \ ++ v0 = _mm_sra_epi16(_mm_abs_epi16(v0), shr); \ ++ v2 = _mm_xor_si128(v0, v2); \ ++ _mm_store_si128((__m128i*)values, v0); \ ++ _mm_store_si128((__m128i*)diff, v2); \ ++ v2 = _mm_cmpeq_epi8(_mm_packus_epi16(v0, c0), c0); \ ++ ((uint16_t*)zerobits)[i] = ~_mm_movemask_epi8(v2); \ ++ values += 8; diff += 8; ++ ++ ((uint64_t*)zerobits)[0] = 0; ++ LOOP ++#undef BLOCK16 ++#undef BLOCK8 ++ ++ for (i = (64 - Sl) >> 3; i; i--) { ++ _mm_store_si128((__m128i*)values, c0); ++ _mm_store_si128((__m128i*)diff, c0); ++ values += 8; diff += 8; ++ } ++} ++ ++int jsimd_encode_mcu_AC_refine_prepare_e2k ++ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al, ++ JCOEF *absvalues, size_t *bits) ++{ ++ union { uint64_t q; uint16_t w[4]; } mask1 = { 0 }; ++ int16_t __attribute__((aligned(16))) coefs[16]; ++ __m128i v0, v1, v2, c1 = _mm_set1_epi8(1); ++ __m128i c0 = _mm_setzero_si128(), shr = _mm_cvtsi32_si128(Al); ++ int i, rem; ++ ++#define BLOCK16 \ ++ v0 = _mm_load_si128((__m128i*)coefs); \ ++ v1 = _mm_load_si128((__m128i*)coefs + 1); \ ++ v2 = _mm_packs_epi16(v0, v1); \ ++ ((uint16_t*)bits)[4 + i] = ~_mm_movemask_epi8(v2); \ ++ v0 = _mm_sra_epi16(_mm_abs_epi16(v0), shr); \ ++ v1 = _mm_sra_epi16(_mm_abs_epi16(v1), shr); \ ++ v2 = _mm_cmpeq_epi8(_mm_packus_epi16(v0, v1), c0); \ ++ ((uint16_t*)bits)[i] = ~_mm_movemask_epi8(v2); \ ++ _mm_store_si128((__m128i*)absvalues, v0); \ ++ _mm_store_si128((__m128i*)absvalues + 1, v1); \ ++ v2 = _mm_cmpeq_epi8(_mm_packus_epi16(v0, v1), c1); \ ++ mask1.w[i] = _mm_movemask_epi8(v2); \ ++ absvalues += 16; ++ ++#define BLOCK8 \ ++ v0 = _mm_load_si128((__m128i*)coefs); \ ++ v2 = _mm_packs_epi16(v0, c0); \ ++ ((uint16_t*)bits)[4 + i] = ~_mm_movemask_epi8(v2); \ ++ v0 = _mm_sra_epi16(_mm_abs_epi16(v0), shr); \ ++ v2 = _mm_cmpeq_epi8(_mm_packus_epi16(v0, c0), c0); \ ++ ((uint16_t*)bits)[i] = ~_mm_movemask_epi8(v2); \ ++ _mm_store_si128((__m128i*)absvalues, v0); \ ++ 
v2 = _mm_cmpeq_epi8(_mm_packus_epi16(v0, c0), c1); \ ++ mask1.w[i] = _mm_movemask_epi8(v2); \ ++ absvalues += 8; ++ ++ ((uint64_t*)bits)[0] = 0; /* zero */ ++ ((uint64_t*)bits)[1] = 0; /* sign */ ++ LOOP ++#undef BLOCK16 ++#undef BLOCK8 ++ ++ for (i = (64 - Sl) >> 3; i; i--) { ++ _mm_store_si128((__m128i*)absvalues, c0); ++ absvalues += 8; ++ } ++ ++ return 63 - __builtin_clzll(mask1.q | 1); ++} +diff --git a/simd/e2k/jcsample-e2k.c b/simd/e2k/jcsample-e2k.c +new file mode 100644 +index 0000000..cac8897 +--- /dev/null ++++ b/simd/e2k/jcsample-e2k.c +@@ -0,0 +1,203 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2015, D. R. Commander. All Rights Reserved. ++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. ++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. ++ */ ++ ++/* CHROMA DOWNSAMPLING */ ++ ++#include "jsimd_e2k.h" ++#include "jcsample.h" ++ ++void jsimd_h2v1_downsample_e2k(JDIMENSION image_width, ++ int max_v_samp_factor, ++ JDIMENSION v_samp_factor, ++ JDIMENSION width_in_blocks, ++ JSAMPARRAY input_data, ++ JSAMPARRAY output_data) ++{ ++ int outcol; ++ JDIMENSION output_cols = width_in_blocks * DCTSIZE, outrow; ++ JSAMPROW inptr, outptr; ++ ++ __m128i this0, next0, out; ++ __m128i this0e, this0o, next0e, next0o, outl, outh; ++ ++ /* Constants */ ++ __m128i pw_bias = _mm_set1_epi32(1 << 16), ++ even_mask = _mm_set1_epi16(255); ++#ifdef NEED_ALIGN8 ++ ALIGN8_COMMON ++ ALIGN8_VARS(src) ++#endif ++ ++ expand_right_edge(input_data, max_v_samp_factor, ++ image_width, output_cols * 2); ++ ++ if (output_cols > 0) ++ for (outrow = 0; outrow < v_samp_factor; outrow++) { ++ outptr = output_data[outrow]; ++ inptr = input_data[outrow]; ++ ++#ifdef NEED_ALIGN8 ++ ALIGN8_START(inptr, src) ++#endif ++ PRAGMA_E2K("ivdep") ++ for (outcol = output_cols; outcol > 8; ++ outcol -= 16, outptr += 16) { ++ ++#ifdef NEED_ALIGN8 ++ ALIGN8_READ16(this0, src, 0) ++ ALIGN8_READ16(next0, src, 1) ++ src_ptr += 4; ++#else ++ this0 = VEC_LD(inptr); ++ next0 = VEC_LD(inptr + 16); ++ inptr += 32; ++#endif ++ this0e = _mm_and_si128(this0, even_mask); ++ this0o = _mm_srli_epi16(this0, 8); ++ outl = _mm_add_epi16(this0e, this0o); ++ outl = _mm_srli_epi16(_mm_add_epi16(outl, pw_bias), 1); ++ next0e = _mm_and_si128(next0, even_mask); ++ next0o = _mm_srli_epi16(next0, 8); ++ outh = _mm_add_epi16(next0e, next0o); ++ outh = _mm_srli_epi16(_mm_add_epi16(outh, pw_bias), 1); ++ ++ out = _mm_packus_epi16(outl, outh); ++ VEC_ST(outptr, out); ++ } ++ if (outcol > 0) { ++#ifdef NEED_ALIGN8 ++ ALIGN8_READ16(this0, src, 0) ++#else ++ this0 = VEC_LD(inptr); ++#endif ++ this0e = _mm_and_si128(this0, even_mask); ++ this0o = _mm_srli_epi16(this0, 8); ++ outl = _mm_add_epi16(this0e, 
this0o); ++ outl = _mm_srli_epi16(_mm_add_epi16(outl, pw_bias), 1); ++ ++ out = _mm_packus_epi16(outl, outl); ++ VEC_STL(outptr, out); ++ } ++ } ++} ++ ++ ++void jsimd_h2v2_downsample_e2k(JDIMENSION image_width, int max_v_samp_factor, ++ JDIMENSION v_samp_factor, ++ JDIMENSION width_in_blocks, ++ JSAMPARRAY input_data, JSAMPARRAY output_data) ++{ ++ int outcol; ++ JDIMENSION output_cols = width_in_blocks * DCTSIZE, outrow; ++ JSAMPROW inptr0, inptr1, outptr; ++ ++ __m128i this0, next0, this1, next1, out; ++ __m128i this0e, this0o, next0e, next0o, this1e, this1o, ++ next1e, next1o, out0l, out0h, out1l, out1h, outl, outh; ++ ++ /* Constants */ ++ __m128i pw_bias = _mm_set1_epi32(1 | 2 << 16), ++ even_mask = _mm_set1_epi16(255); ++#ifdef NEED_ALIGN8 ++ ALIGN8_COMMON ++ ALIGN8_VARS(src0) ++ ALIGN8_VARS(src1) ++#endif ++ ++ expand_right_edge(input_data, max_v_samp_factor, ++ image_width, output_cols * 2); ++ ++ if (output_cols > 0) ++ for (outrow = 0; outrow < v_samp_factor; outrow++) { ++ inptr0 = input_data[outrow * 2]; ++ inptr1 = input_data[outrow * 2 + 1]; ++ outptr = output_data[outrow]; ++ ++#ifdef NEED_ALIGN8 ++ ALIGN8_START(inptr0, src0) ++ ALIGN8_START(inptr1, src1) ++#endif ++ PRAGMA_E2K("ivdep") ++ for (outcol = output_cols; outcol > 8; ++ outcol -= 16, outptr += 16) { ++ ++#ifdef NEED_ALIGN8 ++ ALIGN8_READ16(this0, src0, 0) src0_ptr += 2; ++ ALIGN8_READ16(this1, src1, 0) src1_ptr += 2; ++#else ++ this0 = VEC_LD(inptr0); inptr0 += 16; ++ this1 = VEC_LD(inptr1); inptr1 += 16; ++#endif ++ this0e = _mm_and_si128(this0, even_mask); ++ this1e = _mm_and_si128(this1, even_mask); ++ this0o = _mm_srli_epi16(this0, 8); ++ this1o = _mm_srli_epi16(this1, 8); ++ out0l = _mm_add_epi16(this0e, this0o); ++ out1l = _mm_add_epi16(this1e, this1o); ++ ++ outl = _mm_add_epi16(out0l, out1l); ++ outl = _mm_srli_epi16(_mm_add_epi16(outl, pw_bias), 2); ++ ++#ifdef NEED_ALIGN8 ++ ALIGN8_READ16(next0, src0, 0) src0_ptr += 2; ++ ALIGN8_READ16(next1, src1, 0) src1_ptr += 2; ++#else ++ next0 = VEC_LD(inptr0); inptr0 += 16; ++ next1 = VEC_LD(inptr1); inptr1 += 16; ++#endif ++ next0e = _mm_and_si128(next0, even_mask); ++ next1e = _mm_and_si128(next1, even_mask); ++ next0o = _mm_srli_epi16(next0, 8); ++ next1o = _mm_srli_epi16(next1, 8); ++ out0h = _mm_add_epi16(next0e, next0o); ++ out1h = _mm_add_epi16(next1e, next1o); ++ ++ outh = _mm_add_epi16(out0h, out1h); ++ outh = _mm_srli_epi16(_mm_add_epi16(outh, pw_bias), 2); ++ ++ out = _mm_packus_epi16(outl, outh); ++ VEC_ST(outptr, out); ++ } ++ if (outcol > 0) { ++#ifdef NEED_ALIGN8 ++ ALIGN8_READ16(this0, src0, 0) ++ ALIGN8_READ16(this1, src1, 0) ++#else ++ this0 = VEC_LD(inptr0); ++ this1 = VEC_LD(inptr1); ++#endif ++ this0e = _mm_and_si128(this0, even_mask); ++ this1e = _mm_and_si128(this1, even_mask); ++ this0o = _mm_srli_epi16(this0, 8); ++ this1o = _mm_srli_epi16(this1, 8); ++ out0l = _mm_add_epi16(this0e, this0o); ++ out1l = _mm_add_epi16(this1e, this1o); ++ ++ outl = _mm_add_epi16(out0l, out1l); ++ outl = _mm_srli_epi16(_mm_add_epi16(outl, pw_bias), 2); ++ ++ out = _mm_packus_epi16(outl, outl); ++ VEC_STL(outptr, out); ++ } ++ } ++} +diff --git a/simd/e2k/jcsample.h b/simd/e2k/jcsample.h +new file mode 100644 +index 0000000..2ac4816 +--- /dev/null ++++ b/simd/e2k/jcsample.h +@@ -0,0 +1,28 @@ ++/* ++ * jcsample.h ++ * ++ * This file was part of the Independent JPEG Group's software: ++ * Copyright (C) 1991-1996, Thomas G. Lane. ++ * For conditions of distribution and use, see the accompanying README.ijg ++ * file. 
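++ *
++ * expand_right_edge() below replicates the last sample of each row out
++ * to output_cols, so the 16-byte vector loads in jcsample-e2k.c never
++ * average in bytes from beyond the image edge.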
++ */ ++ ++LOCAL(void) ++expand_right_edge(JSAMPARRAY image_data, int num_rows, JDIMENSION input_cols, ++ JDIMENSION output_cols) ++{ ++ register JSAMPROW ptr; ++ register JSAMPLE pixval; ++ register int count; ++ int row; ++ int numcols = (int)(output_cols - input_cols); ++ ++ if (numcols > 0) { ++ for (row = 0; row < num_rows; row++) { ++ ptr = image_data[row] + input_cols; ++ pixval = ptr[-1]; /* don't need GETJSAMPLE() here */ ++ for (count = numcols; count > 0; count--) ++ *ptr++ = pixval; ++ } ++ } ++} +diff --git a/simd/e2k/jdcolext-e2k.c b/simd/e2k/jdcolext-e2k.c +new file mode 100644 +index 0000000..4f12aef +--- /dev/null ++++ b/simd/e2k/jdcolext-e2k.c +@@ -0,0 +1,258 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2015, D. R. Commander. All Rights Reserved. ++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. ++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. ++ */ ++ ++/* This file is included by jdcolor-e2k.c */ ++ ++void ycc_rgbn_convert(JDIMENSION out_width, JSAMPIMAGE input_buf, ++ JDIMENSION input_row, JSAMPARRAY output_buf, ++ int num_rows, int shuf_idx) ++{ ++ JSAMPROW outptr, inptr0, inptr1, inptr2; ++ uint8_t __attribute__((aligned(16))) tmpbuf[PIXELSIZE * 16]; ++ ++ __m128i rgb0, rgb1, rgb2, rgb3, y, cb, cr; ++ __m128i rg0, rg1, bx0, bx1, yl, yh, cbl, cbh, ++ crl, crh, rl, rh, gl, gh, bl, bh, g0w, g1w, g2w, g3w; ++ __m128i g0, g1, g2, g3; ++ ++ /* Constants ++ * NOTE: The >> 1 is to compensate for the fact that vec_madds() returns 17 ++ * high-order bits, not 16. 
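++ * With _mm_mulhrs_epi16 the product is (a * b + 0x4000) >> 15, so the
++ * same compensation applies: halving the Q16 constants preserves the
++ * intended coefficient/65536 scaling.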
++ */ ++ __m128i pw_f0402 = _mm_set1_epi16(F_0_402 >> 1), ++ pw_mf0228 = _mm_set1_epi16(-F_0_228 >> 1), ++ pw_mf0344_f0285 = _mm_setr_epi16(__4X2(-F_0_344, F_0_285)), ++ pb_255 = _mm_set1_epi8(-1), ++ pw_cj = _mm_set1_epi16(CENTERJSAMPLE), ++ pd_onehalf = _mm_set1_epi32(ONE_HALF), ++ pb_zero = _mm_setzero_si128(); ++ RGB_SHUFFLE_INIT ++#ifdef NEED_ALIGN8 ++ ALIGN8_COMMON ++ ALIGN8_VARS(src0) ++ ALIGN8_VARS(src1) ++ ALIGN8_VARS(src2) ++#endif ++ ++ if (out_width > 0) ++ while (--num_rows >= 0) { ++ int num_cols; ++ inptr0 = input_buf[0][input_row]; ++ inptr1 = input_buf[1][input_row]; ++ inptr2 = input_buf[2][input_row]; ++ input_row++; ++ outptr = *output_buf++; ++ ++ if (out_width >= 16) { ++#ifdef NEED_ALIGN8 ++ ALIGN8_START(inptr0, src0) ++ ALIGN8_START(inptr1, src1) ++ ALIGN8_START(inptr2, src2) ++ inptr0 += out_width & -16; ++ inptr1 += out_width & -16; ++ inptr2 += out_width & -16; ++#endif ++ PRAGMA_E2K("ivdep") ++ for (num_cols = out_width; num_cols >= 16; ++ num_cols -= 16, outptr += PIXELSIZE * 16) { ++#ifdef NEED_ALIGN8 ++ ALIGN8_READ16(y, src0, 0) src0_ptr += 2; ++ ALIGN8_READ16(cb, src1, 0) src1_ptr += 2; ++ ALIGN8_READ16(cr, src2, 0) src2_ptr += 2; ++#else ++ y = VEC_LD(inptr0); inptr0 += 16; ++ cb = VEC_LD(inptr1); inptr1 += 16; ++ cr = VEC_LD(inptr2); inptr2 += 16; ++#endif ++ CALC_RGB ++ RGB_SHUFFLE ++ VEC_ST(outptr, rgb0); ++ VEC_ST(outptr + 16, rgb1); ++ VEC_ST(outptr + 32, rgb2); ++#if PIXELSIZE == 4 ++ VEC_ST(outptr + 48, rgb3); ++#endif ++ } ++ } ++ ++ num_cols = out_width & 15; ++ if (num_cols) { ++ int i; ++ for (i = 0; i < num_cols; i++) { ++ tmpbuf[i] = inptr0[i]; ++ tmpbuf[i + 16] = inptr1[i]; ++ tmpbuf[i + 32] = inptr2[i]; ++ } ++ y = VEC_LD(tmpbuf); ++ cb = VEC_LD(tmpbuf + 16); ++ cr = VEC_LD(tmpbuf + 32); ++ CALC_RGB ++ RGB_SHUFFLE ++ VEC_ST(tmpbuf, rgb0); ++ VEC_ST(tmpbuf + 16, rgb1); ++ VEC_ST(tmpbuf + 32, rgb2); ++#if PIXELSIZE == 4 ++ VEC_ST(tmpbuf + 48, rgb3); ++#endif ++ memcpy(outptr, tmpbuf, num_cols * PIXELSIZE); ++ } ++ } ++} ++ ++void ycc_rgbn_merged(JDIMENSION out_width, JSAMPIMAGE input_buf, ++ JDIMENSION in_row_group_ctr, ++ JDIMENSION in_row_group_ctr_y, ++ JSAMPARRAY output_buf, int shuf_idx) ++{ ++ JSAMPROW outptr, inptr0, inptr1, inptr2; ++ int num_cols; ++ uint8_t __attribute__((aligned(16))) tmpbuf[4 * 16]; ++ ++ __m128i rgb0, rgb1, rgb2, rgb3, y, cb, cr; ++ __m128i rg0, rg1, bx0, bx1, yl, yh, cbl, cbh, ++ crl, crh, r_yl, r_yh, g_yl, g_yh, b_yl, b_yh, g_y0w, g_y1w, g_y2w, g_y3w, ++ rl, rh, gl, gh, bl, bh; ++ __m128i g_y0, g_y1, g_y2, g_y3; ++ ++ /* Constants ++ * NOTE: The >> 1 is to compensate for the fact that vec_madds() returns 17 ++ * high-order bits, not 16. 
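++ *
++ * CALC_MERGED1 evaluates the Cb/Cr contributions once per 16 chroma
++ * samples; CALC_MERGED2 then duplicates each 16-bit term with
++ * _mm_unpacklo/hi_epi16(x, x) so it feeds both horizontally adjacent
++ * Y samples.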
++ */ ++ __m128i pw_f0402 = _mm_set1_epi16(F_0_402 >> 1), ++ pw_mf0228 = _mm_set1_epi16(-F_0_228 >> 1), ++ pw_mf0344_f0285 = _mm_setr_epi16(__4X2(-F_0_344, F_0_285)), ++ pb_255 = _mm_set1_epi8(-1), ++ pw_cj = _mm_set1_epi16(CENTERJSAMPLE), ++ pd_onehalf = _mm_set1_epi32(ONE_HALF), ++ pb_zero = _mm_setzero_si128(); ++ RGB_SHUFFLE_INIT ++#ifdef NEED_ALIGN8 ++ ALIGN8_COMMON ++ ALIGN8_VARS(src0) ++ ALIGN8_VARS(src1) ++ ALIGN8_VARS(src2) ++#endif ++ ++ inptr0 = input_buf[0][in_row_group_ctr_y]; ++ inptr1 = input_buf[1][in_row_group_ctr]; ++ inptr2 = input_buf[2][in_row_group_ctr]; ++ outptr = output_buf[0]; ++ ++ if (out_width >= 32) { ++#ifdef NEED_ALIGN8 ++ ALIGN8_START(inptr0, src0) ++ ALIGN8_START(inptr1, src1) ++ ALIGN8_START(inptr2, src2) ++ inptr0 += out_width & -32; ++ inptr1 += (out_width & -32) >> 1; ++ inptr2 += (out_width & -32) >> 1; ++#endif ++ PRAGMA_E2K("ivdep") ++ for (num_cols = out_width; num_cols >= 32; num_cols -= 32) { ++ ++#ifdef NEED_ALIGN8 ++ ALIGN8_READ16(cb, src1, 0) src1_ptr += 2; ++ ALIGN8_READ16(cr, src2, 0) src2_ptr += 2; ++#else ++ cb = VEC_LD(inptr1); inptr1 += 16; ++ cr = VEC_LD(inptr2); inptr2 += 16; ++#endif ++ CALC_MERGED1 ++ ++#ifdef NEED_ALIGN8 ++ ALIGN8_READ16(y, src0, 0) src0_ptr += 2; ++#else ++ y = VEC_LD(inptr0); inptr0 += 16; ++#endif ++ CALC_MERGED2(r_yl, g_yl, b_yl) ++ RGB_SHUFFLE ++ VEC_ST(outptr, rgb0); ++ VEC_ST(outptr + 16, rgb1); ++ VEC_ST(outptr + 32, rgb2); ++#if PIXELSIZE == 4 ++ VEC_ST(outptr + 48, rgb3); ++#endif ++ outptr += PIXELSIZE * 16; ++ ++#ifdef NEED_ALIGN8 ++ ALIGN8_READ16(y, src0, 0) src0_ptr += 2; ++#else ++ y = VEC_LD(inptr0); inptr0 += 16; ++#endif ++ CALC_MERGED2(r_yh, g_yh, b_yh) ++ RGB_SHUFFLE ++ VEC_ST(outptr, rgb0); ++ VEC_ST(outptr + 16, rgb1); ++ VEC_ST(outptr + 32, rgb2); ++#if PIXELSIZE == 4 ++ VEC_ST(outptr + 48, rgb3); ++#endif ++ outptr += PIXELSIZE * 16; ++ } ++ } ++ ++ num_cols = out_width & 31; ++ if (num_cols) { ++ int i; ++ for (i = 0; i < (num_cols + 1) >> 1; i++) { ++ tmpbuf[i] = inptr1[i]; ++ tmpbuf[i + 16] = inptr2[i]; ++ tmpbuf[i * 2 + 32] = inptr0[i * 2]; ++ tmpbuf[i * 2 + 32 + 1] = inptr0[i * 2 + 1]; ++ } ++ cb = VEC_LD(tmpbuf); ++ cr = VEC_LD(tmpbuf + 16); ++ CALC_MERGED1 ++ ++ y = VEC_LD(tmpbuf + 32); ++ CALC_MERGED2(r_yl, g_yl, b_yl) ++ RGB_SHUFFLE ++ if (num_cols >= 16) { ++ VEC_ST(outptr, rgb0); ++ VEC_ST(outptr + 16, rgb1); ++ VEC_ST(outptr + 32, rgb2); ++#if PIXELSIZE == 4 ++ VEC_ST(outptr + 48, rgb3); ++#endif ++ outptr += PIXELSIZE * 16; ++ ++ y = VEC_LD(tmpbuf + 48); ++ CALC_MERGED2(r_yh, g_yh, b_yh) ++ RGB_SHUFFLE ++ } ++ VEC_ST(tmpbuf, rgb0); ++ VEC_ST(tmpbuf + 16, rgb1); ++ VEC_ST(tmpbuf + 32, rgb2); ++#if PIXELSIZE == 4 ++ VEC_ST(tmpbuf + 48, rgb3); ++#endif ++ memcpy(outptr, tmpbuf, (out_width & 15) * PIXELSIZE); ++ } ++} ++ ++#undef RGB_SHUFFLE_INIT ++#undef RGB_SHUFFLE ++#undef PIXELSIZE ++#undef ycc_rgbn_convert ++#undef ycc_rgbn_merged ++ +diff --git a/simd/e2k/jdcolor-e2k.c b/simd/e2k/jdcolor-e2k.c +new file mode 100644 +index 0000000..94c80e9 +--- /dev/null ++++ b/simd/e2k/jdcolor-e2k.c +@@ -0,0 +1,289 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2015, D. R. Commander. All Rights Reserved. ++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. 
++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. ++ */ ++ ++/* YCC --> RGB CONVERSION */ ++ ++#include "jsimd_e2k.h" ++ ++#define F_0_344 22554 /* FIX(0.34414) */ ++#define F_0_714 46802 /* FIX(0.71414) */ ++#define F_1_402 91881 /* FIX(1.40200) */ ++#define F_1_772 116130 /* FIX(1.77200) */ ++#define F_0_402 (F_1_402 - 65536) /* FIX(1.40200) - FIX(1) */ ++#define F_0_285 (65536 - F_0_714) /* FIX(1) - FIX(0.71414) */ ++#define F_0_228 (131072 - F_1_772) /* FIX(2) - FIX(1.77200) */ ++ ++#define SCALEBITS 16 ++#define ONE_HALF (1 << (SCALEBITS - 1)) ++ ++static const uint8_t __attribute__((aligned(16))) ++#if defined(__iset__) && __iset__ >= 5 ++ycc_rgb_shuf_const[7][48] = { ++#define SHUF_CONST3 \ ++ C0, C1, C2, \ ++ C0 + 4, C1 + 4, C2 + 4, \ ++ C0 + 8, C1 + 8, C2 + 8, \ ++ C0 + 12, C1 + 12, C2 + 12, \ ++ C0 + 16, C1 + 16, C2 + 16, \ ++ C0 + 20, C1 + 4, C2 + 4, \ ++ C0 + 8, C1 + 8, C2 + 8, \ ++ C0 + 12, C1 + 12, C2 + 12, \ ++ C0 + 16, C1 + 16, C2 + 16, \ ++ C0 + 20, C1 + 20, C2 + 20, \ ++ C0 + 24, C1 + 24, C2 + 8, \ ++ C0 + 12, C1 + 12, C2 + 12, \ ++ C0 + 16, C1 + 16, C2 + 16, \ ++ C0 + 20, C1 + 20, C2 + 20, \ ++ C0 + 24, C1 + 24, C2 + 24, \ ++ C0 + 28, C1 + 28, C2 + 28 ++#else ++ycc_rgb_shuf_const[7][24] = { ++#define SHUF_CONST3 \ ++ C0, C1, C2, \ ++ C0 + 4, C1 + 4, C2 + 4, \ ++ C0 + 8, C1 + 8, C2, \ ++ C0 + 4, C1 + 4, C2 + 4, \ ++ C0 + 8, C1 + 8, C2 + 8, \ ++ C0 + 12, C1 + 4, C2 + 4, \ ++ C0 + 8, C1 + 8, C2 + 8, \ ++ C0 + 12, C1 + 12, C2 + 12 ++#endif ++ ++#define SHUF_CONST4 C0, C1, C2, C3, C0 + 4, C1 + 4, C2 + 4, C3 + 4, \ ++ C0 + 8, C1 + 8, C2 + 8, C3 + 8, C0 + 12, C1 + 12, C2 + 12, C3 + 12 ++ ++#define TMP_RED RGB_RED ++#define TMP_GREEN RGB_GREEN ++#define TMP_BLUE RGB_BLUE ++#define PIXELSIZE RGB_PIXELSIZE ++#include "jdcoltab-e2k.c" ++ , ++#define TMP_RED EXT_RGB_RED ++#define TMP_GREEN EXT_RGB_GREEN ++#define TMP_BLUE EXT_RGB_BLUE ++#define PIXELSIZE EXT_RGB_PIXELSIZE ++#include "jdcoltab-e2k.c" ++ , ++#define TMP_RED EXT_RGBX_RED ++#define TMP_GREEN EXT_RGBX_GREEN ++#define TMP_BLUE EXT_RGBX_BLUE ++#define PIXELSIZE EXT_RGBX_PIXELSIZE ++#include "jdcoltab-e2k.c" ++ , ++#define TMP_RED EXT_BGR_RED ++#define TMP_GREEN EXT_BGR_GREEN ++#define TMP_BLUE EXT_BGR_BLUE ++#define PIXELSIZE EXT_BGR_PIXELSIZE ++#include "jdcoltab-e2k.c" ++ , ++#define TMP_RED EXT_BGRX_RED ++#define TMP_GREEN EXT_BGRX_GREEN ++#define TMP_BLUE EXT_BGRX_BLUE ++#define PIXELSIZE EXT_BGRX_PIXELSIZE ++#include "jdcoltab-e2k.c" ++ , ++#define TMP_RED EXT_XBGR_RED ++#define TMP_GREEN EXT_XBGR_GREEN ++#define TMP_BLUE EXT_XBGR_BLUE ++#define PIXELSIZE EXT_XBGR_PIXELSIZE ++#include "jdcoltab-e2k.c" ++ , ++#define TMP_RED EXT_XRGB_RED ++#define TMP_GREEN EXT_XRGB_GREEN ++#define TMP_BLUE EXT_XRGB_BLUE ++#define PIXELSIZE EXT_XRGB_PIXELSIZE ++#include "jdcoltab-e2k.c" ++}; ++ ++ /* (Original) ++ * R = Y + 1.40200 * Cr ++ * G = Y - 0.34414 * Cb - 0.71414 * Cr ++ * B = Y + 1.77200 * Cb ++ * 
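++ * The 1.40200 and 1.77200 coefficients do not fit a signed Q15
++ * multiplier for _mm_mulhrs_epi16, so each is split into a fractional
++ * part plus whole additions of Cr or Cb:
++ *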
++ * (This implementation) ++ * R = Y + 0.40200 * Cr + Cr ++ * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr ++ * B = Y - 0.22800 * Cb + Cb + Cb ++ */ ++ ++#define CALC_RGB \ ++ yl = _mm_unpacklo_epi8(y, pb_zero); \ ++ yh = _mm_unpackhi_epi8(y, pb_zero); \ ++ \ ++ cbl = _mm_unpacklo_epi8(cb, pb_zero); \ ++ cbh = _mm_unpackhi_epi8(cb, pb_zero); \ ++ cbl = _mm_sub_epi16(cbl, pw_cj); \ ++ cbh = _mm_sub_epi16(cbh, pw_cj); \ ++ \ ++ crl = _mm_unpacklo_epi8(cr, pb_zero); \ ++ crh = _mm_unpackhi_epi8(cr, pb_zero); \ ++ crl = _mm_sub_epi16(crl, pw_cj); \ ++ crh = _mm_sub_epi16(crh, pw_cj); \ ++ \ ++ bl = _mm_mulhrs_epi16(cbl, pw_mf0228); \ ++ bh = _mm_mulhrs_epi16(cbh, pw_mf0228); \ ++ bl = _mm_add_epi16(bl, _mm_add_epi16(cbl, cbl)); \ ++ bh = _mm_add_epi16(bh, _mm_add_epi16(cbh, cbh)); \ ++ bl = _mm_add_epi16(bl, yl); \ ++ bh = _mm_add_epi16(bh, yh); \ ++ \ ++ rl = _mm_mulhrs_epi16(crl, pw_f0402); \ ++ rh = _mm_mulhrs_epi16(crh, pw_f0402); \ ++ rl = _mm_add_epi16(rl, crl); \ ++ rh = _mm_add_epi16(rh, crh); \ ++ rl = _mm_add_epi16(rl, yl); \ ++ rh = _mm_add_epi16(rh, yh); \ ++ \ ++ g0w = _mm_unpacklo_epi16(cbl, crl); \ ++ g1w = _mm_unpackhi_epi16(cbl, crl); \ ++ g0 = _mm_add_epi32(_mm_madd_epi16(g0w, pw_mf0344_f0285), pd_onehalf); \ ++ g1 = _mm_add_epi32(_mm_madd_epi16(g1w, pw_mf0344_f0285), pd_onehalf); \ ++ g2w = _mm_unpacklo_epi16(cbh, crh); \ ++ g3w = _mm_unpackhi_epi16(cbh, crh); \ ++ g2 = _mm_add_epi32(_mm_madd_epi16(g2w, pw_mf0344_f0285), pd_onehalf); \ ++ g3 = _mm_add_epi32(_mm_madd_epi16(g3w, pw_mf0344_f0285), pd_onehalf); \ ++ \ ++ gl = _mm_packhi_epi32(g0, g1); \ ++ gh = _mm_packhi_epi32(g2, g3); \ ++ gl = _mm_sub_epi16(gl, crl); \ ++ gh = _mm_sub_epi16(gh, crh); \ ++ gl = _mm_add_epi16(gl, yl); \ ++ gh = _mm_add_epi16(gh, yh); \ ++ \ ++ rl = _mm_packus_epi16(rl, rh); \ ++ gl = _mm_packus_epi16(gl, gh); \ ++ bl = _mm_packus_epi16(bl, bh); \ ++ \ ++ rg0 = _mm_unpacklo_epi8(rl, gl); \ ++ rg1 = _mm_unpackhi_epi8(rl, gl); \ ++ bx0 = _mm_unpacklo_epi8(bl, pb_255); \ ++ bx1 = _mm_unpackhi_epi8(bl, pb_255); \ ++ \ ++ rgb0 = _mm_unpacklo_epi16(rg0, bx0); \ ++ rgb1 = _mm_unpackhi_epi16(rg0, bx0); \ ++ rgb2 = _mm_unpacklo_epi16(rg1, bx1); \ ++ rgb3 = _mm_unpackhi_epi16(rg1, bx1); ++ ++#define CALC_MERGED1 \ ++ cbl = _mm_unpacklo_epi8(cb, pb_zero); \ ++ cbh = _mm_unpackhi_epi8(cb, pb_zero); \ ++ cbl = _mm_sub_epi16(cbl, pw_cj); \ ++ cbh = _mm_sub_epi16(cbh, pw_cj); \ ++ \ ++ crl = _mm_unpacklo_epi8(cr, pb_zero); \ ++ crh = _mm_unpackhi_epi8(cr, pb_zero); \ ++ crl = _mm_sub_epi16(crl, pw_cj); \ ++ crh = _mm_sub_epi16(crh, pw_cj); \ ++ \ ++ b_yl = _mm_mulhrs_epi16(cbl, pw_mf0228); \ ++ b_yh = _mm_mulhrs_epi16(cbh, pw_mf0228); \ ++ b_yl = _mm_add_epi16(b_yl, _mm_add_epi16(cbl, cbl)); \ ++ b_yh = _mm_add_epi16(b_yh, _mm_add_epi16(cbh, cbh)); \ ++ \ ++ r_yl = _mm_mulhrs_epi16(crl, pw_f0402); \ ++ r_yh = _mm_mulhrs_epi16(crh, pw_f0402); \ ++ r_yl = _mm_add_epi16(r_yl, crl); \ ++ r_yh = _mm_add_epi16(r_yh, crh); \ ++ \ ++ g_y0w = _mm_unpacklo_epi16(cbl, crl); \ ++ g_y1w = _mm_unpackhi_epi16(cbl, crl); \ ++ g_y0 = _mm_add_epi32(_mm_madd_epi16(g_y0w, pw_mf0344_f0285), pd_onehalf); \ ++ g_y1 = _mm_add_epi32(_mm_madd_epi16(g_y1w, pw_mf0344_f0285), pd_onehalf); \ ++ g_y2w = _mm_unpacklo_epi16(cbh, crh); \ ++ g_y3w = _mm_unpackhi_epi16(cbh, crh); \ ++ g_y2 = _mm_add_epi32(_mm_madd_epi16(g_y2w, pw_mf0344_f0285), pd_onehalf); \ ++ g_y3 = _mm_add_epi32(_mm_madd_epi16(g_y3w, pw_mf0344_f0285), pd_onehalf); \ ++ \ ++ g_yl = _mm_packhi_epi32(g_y0, g_y1); \ ++ g_yh = _mm_packhi_epi32(g_y2, g_y3); \ ++ g_yl = 
_mm_sub_epi16(g_yl, crl); \ ++ g_yh = _mm_sub_epi16(g_yh, crh); ++ ++#define CALC_MERGED2(r_yl, g_yl, b_yl) \ ++ yl = _mm_unpacklo_epi8(y, pb_zero); \ ++ yh = _mm_unpackhi_epi8(y, pb_zero); \ ++ bl = _mm_add_epi16(_mm_unpacklo_epi16(b_yl, b_yl), yl); \ ++ bh = _mm_add_epi16(_mm_unpackhi_epi16(b_yl, b_yl), yh); \ ++ rl = _mm_add_epi16(_mm_unpacklo_epi16(r_yl, r_yl), yl); \ ++ rh = _mm_add_epi16(_mm_unpackhi_epi16(r_yl, r_yl), yh); \ ++ gl = _mm_add_epi16(_mm_unpacklo_epi16(g_yl, g_yl), yl); \ ++ gh = _mm_add_epi16(_mm_unpackhi_epi16(g_yl, g_yl), yh); \ ++ rl = _mm_packus_epi16(rl, rh); \ ++ gl = _mm_packus_epi16(gl, gh); \ ++ bl = _mm_packus_epi16(bl, bh); \ ++ \ ++ rg0 = _mm_unpacklo_epi8(rl, gl); \ ++ rg1 = _mm_unpackhi_epi8(rl, gl); \ ++ bx0 = _mm_unpacklo_epi8(bl, pb_255); \ ++ bx1 = _mm_unpackhi_epi8(bl, pb_255); \ ++ \ ++ rgb0 = _mm_unpacklo_epi16(rg0, bx0); \ ++ rgb1 = _mm_unpackhi_epi16(rg0, bx0); \ ++ rgb2 = _mm_unpacklo_epi16(rg1, bx1); \ ++ rgb3 = _mm_unpackhi_epi16(rg1, bx1); ++ ++#define PIXELSIZE 3 ++#if defined(__iset__) && __iset__ >= 5 ++#define RGB_SHUFFLE_INIT __m128i \ ++ rgb_index0 = VEC_LD(ycc_rgb_shuf_const[shuf_idx]), \ ++ rgb_index1 = VEC_LD(ycc_rgb_shuf_const[shuf_idx] + 16), \ ++ rgb_index2 = VEC_LD(ycc_rgb_shuf_const[shuf_idx] + 32); ++#define RGB_SHUFFLE \ ++ rgb0 = _mm_shuffle2_epi8(rgb0, rgb1, rgb_index0); \ ++ rgb1 = _mm_shuffle2_epi8(rgb1, rgb2, rgb_index1); \ ++ rgb2 = _mm_shuffle2_epi8(rgb2, rgb3, rgb_index2); ++#else ++#define RGB_SHUFFLE_INIT __m64 \ ++ rgb_index0 = *(__m64*)ycc_rgb_shuf_const[shuf_idx], \ ++ rgb_index1 = *(__m64*)(ycc_rgb_shuf_const[shuf_idx] + 8), \ ++ rgb_index2 = *(__m64*)(ycc_rgb_shuf_const[shuf_idx] + 16); ++#define RGB_SHUFFLE { \ ++ union { __m128i v; __m64 d[2]; } a = { rgb0 }, \ ++ b = { rgb1 }, c = { rgb2 }, d = { rgb3 }; \ ++ a.d[0] = _mm_shuffle2_pi8(a.d[0], a.d[1], rgb_index0); \ ++ a.d[1] = _mm_shuffle2_pi8(a.d[1], b.d[0], rgb_index1); \ ++ b.d[0] = _mm_shuffle2_pi8(b.d[0], b.d[1], rgb_index2); \ ++ b.d[1] = _mm_shuffle2_pi8(c.d[0], c.d[1], rgb_index0); \ ++ c.d[0] = _mm_shuffle2_pi8(c.d[1], d.d[0], rgb_index1); \ ++ c.d[1] = _mm_shuffle2_pi8(d.d[0], d.d[1], rgb_index2); \ ++ rgb0 = a.v; rgb1 = b.v; rgb2 = c.v; \ ++} ++#endif ++ ++#define ycc_rgbn_convert jsimd_ycc_rgb3_convert_e2k ++#define ycc_rgbn_merged jsimd_ycc_rgb3_merged_upsample_e2k ++#include "jdcolext-e2k.c" ++ ++#define PIXELSIZE 4 ++#define RGB_SHUFFLE_INIT __m128i \ ++ rgb_index0 = VEC_LD(ycc_rgb_shuf_const[shuf_idx]); ++#define RGB_SHUFFLE \ ++ rgb0 = _mm_shuffle_epi8(rgb0, rgb_index0); \ ++ rgb1 = _mm_shuffle_epi8(rgb1, rgb_index0); \ ++ rgb2 = _mm_shuffle_epi8(rgb2, rgb_index0); \ ++ rgb3 = _mm_shuffle_epi8(rgb3, rgb_index0); ++ ++#define ycc_rgbn_convert jsimd_ycc_rgb4_convert_e2k ++#define ycc_rgbn_merged jsimd_ycc_rgb4_merged_upsample_e2k ++#include "jdcolext-e2k.c" ++ +diff --git a/simd/e2k/jdcoltab-e2k.c b/simd/e2k/jdcoltab-e2k.c +new file mode 100644 +index 0000000..e19666d +--- /dev/null ++++ b/simd/e2k/jdcoltab-e2k.c +@@ -0,0 +1,80 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. 
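[For the PIXELSIZE == 3 case above, CALC_RGB/CALC_MERGED2 leave 16 pixels as 64 bytes of R,G,B,255 quads in rgb0..rgb3, and RGB_SHUFFLE compacts them to 48 packed RGB bytes, so only three stores are needed instead of four. A standalone scalar model of that compaction; the plain loop stands in for the three _mm_shuffle2_epi8 (or six _mm_shuffle2_pi8) operations:

#include <assert.h>

int main(void)
{
  unsigned char rgbx[64], rgb[48];
  int p;
  for (p = 0; p < 16; p++) {        /* 16 pixels of R, G, B, filler */
    rgbx[4 * p + 0] = (unsigned char)(3 * p + 0);
    rgbx[4 * p + 1] = (unsigned char)(3 * p + 1);
    rgbx[4 * p + 2] = (unsigned char)(3 * p + 2);
    rgbx[4 * p + 3] = 255;
  }
  for (p = 0; p < 16; p++) {        /* drop every fourth byte */
    rgb[3 * p + 0] = rgbx[4 * p + 0];
    rgb[3 * p + 1] = rgbx[4 * p + 1];
    rgb[3 * p + 2] = rgbx[4 * p + 2];
  }
  for (p = 0; p < 48; p++)
    assert(rgb[p] == p);            /* 48 contiguous RGB bytes */
  return 0;
}]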
++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. ++ */ ++ ++/* This file is included by jdcolor-e2k.c */ ++ ++#if TMP_RED == 0 ++#define C0 0 ++#elif TMP_GREEN == 0 ++#define C0 1 ++#elif TMP_BLUE == 0 ++#define C0 2 ++#else ++#define C0 3 ++#endif ++ ++#if TMP_RED == 1 ++#define C1 0 ++#elif TMP_GREEN == 1 ++#define C1 1 ++#elif TMP_BLUE == 1 ++#define C1 2 ++#else ++#define C1 3 ++#endif ++ ++#if TMP_RED == 2 ++#define C2 0 ++#elif TMP_GREEN == 2 ++#define C2 1 ++#elif TMP_BLUE == 2 ++#define C2 2 ++#else ++#define C2 3 ++#endif ++ ++#if TMP_RED == 3 ++#define C3 0 ++#elif TMP_GREEN == 3 ++#define C3 1 ++#elif TMP_BLUE == 3 ++#define C3 2 ++#else ++#define C3 3 ++#endif ++ ++#if PIXELSIZE == 3 ++{ SHUF_CONST3 } ++#else ++{ SHUF_CONST4 } ++#endif ++ ++#undef C0 ++#undef C1 ++#undef C2 ++#undef C3 ++ ++#undef TMP_RED ++#undef TMP_GREEN ++#undef TMP_BLUE ++#undef PIXELSIZE ++ +diff --git a/simd/e2k/jdsample-e2k.c b/simd/e2k/jdsample-e2k.c +new file mode 100644 +index 0000000..572b3af +--- /dev/null ++++ b/simd/e2k/jdsample-e2k.c +@@ -0,0 +1,389 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2015, D. R. Commander. All Rights Reserved. ++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. ++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. 
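[The #if ladders in jdcoltab-e2k.c above compute, for each output byte position 0..3 of a pixel, which lane of the internal {R, G, B, 255} quad to fetch. Worked through for one concrete format, EXT_BGRX (RED=2, GREEN=1, BLUE=0), they resolve to C0=2, C1=1, C2=0, C3=3. A standalone scalar equivalent of that lookup:

#include <assert.h>

int main(void)
{
  const unsigned char quad[4] = { 10, 20, 30, 255 };  /* R, G, B, filler */
  const int C[4] = { 2, 1, 0, 3 };  /* C0..C3 as resolved for EXT_BGRX */
  unsigned char out[4];
  int i;
  for (i = 0; i < 4; i++)
    out[i] = quad[C[i]];            /* output byte i <- quad lane C[i] */
  assert(out[0] == 30 && out[1] == 20 && out[2] == 10 && out[3] == 255);
  return 0;
}]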
++ */ ++ ++/* CHROMA UPSAMPLING */ ++ ++#include "jsimd_e2k.h" ++ ++ ++void jsimd_h2v1_fancy_upsample_e2k(int max_v_samp_factor, ++ JDIMENSION downsampled_width, ++ JSAMPARRAY input_data, ++ JSAMPARRAY *output_data_ptr) ++{ ++ JSAMPARRAY output_data = *output_data_ptr; ++ JSAMPROW inptr, outptr; ++ int inrow, incol; ++ ++ __m128i pb_zero = _mm_setzero_si128(); ++ __m128i this0, last0, p_last0, next0 = pb_zero, p_next0, out; ++ __m128i this0l, this0h, last0l, last0h, ++ next0l, next0h, outle, outhe, outlo, outho; ++#ifdef NEED_ALIGN8 ++ ALIGN8_COMMON ++ ALIGN8_VARS(src) ++#endif ++ ++ /* Constants */ ++ __m128i pw_three = _mm_set1_epi16(3), ++ next_index_lastcol = _mm_setr_epi8( ++ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15), ++ pw_one = _mm_set1_epi16(1), pw_two = _mm_set1_epi16(2); ++ ++ if (downsampled_width > 0) ++ for (inrow = 0; inrow < max_v_samp_factor; inrow++) { ++ inptr = input_data[inrow]; ++ outptr = output_data[inrow]; ++ ++ if (downsampled_width & 15) ++ inptr[downsampled_width] = inptr[downsampled_width - 1]; ++ ++#ifdef NEED_ALIGN8 ++ ALIGN8_START(inptr, src) ++ ALIGN8_READ16(this0, src, 0) ++#else ++ this0 = VEC_LD(inptr); ++#endif ++ last0 = _mm_bslli_si128(this0, 15); ++ ++ PRAGMA_E2K("ivdep") ++ for (incol = downsampled_width; incol > 0; ++ incol -= 16, outptr += 32) { ++ ++ p_last0 = _mm_alignr_epi8(this0, last0, 15); ++ last0 = this0; ++ ++ if (__builtin_expect(incol <= 16, 0)) ++ p_next0 = _mm_shuffle_epi8(this0, next_index_lastcol); ++ else { ++#ifdef NEED_ALIGN8 ++ ALIGN8_READ16(next0, src, 1) src_ptr += 2; ++#else ++ next0 = VEC_LD(inptr + 16); inptr += 16; ++#endif ++ p_next0 = _mm_alignr_epi8(next0, this0, 1); ++ } ++ ++ this0l = _mm_mullo_epi16(_mm_unpacklo_epi8(this0, pb_zero), pw_three); ++ last0l = _mm_unpacklo_epi8(p_last0, pb_zero); ++ next0l = _mm_unpacklo_epi8(p_next0, pb_zero); ++ last0l = _mm_add_epi16(last0l, pw_one); ++ next0l = _mm_add_epi16(next0l, pw_two); ++ ++ outle = _mm_add_epi16(this0l, last0l); ++ outlo = _mm_add_epi16(this0l, next0l); ++ outle = _mm_srli_epi16(outle, 2); ++ outlo = _mm_srli_epi16(outlo, 2); ++ ++ out = _mm_or_si128(outle, _mm_slli_epi16(outlo, 8)); ++ VEC_ST(outptr, out); ++ ++ if (__builtin_expect(incol <= 8, 0)) break; ++ ++ this0h = _mm_mullo_epi16(_mm_unpackhi_epi8(this0, pb_zero), pw_three); ++ last0h = _mm_unpackhi_epi8(p_last0, pb_zero); ++ next0h = _mm_unpackhi_epi8(p_next0, pb_zero); ++ last0h = _mm_add_epi16(last0h, pw_one); ++ next0h = _mm_add_epi16(next0h, pw_two); ++ ++ outhe = _mm_add_epi16(this0h, last0h); ++ outho = _mm_add_epi16(this0h, next0h); ++ outhe = _mm_srli_epi16(outhe, 2); ++ outho = _mm_srli_epi16(outho, 2); ++ ++ out = _mm_or_si128(outhe, _mm_slli_epi16(outho, 8)); ++ VEC_ST(outptr + 16, out); ++ ++ this0 = next0; ++ } ++ } ++} ++ ++ ++void jsimd_h2v2_fancy_upsample_e2k(int max_v_samp_factor, ++ JDIMENSION downsampled_width, ++ JSAMPARRAY input_data, ++ JSAMPARRAY *output_data_ptr) ++{ ++ JSAMPARRAY output_data = *output_data_ptr; ++ JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1; ++ int inrow, outrow, incol; ++#ifdef NEED_ALIGN8 ++ ALIGN8_COMMON ++ ALIGN8_VARS(src_1) ++ ALIGN8_VARS(src0) ++ ALIGN8_VARS(src1) ++#endif ++ ++ __m128i pb_zero = _mm_setzero_si128(); ++ __m128i this_1, this0, this1, out; ++ __m128i this_1l, this_1h, this0l, this0h, this1l, this1h, ++ lastcolsum_1h, lastcolsum1h, ++ p_lastcolsum_1l, p_lastcolsum_1h, p_lastcolsum1l, p_lastcolsum1h, ++ thiscolsum_1l, thiscolsum_1h, thiscolsum1l, thiscolsum1h, ++ nextcolsum_1l = pb_zero, nextcolsum_1h = pb_zero, ++ 
nextcolsum1l = pb_zero, nextcolsum1h = pb_zero,
++ p_nextcolsum_1l, p_nextcolsum_1h, p_nextcolsum1l, p_nextcolsum1h,
++ tmpl, tmph, outle, outhe, outlo, outho;
++
++ /* Constants */
++ __m128i pw_three = _mm_set1_epi16(3),
++ pw_seven = _mm_set1_epi16(7), pw_eight = _mm_set1_epi16(8),
++ next_index_lastcol = _mm_setr_epi8(
++ 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14, 15);
++
++ if (downsampled_width > 0)
++ for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
++
++ inptr_1 = input_data[inrow - 1];
++ inptr0 = input_data[inrow];
++ inptr1 = input_data[inrow + 1];
++ outptr0 = output_data[outrow++];
++ outptr1 = output_data[outrow++];
++
++ if (downsampled_width & 15) {
++ inptr_1[downsampled_width] = inptr_1[downsampled_width - 1];
++ inptr0[downsampled_width] = inptr0[downsampled_width - 1];
++ inptr1[downsampled_width] = inptr1[downsampled_width - 1];
++ }
++
++#ifdef NEED_ALIGN8
++ ALIGN8_START(inptr0, src0)
++ ALIGN8_START(inptr_1, src_1)
++ ALIGN8_START(inptr1, src1)
++ ALIGN8_READ16(this0, src0, 0)
++ ALIGN8_READ16(this_1, src_1, 0)
++ ALIGN8_READ16(this1, src1, 0)
++#else
++ this0 = VEC_LD(inptr0);
++ this_1 = VEC_LD(inptr_1);
++ this1 = VEC_LD(inptr1);
++#endif
++
++ this0l = _mm_unpacklo_epi8(this0, pb_zero);
++ this0h = _mm_unpackhi_epi8(this0, pb_zero);
++ this0l = _mm_mullo_epi16(this0l, pw_three);
++ this0h = _mm_mullo_epi16(this0h, pw_three);
++
++ this_1l = _mm_unpacklo_epi8(this_1, pb_zero);
++ this_1h = _mm_unpackhi_epi8(this_1, pb_zero);
++ thiscolsum_1l = _mm_add_epi16(this0l, this_1l);
++ thiscolsum_1h = _mm_add_epi16(this0h, this_1h);
++ lastcolsum_1h = _mm_bslli_si128(thiscolsum_1l, 14);
++
++ this1l = _mm_unpacklo_epi8(this1, pb_zero);
++ this1h = _mm_unpackhi_epi8(this1, pb_zero);
++ thiscolsum1l = _mm_add_epi16(this0l, this1l);
++ thiscolsum1h = _mm_add_epi16(this0h, this1h);
++ lastcolsum1h = _mm_bslli_si128(thiscolsum1l, 14);
++
++ PRAGMA_E2K("ivdep")
++ for (incol = downsampled_width; incol > 0;
++ incol -= 16, outptr0 += 32, outptr1 += 32) {
++
++ p_lastcolsum_1l = _mm_alignr_epi8(thiscolsum_1l, lastcolsum_1h, 14);
++ p_lastcolsum_1h = _mm_alignr_epi8(thiscolsum_1h, thiscolsum_1l, 14);
++ p_lastcolsum1l = _mm_alignr_epi8(thiscolsum1l, lastcolsum1h, 14);
++ p_lastcolsum1h = _mm_alignr_epi8(thiscolsum1h, thiscolsum1l, 14);
++ lastcolsum_1h = thiscolsum_1h;
++ lastcolsum1h = thiscolsum1h;
++
++ if (__builtin_expect(incol <= 16, 0)) {
++ p_nextcolsum_1l = _mm_alignr_epi8(thiscolsum_1h, thiscolsum_1l, 2);
++ p_nextcolsum_1h = _mm_shuffle_epi8(thiscolsum_1h, next_index_lastcol);
++ p_nextcolsum1l = _mm_alignr_epi8(thiscolsum1h, thiscolsum1l, 2);
++ p_nextcolsum1h = _mm_shuffle_epi8(thiscolsum1h, next_index_lastcol);
++ } else {
++#ifdef NEED_ALIGN8
++ ALIGN8_READ16(this0, src0, 1) src0_ptr += 2;
++ ALIGN8_READ16(this_1, src_1, 1) src_1_ptr += 2;
++ ALIGN8_READ16(this1, src1, 1) src1_ptr += 2;
++#else
++ this0 = VEC_LD(inptr0 + 16); inptr0 += 16;
++ this_1 = VEC_LD(inptr_1 + 16); inptr_1 += 16;
++ this1 = VEC_LD(inptr1 + 16); inptr1 += 16;
++#endif
++ this0l = _mm_unpacklo_epi8(this0, pb_zero);
++ this0h = _mm_unpackhi_epi8(this0, pb_zero);
++ this0l = _mm_mullo_epi16(this0l, pw_three);
++ this0h = _mm_mullo_epi16(this0h, pw_three);
++
++ this_1l = _mm_unpacklo_epi8(this_1, pb_zero);
++ this_1h = _mm_unpackhi_epi8(this_1, pb_zero);
++ nextcolsum_1l = _mm_add_epi16(this0l, this_1l);
++ nextcolsum_1h = _mm_add_epi16(this0h, this_1h);
++ p_nextcolsum_1l = _mm_alignr_epi8(thiscolsum_1h, thiscolsum_1l, 2);
++ p_nextcolsum_1h = 
_mm_alignr_epi8(nextcolsum_1l, thiscolsum_1h, 2); ++ ++ this1l = _mm_unpacklo_epi8(this1, pb_zero); ++ this1h = _mm_unpackhi_epi8(this1, pb_zero); ++ nextcolsum1l = _mm_add_epi16(this0l, this1l); ++ nextcolsum1h = _mm_add_epi16(this0h, this1h); ++ p_nextcolsum1l = _mm_alignr_epi8(thiscolsum1h, thiscolsum1l, 2); ++ p_nextcolsum1h = _mm_alignr_epi8(nextcolsum1l, thiscolsum1h, 2); ++ } ++ ++ /* Process the upper row */ ++ ++ tmpl = _mm_mullo_epi16(thiscolsum_1l, pw_three); ++ outle = _mm_add_epi16(tmpl, p_lastcolsum_1l); ++ outle = _mm_add_epi16(outle, pw_eight); ++ outle = _mm_srli_epi16(outle, 4); ++ ++ outlo = _mm_add_epi16(tmpl, p_nextcolsum_1l); ++ outlo = _mm_add_epi16(outlo, pw_seven); ++ outlo = _mm_srli_epi16(outlo, 4); ++ ++ out = _mm_or_si128(outle, _mm_slli_epi16(outlo, 8)); ++ VEC_ST(outptr0, out); ++ ++ /* Process the lower row */ ++ ++ tmpl = _mm_mullo_epi16(thiscolsum1l, pw_three); ++ outle = _mm_add_epi16(tmpl, p_lastcolsum1l); ++ outle = _mm_add_epi16(outle, pw_eight); ++ outle = _mm_srli_epi16(outle, 4); ++ ++ outlo = _mm_add_epi16(tmpl, p_nextcolsum1l); ++ outlo = _mm_add_epi16(outlo, pw_seven); ++ outlo = _mm_srli_epi16(outlo, 4); ++ ++ out = _mm_or_si128(outle, _mm_slli_epi16(outlo, 8)); ++ VEC_ST(outptr1, out); ++ ++ if (__builtin_expect(incol <= 8, 0)) break; ++ ++ tmph = _mm_mullo_epi16(thiscolsum_1h, pw_three); ++ outhe = _mm_add_epi16(tmph, p_lastcolsum_1h); ++ outhe = _mm_add_epi16(outhe, pw_eight); ++ outhe = _mm_srli_epi16(outhe, 4); ++ ++ outho = _mm_add_epi16(tmph, p_nextcolsum_1h); ++ outho = _mm_add_epi16(outho, pw_seven); ++ outho = _mm_srli_epi16(outho, 4); ++ ++ out = _mm_or_si128(outhe, _mm_slli_epi16(outho, 8)); ++ VEC_ST(outptr0 + 16, out); ++ ++ tmph = _mm_mullo_epi16(thiscolsum1h, pw_three); ++ outhe = _mm_add_epi16(tmph, p_lastcolsum1h); ++ outhe = _mm_add_epi16(outhe, pw_eight); ++ outhe = _mm_srli_epi16(outhe, 4); ++ ++ outho = _mm_add_epi16(tmph, p_nextcolsum1h); ++ outho = _mm_add_epi16(outho, pw_seven); ++ outho = _mm_srli_epi16(outho, 4); ++ ++ out = _mm_or_si128(outhe, _mm_slli_epi16(outho, 8)); ++ VEC_ST(outptr1 + 16, out); ++ ++ thiscolsum_1l = nextcolsum_1l; thiscolsum_1h = nextcolsum_1h; ++ thiscolsum1l = nextcolsum1l; thiscolsum1h = nextcolsum1h; ++ } ++ } ++} ++ ++ ++/* These are rarely used (mainly just for decompressing YCCK images) */ ++ ++void jsimd_h2v1_upsample_e2k(int max_v_samp_factor, ++ JDIMENSION out_width, ++ JSAMPARRAY input_data, ++ JSAMPARRAY *output_data_ptr) ++{ ++ JSAMPARRAY output_data = *output_data_ptr; ++ JSAMPROW inptr, outptr; ++ int inrow, incol; ++ ++ __m128i in, inl, inh; ++#ifdef NEED_ALIGN8 ++ ALIGN8_COMMON ++ ALIGN8_VARS(src) ++#endif ++ ++ if (out_width > 0) ++ for (inrow = 0; inrow < max_v_samp_factor; inrow++) { ++ inptr = input_data[inrow]; ++ outptr = output_data[inrow]; ++#ifdef NEED_ALIGN8 ++ ALIGN8_START(inptr, src) ++#endif ++ PRAGMA_E2K("ivdep") ++ for (incol = out_width; incol > 0; ++ incol -= 32, outptr += 32) { ++#ifdef NEED_ALIGN8 ++ ALIGN8_READ16(in, src, 0) src_ptr += 2; ++#else ++ in = VEC_LD(inptr); inptr += 16; ++#endif ++ inl = _mm_unpacklo_epi8(in, in); ++ inh = _mm_unpackhi_epi8(in, in); ++ ++ VEC_ST(outptr, inl); ++ VEC_ST(outptr + 16, inh); ++ } ++ } ++} ++ ++ ++void jsimd_h2v2_upsample_e2k(int max_v_samp_factor, ++ JDIMENSION out_width, ++ JSAMPARRAY input_data, ++ JSAMPARRAY *output_data_ptr) ++{ ++ JSAMPARRAY output_data = *output_data_ptr; ++ JSAMPROW inptr, outptr0, outptr1; ++ int inrow, outrow, incol; ++ ++ __m128i in, inl, inh; ++#ifdef NEED_ALIGN8 ++ ALIGN8_COMMON ++ 
ALIGN8_VARS(src) ++#endif ++ ++ if (out_width > 0) ++ for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) { ++ inptr = input_data[inrow]; ++ outptr0 = output_data[outrow++]; ++ outptr1 = output_data[outrow++]; ++#ifdef NEED_ALIGN8 ++ ALIGN8_START(inptr, src) ++#endif ++ PRAGMA_E2K("ivdep") ++ for (incol = out_width; incol > 0; ++ incol -= 32, outptr0 += 32, outptr1 += 32) { ++#ifdef NEED_ALIGN8 ++ ALIGN8_READ16(in, src, 0) src_ptr += 2; ++#else ++ in = VEC_LD(inptr); inptr += 16; ++#endif ++ inl = _mm_unpacklo_epi8(in, in); ++ inh = _mm_unpackhi_epi8(in, in); ++ ++ VEC_ST(outptr0, inl); ++ VEC_ST(outptr1, inl); ++ VEC_ST(outptr0 + 16, inh); ++ VEC_ST(outptr1 + 16, inh); ++ } ++ } ++} +diff --git a/simd/e2k/jfdctflt-e2k.c b/simd/e2k/jfdctflt-e2k.c +new file mode 100644 +index 0000000..e3c4d94 +--- /dev/null ++++ b/simd/e2k/jfdctflt-e2k.c +@@ -0,0 +1,127 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2014, D. R. Commander. All Rights Reserved. ++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. ++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. 
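[The fancy upsamplers above implement the library's triangle filter: each output sample is 3/4 of the nearest input plus 1/4 of its other neighbour, with +1/+2 biases that alternate the rounding direction. Duplicating the edge samples (the inptr[downsampled_width] stores and the next_index_lastcol shuffle) makes the first and last columns degenerate to pass-through, so no scalar special cases are needed. A standalone scalar model of the h2v1 variant; h2v1_fancy() is illustrative, not a patch function:

#include <assert.h>

static void h2v1_fancy(const unsigned char *in, unsigned char *out, int w)
{
  int i;
  for (i = 0; i < w; i++) {
    int last = in[i > 0 ? i - 1 : 0];         /* replicated left edge */
    int next = in[i < w - 1 ? i + 1 : w - 1]; /* replicated right edge */
    out[2 * i]     = (unsigned char)((3 * in[i] + last + 1) >> 2);
    out[2 * i + 1] = (unsigned char)((3 * in[i] + next + 2) >> 2);
  }
}

int main(void)
{
  unsigned char in[4] = { 0, 64, 128, 255 }, out[8];
  h2v1_fancy(in, out, 4);
  /* With replicated edges the outermost outputs equal the inputs:
   * (4 * v + 1) >> 2 == v and (4 * v + 2) >> 2 == v. */
  assert(out[0] == in[0] && out[7] == in[3]);
  return 0;
}]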
++ */ ++ ++/* FLOAT FORWARD DCT */ ++ ++#include "jsimd_e2k.h" ++ ++#define DO_FDCT(in, out) { \ ++ tmp0 = _mm_add_ps(in##0, in##7); \ ++ tmp7 = _mm_sub_ps(in##0, in##7); \ ++ tmp1 = _mm_add_ps(in##1, in##6); \ ++ tmp6 = _mm_sub_ps(in##1, in##6); \ ++ tmp2 = _mm_add_ps(in##2, in##5); \ ++ tmp5 = _mm_sub_ps(in##2, in##5); \ ++ tmp3 = _mm_add_ps(in##3, in##4); \ ++ tmp4 = _mm_sub_ps(in##3, in##4); \ ++ \ ++ /* Even part */ \ ++ \ ++ tmp10 = _mm_add_ps(tmp0, tmp3); \ ++ tmp13 = _mm_sub_ps(tmp0, tmp3); \ ++ tmp11 = _mm_add_ps(tmp1, tmp2); \ ++ tmp12 = _mm_sub_ps(tmp1, tmp2); \ ++ \ ++ out##0 = _mm_add_ps(tmp10, tmp11); \ ++ out##4 = _mm_sub_ps(tmp10, tmp11); \ ++ \ ++ z1 = _mm_mul_ps(_mm_add_ps(tmp12, tmp13), pd_f0707); \ ++ out##2 = _mm_add_ps(tmp13, z1); \ ++ out##6 = _mm_sub_ps(tmp13, z1); \ ++ \ ++ /* Odd part */ \ ++ \ ++ tmp10 = _mm_add_ps(tmp4, tmp5); \ ++ tmp11 = _mm_add_ps(tmp5, tmp6); \ ++ tmp12 = _mm_add_ps(tmp6, tmp7); \ ++ \ ++ z5 = _mm_mul_ps(_mm_sub_ps(tmp10, tmp12), pd_f0382); \ ++ z2 = _mm_add_ps(_mm_mul_ps(tmp10, pd_f0541), z5); \ ++ z4 = _mm_add_ps(_mm_mul_ps(tmp12, pd_f1306), z5); \ ++ z3 = _mm_mul_ps(tmp11, pd_f0707); \ ++ \ ++ z11 = _mm_add_ps(tmp7, z3); \ ++ z13 = _mm_sub_ps(tmp7, z3); \ ++ \ ++ out##5 = _mm_add_ps(z13, z2); \ ++ out##3 = _mm_sub_ps(z13, z2); \ ++ out##1 = _mm_add_ps(z11, z4); \ ++ out##7 = _mm_sub_ps(z11, z4); \ ++} ++ ++#define LOAD_DATA(a, b, c, d, l, i) \ ++ l##a = _mm_loadu_ps(data + a * 8 + i); \ ++ l##b = _mm_loadu_ps(data + b * 8 + i); \ ++ l##c = _mm_loadu_ps(data + c * 8 + i); \ ++ l##d = _mm_loadu_ps(data + d * 8 + i); ++ ++#define STORE_DATA(a, b, c, d, l, i) \ ++ _mm_storeu_ps(data + a * 8 + i, l##a); \ ++ _mm_storeu_ps(data + b * 8 + i, l##b); \ ++ _mm_storeu_ps(data + c * 8 + i, l##c); \ ++ _mm_storeu_ps(data + d * 8 + i, l##d); ++ ++ ++void jsimd_fdct_float_e2k(FAST_FLOAT *data) ++{ ++ __m128 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, ++ tmp10, tmp11, tmp12, tmp13, z1, z2, z3, z4, z5, z11, z13; ++ __m128 l0, l1, l2, l3, l4, l5, l6, l7; ++ __m128 h0, h1, h2, h3, h4, h5, h6, h7; ++ __m128 x0, x1, x2, x3, x4, x5, x6, x7; ++ __m128 y0, y1, y2, y3, y4, y5, y6, y7; ++ ++ /* Constants */ ++ __m128 pd_f0382 = _mm_set1_ps(0.382683433f), ++ pd_f0541 = _mm_set1_ps(0.541196100f), ++ pd_f0707 = _mm_set1_ps(0.707106781f), ++ pd_f1306 = _mm_set1_ps(1.306562965f); ++ ++ /* Pass 1: process columns */ ++ ++ LOAD_DATA(0, 1, 2, 3, x, 0) ++ LOAD_DATA(0, 1, 2, 3, y, 4) ++ TRANSPOSE_FLOAT(x0, x1, x2, x3, l0, l1, l2, l3) ++ TRANSPOSE_FLOAT(y0, y1, y2, y3, l4, l5, l6, l7) ++ DO_FDCT(l, l); ++ ++ LOAD_DATA(4, 5, 6, 7, x, 0) ++ LOAD_DATA(4, 5, 6, 7, y, 4) ++ TRANSPOSE_FLOAT(x4, x5, x6, x7, h0, h1, h2, h3) ++ TRANSPOSE_FLOAT(y4, y5, y6, y7, h4, h5, h6, h7) ++ DO_FDCT(h, h); ++ ++ /* Pass 2: process rows */ ++ ++ TRANSPOSE_FLOAT(l0, l1, l2, l3, x0, x1, x2, x3) ++ TRANSPOSE_FLOAT(h0, h1, h2, h3, x4, x5, x6, x7) ++ DO_FDCT(x, x); ++ STORE_DATA(0, 1, 2, 3, x, 0) ++ STORE_DATA(4, 5, 6, 7, x, 0) ++ ++ TRANSPOSE_FLOAT(l4, l5, l6, l7, y0, y1, y2, y3) ++ TRANSPOSE_FLOAT(h4, h5, h6, h7, y4, y5, y6, y7) ++ DO_FDCT(y, y); ++ STORE_DATA(0, 1, 2, 3, y, 4) ++ STORE_DATA(4, 5, 6, 7, y, 4) ++} +diff --git a/simd/e2k/jfdctfst-e2k.c b/simd/e2k/jfdctfst-e2k.c +new file mode 100644 +index 0000000..9e58f05 +--- /dev/null ++++ b/simd/e2k/jfdctfst-e2k.c +@@ -0,0 +1,145 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2014, D. R. Commander. All Rights Reserved. 
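[jsimd_fdct_float_e2k above never transposes the 8x8 block in one step: TRANSPOSE_FLOAT handles 4x4 tiles of __m128 lanes, and the full transpose is four tile transposes with the two off-diagonal tiles swapped. A standalone scalar model of that tiling; tile_transpose() is illustrative only:

#include <assert.h>

/* Move one 4x4 tile of s, transposed, into d.  (dr, dc) is the
 * destination tile origin, (sr, sc) the source tile origin. */
static void tile_transpose(float d[8][8], float s[8][8],
                           int dr, int dc, int sr, int sc)
{
  int i, j;
  for (i = 0; i < 4; i++)
    for (j = 0; j < 4; j++)
      d[dr + i][dc + j] = s[sr + j][sc + i];
}

int main(void)
{
  float a[8][8], t[8][8];
  int i, j;
  for (i = 0; i < 8; i++)
    for (j = 0; j < 8; j++)
      a[i][j] = (float)(i * 8 + j);
  tile_transpose(t, a, 0, 0, 0, 0);  /* diagonal tile, in place */
  tile_transpose(t, a, 0, 4, 4, 0);  /* off-diagonal tiles swap... */
  tile_transpose(t, a, 4, 0, 0, 4);  /* ...with each other */
  tile_transpose(t, a, 4, 4, 4, 4);  /* diagonal tile, in place */
  for (i = 0; i < 8; i++)
    for (j = 0; j < 8; j++)
      assert(t[i][j] == a[j][i]);    /* full 8x8 transpose achieved */
  return 0;
}]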
++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. ++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. ++ */ ++ ++/* FAST INTEGER FORWARD DCT */ ++ ++#include "jsimd_e2k.h" ++ ++ ++#define F_0_382 98 /* FIX(0.382683433) */ ++#define F_0_541 139 /* FIX(0.541196100) */ ++#define F_0_707 181 /* FIX(0.707106781) */ ++#define F_1_306 334 /* FIX(1.306562965) */ ++ ++#define CONST_BITS 8 ++#define PRE_MULTIPLY_SCALE_BITS 2 ++#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) ++ ++ ++#define DO_FDCT() { \ ++ /* Even part */ \ ++ \ ++ tmp10 = _mm_add_epi16(tmp0, tmp3); \ ++ tmp13 = _mm_sub_epi16(tmp0, tmp3); \ ++ tmp11 = _mm_add_epi16(tmp1, tmp2); \ ++ tmp12 = _mm_sub_epi16(tmp1, tmp2); \ ++ \ ++ out0 = _mm_add_epi16(tmp10, tmp11); \ ++ out4 = _mm_sub_epi16(tmp10, tmp11); \ ++ \ ++ z1 = _mm_add_epi16(tmp12, tmp13); \ ++ z1 = _mm_slli_epi16(z1, PRE_MULTIPLY_SCALE_BITS); \ ++ z1 = _mm_mulhi_epi16(z1, pw_0707); \ ++ \ ++ out2 = _mm_add_epi16(tmp13, z1); \ ++ out6 = _mm_sub_epi16(tmp13, z1); \ ++ \ ++ /* Odd part */ \ ++ \ ++ tmp10 = _mm_add_epi16(tmp4, tmp5); \ ++ tmp11 = _mm_add_epi16(tmp5, tmp6); \ ++ tmp12 = _mm_add_epi16(tmp6, tmp7); \ ++ \ ++ tmp10 = _mm_slli_epi16(tmp10, PRE_MULTIPLY_SCALE_BITS); \ ++ tmp12 = _mm_slli_epi16(tmp12, PRE_MULTIPLY_SCALE_BITS); \ ++ z5 = _mm_sub_epi16(tmp10, tmp12); \ ++ z5 = _mm_mulhi_epi16(z5, pw_0382); \ ++ \ ++ z2 = _mm_add_epi16(_mm_mulhi_epi16(tmp10, pw_0541), z5); \ ++ z4 = _mm_add_epi16(_mm_mulhi_epi16(tmp12, pw_1306), z5); \ ++ \ ++ tmp11 = _mm_slli_epi16(tmp11, PRE_MULTIPLY_SCALE_BITS); \ ++ z3 = _mm_mulhi_epi16(tmp11, pw_0707); \ ++ \ ++ z11 = _mm_add_epi16(tmp7, z3); \ ++ z13 = _mm_sub_epi16(tmp7, z3); \ ++ \ ++ out5 = _mm_add_epi16(z13, z2); \ ++ out3 = _mm_sub_epi16(z13, z2); \ ++ out1 = _mm_add_epi16(z11, z4); \ ++ out7 = _mm_sub_epi16(z11, z4); \ ++} ++ ++ ++void jsimd_fdct_ifast_e2k(DCTELEM *data) ++{ ++ __m128i row0, row1, row2, row3, row4, row5, row6, row7, ++ col0, col1, col2, col3, col4, col5, col6, col7, ++ tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13, ++ z1, z2, z3, z4, z5, z11, z13, ++ out0, out1, out2, out3, out4, out5, out6, out7; ++ ++ /* Constants */ ++ __m128i pw_0382 = _mm_set1_epi16(F_0_382 << CONST_SHIFT), ++ pw_0541 = _mm_set1_epi16(F_0_541 << CONST_SHIFT), ++ pw_0707 = _mm_set1_epi16(F_0_707 << CONST_SHIFT), ++ pw_1306 = _mm_set1_epi16(F_1_306 << CONST_SHIFT); ++ ++ /* Pass 1: process rows */ ++ ++ row0 = VEC_LD(data + 0 * 8); ++ row1 = VEC_LD(data + 1 * 8); ++ row2 = VEC_LD(data + 2 * 8); ++ row3 = VEC_LD(data + 3 * 8); ++ row4 = VEC_LD(data + 4 * 8); ++ row5 = VEC_LD(data + 5 * 8); ++ row6 = VEC_LD(data + 6 * 8); ++ row7 = VEC_LD(data + 7 * 8); ++ ++ TRANSPOSE(row, 
col); ++ ++ tmp0 = _mm_add_epi16(col0, col7); ++ tmp7 = _mm_sub_epi16(col0, col7); ++ tmp1 = _mm_add_epi16(col1, col6); ++ tmp6 = _mm_sub_epi16(col1, col6); ++ tmp2 = _mm_add_epi16(col2, col5); ++ tmp5 = _mm_sub_epi16(col2, col5); ++ tmp3 = _mm_add_epi16(col3, col4); ++ tmp4 = _mm_sub_epi16(col3, col4); ++ ++ DO_FDCT(); ++ ++ /* Pass 2: process columns */ ++ ++ TRANSPOSE(out, row); ++ ++ tmp0 = _mm_add_epi16(row0, row7); ++ tmp7 = _mm_sub_epi16(row0, row7); ++ tmp1 = _mm_add_epi16(row1, row6); ++ tmp6 = _mm_sub_epi16(row1, row6); ++ tmp2 = _mm_add_epi16(row2, row5); ++ tmp5 = _mm_sub_epi16(row2, row5); ++ tmp3 = _mm_add_epi16(row3, row4); ++ tmp4 = _mm_sub_epi16(row3, row4); ++ ++ DO_FDCT(); ++ ++ VEC_ST(data + 0 * 8, out0); ++ VEC_ST(data + 1 * 8, out1); ++ VEC_ST(data + 2 * 8, out2); ++ VEC_ST(data + 3 * 8, out3); ++ VEC_ST(data + 4 * 8, out4); ++ VEC_ST(data + 5 * 8, out5); ++ VEC_ST(data + 6 * 8, out6); ++ VEC_ST(data + 7 * 8, out7); ++} +diff --git a/simd/e2k/jfdctint-e2k.c b/simd/e2k/jfdctint-e2k.c +new file mode 100644 +index 0000000..2200852 +--- /dev/null ++++ b/simd/e2k/jfdctint-e2k.c +@@ -0,0 +1,255 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2014, 2020, D. R. Commander. All Rights Reserved. ++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. ++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. 
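[How the ifast FDCT above multiplies by fractional constants: samples are pre-scaled by PRE_MULTIPLY_SCALE_BITS = 2, the 8-bit fixed-point constants are stored shifted left by CONST_SHIFT = 6, and _mm_mulhi_epi16 keeps the top 16 bits of the 32-bit product, so the net result is (v * F) >> 8. A standalone scalar check; it assumes an arithmetic right shift on negative values, as the hardware performs:

#include <assert.h>

int main(void)
{
  const int F_0_707 = 181;        /* FIX(0.707106781) at CONST_BITS = 8 */
  int v;
  for (v = -1024; v <= 1024; v++) {
    int a = v * 4;                /* << PRE_MULTIPLY_SCALE_BITS */
    int b = F_0_707 << 6;         /* << CONST_SHIFT */
    int mulhi = (a * b) >> 16;    /* one lane of _mm_mulhi_epi16 */
    assert(mulhi == (v * F_0_707) >> 8);
  }
  return 0;
}]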
++ */ ++ ++/* ACCURATE INTEGER FORWARD DCT */ ++ ++#include "jsimd_e2k.h" ++ ++ ++#define F_0_298 2446 /* FIX(0.298631336) */ ++#define F_0_390 3196 /* FIX(0.390180644) */ ++#define F_0_541 4433 /* FIX(0.541196100) */ ++#define F_0_765 6270 /* FIX(0.765366865) */ ++#define F_0_899 7373 /* FIX(0.899976223) */ ++#define F_1_175 9633 /* FIX(1.175875602) */ ++#define F_1_501 12299 /* FIX(1.501321110) */ ++#define F_1_847 15137 /* FIX(1.847759065) */ ++#define F_1_961 16069 /* FIX(1.961570560) */ ++#define F_2_053 16819 /* FIX(2.053119869) */ ++#define F_2_562 20995 /* FIX(2.562915447) */ ++#define F_3_072 25172 /* FIX(3.072711026) */ ++ ++#define CONST_BITS 13 ++#define PASS1_BITS 2 ++#define DESCALE_P1 (CONST_BITS - PASS1_BITS) ++#define DESCALE_P2 (CONST_BITS + PASS1_BITS) ++ ++ ++#define DO_FDCT_COMMON(PASS) { \ ++ /* (Original) \ ++ * z1 = (tmp12 + tmp13) * 0.541196100; \ ++ * data2 = z1 + tmp13 * 0.765366865; \ ++ * data6 = z1 + tmp12 * -1.847759065; \ ++ * \ ++ * (This implementation) \ ++ * data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; \ ++ * data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); \ ++ */ \ ++ \ ++ tmp1312l = _mm_unpacklo_epi16(tmp13, tmp12); \ ++ tmp1312h = _mm_unpackhi_epi16(tmp13, tmp12); \ ++ \ ++ out2l = _mm_add_epi32(_mm_madd_epi16(tmp1312l, pw_f130_f054), pd_descale_p##PASS); \ ++ out2h = _mm_add_epi32(_mm_madd_epi16(tmp1312h, pw_f130_f054), pd_descale_p##PASS); \ ++ out6l = _mm_add_epi32(_mm_madd_epi16(tmp1312l, pw_f054_mf130), pd_descale_p##PASS); \ ++ out6h = _mm_add_epi32(_mm_madd_epi16(tmp1312h, pw_f054_mf130), pd_descale_p##PASS); \ ++ \ ++ out2l = _mm_srai_epi32(out2l, DESCALE_P##PASS); \ ++ out2h = _mm_srai_epi32(out2h, DESCALE_P##PASS); \ ++ out6l = _mm_srai_epi32(out6l, DESCALE_P##PASS); \ ++ out6h = _mm_srai_epi32(out6h, DESCALE_P##PASS); \ ++ \ ++ out2 = _mm_packs_epi32(out2l, out2h); \ ++ out6 = _mm_packs_epi32(out6l, out6h); \ ++ \ ++ /* Odd part */ \ ++ \ ++ z3 = _mm_add_epi16(tmp4, tmp6); \ ++ z4 = _mm_add_epi16(tmp5, tmp7); \ ++ \ ++ /* (Original) \ ++ * z5 = (z3 + z4) * 1.175875602; \ ++ * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; \ ++ * z3 += z5; z4 += z5; \ ++ * \ ++ * (This implementation) \ ++ * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \ ++ * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \ ++ */ \ ++ \ ++ z34l = _mm_unpacklo_epi16(z3, z4); \ ++ z34h = _mm_unpackhi_epi16(z3, z4); \ ++ \ ++ z3l = _mm_add_epi32(_mm_madd_epi16(z34l, pw_mf078_f117), pd_descale_p##PASS); \ ++ z3h = _mm_add_epi32(_mm_madd_epi16(z34h, pw_mf078_f117), pd_descale_p##PASS); \ ++ z4l = _mm_add_epi32(_mm_madd_epi16(z34l, pw_f117_f078), pd_descale_p##PASS); \ ++ z4h = _mm_add_epi32(_mm_madd_epi16(z34h, pw_f117_f078), pd_descale_p##PASS); \ ++ \ ++ /* (Original) \ ++ * z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; \ ++ * tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; \ ++ * tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; \ ++ * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; \ ++ * data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; \ ++ * data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; \ ++ * \ ++ * (This implementation) \ ++ * tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; \ ++ * tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; \ ++ * tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); \ ++ * tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); \ ++ * data7 = tmp4 + z3; data5 = tmp5 + z4; \ ++ * data3 = tmp6 + z3; data1 = tmp7 + z4; \ ++ */ \ ++ \ 
++ tmp47l = _mm_unpacklo_epi16(tmp4, tmp7); \ ++ tmp47h = _mm_unpackhi_epi16(tmp4, tmp7); \ ++ \ ++ out7l = _mm_add_epi32(_mm_madd_epi16(tmp47l, pw_mf060_mf089), z3l); \ ++ out7h = _mm_add_epi32(_mm_madd_epi16(tmp47h, pw_mf060_mf089), z3h); \ ++ out1l = _mm_add_epi32(_mm_madd_epi16(tmp47l, pw_mf089_f060), z4l); \ ++ out1h = _mm_add_epi32(_mm_madd_epi16(tmp47h, pw_mf089_f060), z4h); \ ++ \ ++ out7l = _mm_srai_epi32(out7l, DESCALE_P##PASS); \ ++ out7h = _mm_srai_epi32(out7h, DESCALE_P##PASS); \ ++ out1l = _mm_srai_epi32(out1l, DESCALE_P##PASS); \ ++ out1h = _mm_srai_epi32(out1h, DESCALE_P##PASS); \ ++ \ ++ out7 = _mm_packs_epi32(out7l, out7h); \ ++ out1 = _mm_packs_epi32(out1l, out1h); \ ++ \ ++ tmp56l = _mm_unpacklo_epi16(tmp5, tmp6); \ ++ tmp56h = _mm_unpackhi_epi16(tmp5, tmp6); \ ++ \ ++ out5l = _mm_add_epi32(_mm_madd_epi16(tmp56l, pw_mf050_mf256), z4l); \ ++ out5h = _mm_add_epi32(_mm_madd_epi16(tmp56h, pw_mf050_mf256), z4h); \ ++ out3l = _mm_add_epi32(_mm_madd_epi16(tmp56l, pw_mf256_f050), z3l); \ ++ out3h = _mm_add_epi32(_mm_madd_epi16(tmp56h, pw_mf256_f050), z3h); \ ++ \ ++ out5l = _mm_srai_epi32(out5l, DESCALE_P##PASS); \ ++ out5h = _mm_srai_epi32(out5h, DESCALE_P##PASS); \ ++ out3l = _mm_srai_epi32(out3l, DESCALE_P##PASS); \ ++ out3h = _mm_srai_epi32(out3h, DESCALE_P##PASS); \ ++ \ ++ out5 = _mm_packs_epi32(out5l, out5h); \ ++ out3 = _mm_packs_epi32(out3l, out3h); \ ++} ++ ++#define DO_FDCT_PASS1() { \ ++ /* Even part */ \ ++ \ ++ tmp10 = _mm_add_epi16(tmp0, tmp3); \ ++ tmp13 = _mm_sub_epi16(tmp0, tmp3); \ ++ tmp11 = _mm_add_epi16(tmp1, tmp2); \ ++ tmp12 = _mm_sub_epi16(tmp1, tmp2); \ ++ \ ++ out0 = _mm_add_epi16(tmp10, tmp11); \ ++ out0 = _mm_slli_epi16(out0, PASS1_BITS); \ ++ out4 = _mm_sub_epi16(tmp10, tmp11); \ ++ out4 = _mm_slli_epi16(out4, PASS1_BITS); \ ++ \ ++ DO_FDCT_COMMON(1); \ ++} ++ ++#define DO_FDCT_PASS2() { \ ++ /* Even part */ \ ++ \ ++ tmp10 = _mm_add_epi16(tmp0, tmp3); \ ++ tmp13 = _mm_sub_epi16(tmp0, tmp3); \ ++ tmp11 = _mm_add_epi16(tmp1, tmp2); \ ++ tmp12 = _mm_sub_epi16(tmp1, tmp2); \ ++ \ ++ out0 = _mm_add_epi16(tmp10, tmp11); \ ++ out0 = _mm_add_epi16(out0, pw_descale_p2x); \ ++ out0 = _mm_srai_epi16(out0, PASS1_BITS); \ ++ out4 = _mm_sub_epi16(tmp10, tmp11); \ ++ out4 = _mm_add_epi16(out4, pw_descale_p2x); \ ++ out4 = _mm_srai_epi16(out4, PASS1_BITS); \ ++ \ ++ DO_FDCT_COMMON(2); \ ++} ++ ++ ++void jsimd_fdct_islow_e2k(DCTELEM *data) ++{ ++ __m128i row0, row1, row2, row3, row4, row5, row6, row7, ++ col0, col1, col2, col3, col4, col5, col6, col7, ++ tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13, ++ tmp47l, tmp47h, tmp56l, tmp56h, tmp1312l, tmp1312h, ++ z3, z4, z34l, z34h, ++ out0, out1, out2, out3, out4, out5, out6, out7; ++ __m128i z3l, z3h, z4l, z4h, ++ out1l, out1h, out2l, out2h, out3l, out3h, out5l, out5h, out6l, out6h, ++ out7l, out7h; ++ ++ /* Constants */ ++ __m128i pw_f130_f054 = _mm_setr_epi16(__4X2(F_0_541 + F_0_765, F_0_541)), ++ pw_f054_mf130 = _mm_setr_epi16(__4X2(F_0_541, F_0_541 - F_1_847)), ++ pw_mf078_f117 = _mm_setr_epi16(__4X2(F_1_175 - F_1_961, F_1_175)), ++ pw_f117_f078 = _mm_setr_epi16(__4X2(F_1_175, F_1_175 - F_0_390)), ++ pw_mf060_mf089 = _mm_setr_epi16(__4X2(F_0_298 - F_0_899, -F_0_899)), ++ pw_mf089_f060 = _mm_setr_epi16(__4X2(-F_0_899, F_1_501 - F_0_899)), ++ pw_mf050_mf256 = _mm_setr_epi16(__4X2(F_2_053 - F_2_562, -F_2_562)), ++ pw_mf256_f050 = _mm_setr_epi16(__4X2(-F_2_562, F_3_072 - F_2_562)), ++ pw_descale_p2x = _mm_set1_epi16(1 << (PASS1_BITS - 1)), ++ pd_descale_p1 = _mm_set1_epi32(1 << (DESCALE_P1 
- 1)), ++ pd_descale_p2 = _mm_set1_epi32(1 << (DESCALE_P2 - 1)); ++ ++ /* Pass 1: process rows */ ++ ++ row0 = VEC_LD(data + 0 * 8); ++ row1 = VEC_LD(data + 1 * 8); ++ row2 = VEC_LD(data + 2 * 8); ++ row3 = VEC_LD(data + 3 * 8); ++ row4 = VEC_LD(data + 4 * 8); ++ row5 = VEC_LD(data + 5 * 8); ++ row6 = VEC_LD(data + 6 * 8); ++ row7 = VEC_LD(data + 7 * 8); ++ ++ TRANSPOSE(row, col); ++ ++ tmp0 = _mm_add_epi16(col0, col7); ++ tmp7 = _mm_sub_epi16(col0, col7); ++ tmp1 = _mm_add_epi16(col1, col6); ++ tmp6 = _mm_sub_epi16(col1, col6); ++ tmp2 = _mm_add_epi16(col2, col5); ++ tmp5 = _mm_sub_epi16(col2, col5); ++ tmp3 = _mm_add_epi16(col3, col4); ++ tmp4 = _mm_sub_epi16(col3, col4); ++ ++ DO_FDCT_PASS1(); ++ ++ /* Pass 2: process columns */ ++ ++ TRANSPOSE(out, row); ++ ++ tmp0 = _mm_add_epi16(row0, row7); ++ tmp7 = _mm_sub_epi16(row0, row7); ++ tmp1 = _mm_add_epi16(row1, row6); ++ tmp6 = _mm_sub_epi16(row1, row6); ++ tmp2 = _mm_add_epi16(row2, row5); ++ tmp5 = _mm_sub_epi16(row2, row5); ++ tmp3 = _mm_add_epi16(row3, row4); ++ tmp4 = _mm_sub_epi16(row3, row4); ++ ++ DO_FDCT_PASS2(); ++ ++ VEC_ST(data + 0 * 8, out0); ++ VEC_ST(data + 1 * 8, out1); ++ VEC_ST(data + 2 * 8, out2); ++ VEC_ST(data + 3 * 8, out3); ++ VEC_ST(data + 4 * 8, out4); ++ VEC_ST(data + 5 * 8, out5); ++ VEC_ST(data + 6 * 8, out6); ++ VEC_ST(data + 7 * 8, out7); ++} +diff --git a/simd/e2k/jidctflt-e2k.c b/simd/e2k/jidctflt-e2k.c +new file mode 100644 +index 0000000..7682965 +--- /dev/null ++++ b/simd/e2k/jidctflt-e2k.c +@@ -0,0 +1,215 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved. ++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. ++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. 
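[The __4X2 constant pairs above (pw_f130_f054 and friends) exist so that one _mm_madd_epi16 over interleaved 16-bit inputs yields a*c0 + b*c1 per 32-bit lane, which is why the comments rewrite each pair of products into that form. A standalone scalar check that the rearranged data2/data6 expressions match the original factorization, using the FIX() constants defined above:

#include <assert.h>

int main(void)
{
  /* FIX() constants exactly as defined in jfdctint-e2k.c above */
  const int F_0_541 = 4433, F_0_765 = 6270, F_1_847 = 15137;
  int tmp12, tmp13 = 1234;
  for (tmp12 = -2048; tmp12 <= 2048; tmp12++) {
    /* Original factorization */
    int z1 = (tmp12 + tmp13) * F_0_541;
    int data2_ref = z1 + tmp13 * F_0_765;
    int data6_ref = z1 - tmp12 * F_1_847;
    /* madd form: each line is one a*c0 + b*c1 pair per lane */
    int data2 = tmp13 * (F_0_541 + F_0_765) + tmp12 * F_0_541;
    int data6 = tmp13 * F_0_541 + tmp12 * (F_0_541 - F_1_847);
    assert(data2 == data2_ref && data6 == data6_ref);
  }
  return 0;
}]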
++ */ ++ ++/* FLOAT INVERSE DCT */ ++ ++#include "jsimd_e2k.h" ++ ++#define DO_IDCT(in, out) { \ ++ /* Even part */ \ ++ \ ++ tmp10 = _mm_add_ps(in##0, in##4); \ ++ tmp11 = _mm_sub_ps(in##0, in##4); \ ++ \ ++ tmp13 = _mm_add_ps(in##2, in##6); \ ++ tmp12 = _mm_sub_ps(in##2, in##6); \ ++ tmp12 = _mm_sub_ps(_mm_mul_ps(tmp12, pd_f1414), tmp13); \ ++ \ ++ tmp0 = _mm_add_ps(tmp10, tmp13); \ ++ tmp3 = _mm_sub_ps(tmp10, tmp13); \ ++ tmp1 = _mm_add_ps(tmp11, tmp12); \ ++ tmp2 = _mm_sub_ps(tmp11, tmp12); \ ++ \ ++ /* Odd part */ \ ++ \ ++ z13 = _mm_add_ps(in##5, in##3); \ ++ z10 = _mm_sub_ps(in##5, in##3); \ ++ z11 = _mm_add_ps(in##1, in##7); \ ++ z12 = _mm_sub_ps(in##1, in##7); \ ++ \ ++ tmp7 = _mm_add_ps(z11, z13); \ ++ tmp11 = _mm_sub_ps(z11, z13); \ ++ tmp11 = _mm_mul_ps(tmp11, pd_f1414); \ ++ \ ++ z5 = _mm_mul_ps(_mm_add_ps(z10, z12), pd_f1847); \ ++ tmp10 = _mm_sub_ps(z5, _mm_mul_ps(z12, pd_f1082)); \ ++ tmp12 = _mm_sub_ps(z5, _mm_mul_ps(z10, pd_f2613)); \ ++ \ ++ tmp6 = _mm_sub_ps(tmp12, tmp7); \ ++ tmp5 = _mm_sub_ps(tmp11, tmp6); \ ++ tmp4 = _mm_sub_ps(tmp10, tmp5); \ ++ \ ++ out##0 = _mm_add_ps(tmp0, tmp7); \ ++ out##7 = _mm_sub_ps(tmp0, tmp7); \ ++ out##1 = _mm_add_ps(tmp1, tmp6); \ ++ out##6 = _mm_sub_ps(tmp1, tmp6); \ ++ out##2 = _mm_add_ps(tmp2, tmp5); \ ++ out##5 = _mm_sub_ps(tmp2, tmp5); \ ++ out##3 = _mm_add_ps(tmp3, tmp4); \ ++ out##4 = _mm_sub_ps(tmp3, tmp4); \ ++} ++ ++#define QUANT_MUL(a, b, c, d, l, lo, i) \ ++ out0 = _mm_srai_epi32(_mm_unpack##lo##_epi16(col##a, col##a), 16); \ ++ out1 = _mm_srai_epi32(_mm_unpack##lo##_epi16(col##b, col##b), 16); \ ++ out2 = _mm_srai_epi32(_mm_unpack##lo##_epi16(col##c, col##c), 16); \ ++ out3 = _mm_srai_epi32(_mm_unpack##lo##_epi16(col##d, col##d), 16); \ ++ l##a = _mm_cvtepi32_ps(out0); \ ++ l##b = _mm_cvtepi32_ps(out1); \ ++ l##c = _mm_cvtepi32_ps(out2); \ ++ l##d = _mm_cvtepi32_ps(out3); \ ++ l##a = _mm_mul_ps(l##a, _mm_load_ps(dct_table + a * 8 + i)); \ ++ l##b = _mm_mul_ps(l##b, _mm_load_ps(dct_table + b * 8 + i)); \ ++ l##c = _mm_mul_ps(l##c, _mm_load_ps(dct_table + c * 8 + i)); \ ++ l##d = _mm_mul_ps(l##d, _mm_load_ps(dct_table + d * 8 + i)); ++ ++ ++void jsimd_idct_float_e2k(void *dct_table_, JCOEFPTR coef_block, ++ JSAMPARRAY output_buf, JDIMENSION output_col) ++{ ++ float *dct_table = (float *)dct_table_; ++ ++ __m128i col0, col1, col2, col3, col4, col5, col6, col7, ++ out0, out1, out2, out3, out4, out5, out6, out7, row0, row1, row2, row3; ++ __m128 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, ++ tmp10, tmp11, tmp12, tmp13, z5, z10, z11, z12, z13; ++ __m128 l0, l1, l2, l3, l4, l5, l6, l7; ++ __m128 h0, h1, h2, h3, h4, h5, h6, h7; ++ __m128 x0, x1, x2, x3, x4, x5, x6, x7; ++ __m128 y0, y1, y2, y3, y4, y5, y6, y7; ++ ++ /* Constants */ ++ __m128 pd_f1414 = _mm_set1_ps(1.414213562f), ++ pd_f1847 = _mm_set1_ps(1.847759065f), ++ pd_f1082 = _mm_set1_ps(1.082392200f), ++ pd_f2613 = _mm_set1_ps(2.613125930f); ++ ++ /* Pass 1: process columns */ ++ ++ col0 = VEC_LD(coef_block + 0 * 8); ++ col1 = VEC_LD(coef_block + 1 * 8); ++ col2 = VEC_LD(coef_block + 2 * 8); ++ col3 = VEC_LD(coef_block + 3 * 8); ++ col4 = VEC_LD(coef_block + 4 * 8); ++ col5 = VEC_LD(coef_block + 5 * 8); ++ col6 = VEC_LD(coef_block + 6 * 8); ++ col7 = VEC_LD(coef_block + 7 * 8); ++ ++ out1 = _mm_or_si128(col1, col2); ++ out2 = _mm_or_si128(col3, col4); ++ out1 = _mm_or_si128(out1, out2); ++ out3 = _mm_or_si128(col5, col6); ++ out3 = _mm_or_si128(out3, col7); ++ out1 = _mm_or_si128(out1, out3); ++ ++ if (VEC_ISZERO(out1)) { ++ /* AC terms all zero */ ++ ++ out0 = 
_mm_srai_epi32(_mm_unpacklo_epi16(col0, col0), 16);
++ out1 = _mm_srai_epi32(_mm_unpackhi_epi16(col0, col0), 16);
++ tmp0 = _mm_cvtepi32_ps(out0);
++ tmp1 = _mm_cvtepi32_ps(out1);
++ tmp0 = _mm_mul_ps(tmp0, _mm_load_ps(dct_table));
++ tmp1 = _mm_mul_ps(tmp1, _mm_load_ps(dct_table + 4));
++
++ l0 = h0 = _mm_shuffle_ps(tmp0, tmp0, 0x00);
++ l1 = h1 = _mm_shuffle_ps(tmp0, tmp0, 0x55);
++ l2 = h2 = _mm_shuffle_ps(tmp0, tmp0, 0xaa);
++ l3 = h3 = _mm_shuffle_ps(tmp0, tmp0, 0xff);
++ l4 = h4 = _mm_shuffle_ps(tmp1, tmp1, 0x00);
++ l5 = h5 = _mm_shuffle_ps(tmp1, tmp1, 0x55);
++ l6 = h6 = _mm_shuffle_ps(tmp1, tmp1, 0xaa);
++ l7 = h7 = _mm_shuffle_ps(tmp1, tmp1, 0xff);
++
++ } else {
++
++ QUANT_MUL(0, 2, 4, 6, l, lo, 0)
++ QUANT_MUL(1, 3, 5, 7, l, lo, 0)
++ DO_IDCT(l, x);
++
++ QUANT_MUL(0, 2, 4, 6, h, hi, 4)
++ QUANT_MUL(1, 3, 5, 7, h, hi, 4)
++ DO_IDCT(h, y);
++
++ TRANSPOSE_FLOAT(x0, x1, x2, x3, l0, l1, l2, l3)
++ TRANSPOSE_FLOAT(x4, x5, x6, x7, h0, h1, h2, h3)
++ TRANSPOSE_FLOAT(y0, y1, y2, y3, l4, l5, l6, l7)
++ TRANSPOSE_FLOAT(y4, y5, y6, y7, h4, h5, h6, h7)
++ }
++
++ /* Pass 2: process rows */
++
++ DO_IDCT(l, x);
++ DO_IDCT(h, y);
++
++#ifdef JSIMD_SAME_ROUNDING
++#define OUT_ROUND(i) \
++ tmp0 = _mm_add_ps(_mm_mul_ps(x##i, pd_f0125), pd_cj_rnd); \
++ tmp1 = _mm_add_ps(_mm_mul_ps(y##i, pd_f0125), pd_cj_rnd); \
++ out##i = _mm_packs_epi32(_mm_cvttps_epi32(tmp0), _mm_cvttps_epi32(tmp1));
++
++ {
++ __m128 pd_cj_rnd = _mm_set1_ps(0.5f + CENTERJSAMPLE),
++ pd_f0125 = _mm_set1_ps(0.125f);
++
++ OUT_ROUND(0) OUT_ROUND(1)
++ OUT_ROUND(2) OUT_ROUND(3)
++ OUT_ROUND(4) OUT_ROUND(5)
++ OUT_ROUND(6) OUT_ROUND(7)
++ }
++ row0 = _mm_packus_epi16(out0, out1);
++ row1 = _mm_packus_epi16(out2, out3);
++ row2 = _mm_packus_epi16(out4, out5);
++ row3 = _mm_packus_epi16(out6, out7);
++
++ TRANSPOSE8(row, col) TRANSPOSE8(col, row) TRANSPOSE8(row, col)
++#else /* faster, differs slightly in rounding */
++#define OUT_ROUND(i, a, b) out##i = _mm_blendv_epi8( \
++ _mm_slli_epi32(_mm_castps_si128(_mm_add_ps(b, pd_round)), 16), \
++ _mm_castps_si128(_mm_add_ps(a, pd_round)), pd_mask);
++
++ {
++ __m128i pd_mask = _mm_set1_epi32(0xffff);
++ __m128 pd_round = _mm_set1_ps((3 << 22 | CENTERJSAMPLE) * 8);
++
++ OUT_ROUND(0, x0, x4) OUT_ROUND(1, y0, y4)
++ OUT_ROUND(2, x1, x5) OUT_ROUND(3, y1, y5)
++ OUT_ROUND(4, x2, x6) OUT_ROUND(5, y2, y6)
++ OUT_ROUND(6, x3, x7) OUT_ROUND(7, y3, y7)
++ }
++ row0 = _mm_packus_epi16(out0, out1);
++ row1 = _mm_packus_epi16(out2, out3);
++ row2 = _mm_packus_epi16(out4, out5);
++ row3 = _mm_packus_epi16(out6, out7);
++
++ TRANSPOSE8(row, out) TRANSPOSE8(out, col)
++#endif
++ VEC_STL(output_buf[0] + output_col, col0);
++ VEC_STH(output_buf[1] + output_col, col0);
++ VEC_STL(output_buf[2] + output_col, col1);
++ VEC_STH(output_buf[3] + output_col, col1);
++ VEC_STL(output_buf[4] + output_col, col2);
++ VEC_STH(output_buf[5] + output_col, col2);
++ VEC_STL(output_buf[6] + output_col, col3);
++ VEC_STH(output_buf[7] + output_col, col3);
++}
+diff --git a/simd/e2k/jidctfst-e2k.c b/simd/e2k/jidctfst-e2k.c
+new file mode 100644
+index 0000000..18bc425
+--- /dev/null
++++ b/simd/e2k/jidctfst-e2k.c
+@@ -0,0 +1,187 @@
++/*
++ * Elbrus optimizations for libjpeg-turbo
++ *
++ * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved.
++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd
++ *
++ * This software is provided 'as-is', without any express or implied
++ * warranty. In no event will the authors be held liable for any damages
++ * arising from the use of this software. 
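[The non-JSIMD_SAME_ROUNDING path above uses the classic magic-number float-to-int trick: adding a large power-of-two constant forces the rounded integer into the low mantissa bits, which _mm_castps_si128 then exposes without a convert instruction. The patch's pd_round additionally folds in the 8x AAN output scaling and the CENTERJSAMPLE bias; shown here is only the core trick, standalone, assuming IEEE-754 float, round-to-nearest, and two's-complement narrowing as in the SIMD lanes:

#include <assert.h>
#include <stdint.h>
#include <string.h>

int main(void)
{
  float magic = (float)(3 << 22);       /* 1.5 * 2^23, ulp is exactly 1.0 */
  int v;
  for (v = -1000; v <= 1000; v++) {
    float f = magic + (float)v;         /* rounds v into the mantissa */
    uint32_t bits;
    memcpy(&bits, &f, sizeof(bits));    /* scalar _mm_castps_si128 */
    assert((int16_t)bits == v);         /* low 16 mantissa bits hold v */
  }
  return 0;
}]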
++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. ++ */ ++ ++/* FAST INTEGER INVERSE DCT */ ++ ++#include "jsimd_e2k.h" ++ ++ ++#define F_1_082 277 /* FIX(1.082392200) */ ++#define F_1_414 362 /* FIX(1.414213562) */ ++#define F_1_847 473 /* FIX(1.847759065) */ ++#define F_2_613 669 /* FIX(2.613125930) */ ++#define F_1_613 (F_2_613 - 256) /* FIX(2.613125930) - FIX(1) */ ++ ++#define CONST_BITS 8 ++#define PASS1_BITS 2 ++#define PRE_MULTIPLY_SCALE_BITS 2 ++#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) ++ ++ ++#define DO_IDCT(in) { \ ++ /* Even part */ \ ++ \ ++ tmp10 = _mm_add_epi16(in##0, in##4); \ ++ tmp11 = _mm_sub_epi16(in##0, in##4); \ ++ tmp13 = _mm_add_epi16(in##2, in##6); \ ++ \ ++ tmp12 = _mm_sub_epi16(in##2, in##6); \ ++ tmp12 = _mm_slli_epi16(tmp12, PRE_MULTIPLY_SCALE_BITS); \ ++ tmp12 = _mm_mulhi_epi16(tmp12, pw_F1414); \ ++ tmp12 = _mm_sub_epi16(tmp12, tmp13); \ ++ \ ++ tmp0 = _mm_add_epi16(tmp10, tmp13); \ ++ tmp3 = _mm_sub_epi16(tmp10, tmp13); \ ++ tmp1 = _mm_add_epi16(tmp11, tmp12); \ ++ tmp2 = _mm_sub_epi16(tmp11, tmp12); \ ++ \ ++ /* Odd part */ \ ++ \ ++ z13 = _mm_add_epi16(in##5, in##3); \ ++ z10 = _mm_sub_epi16(in##5, in##3); \ ++ z10s = _mm_slli_epi16(z10, PRE_MULTIPLY_SCALE_BITS); \ ++ z11 = _mm_add_epi16(in##1, in##7); \ ++ z12s = _mm_sub_epi16(in##1, in##7); \ ++ z12s = _mm_slli_epi16(z12s, PRE_MULTIPLY_SCALE_BITS); \ ++ \ ++ tmp11 = _mm_sub_epi16(z11, z13); \ ++ tmp11 = _mm_slli_epi16(tmp11, PRE_MULTIPLY_SCALE_BITS); \ ++ tmp11 = _mm_mulhi_epi16(tmp11, pw_F1414); \ ++ \ ++ tmp7 = _mm_add_epi16(z11, z13); \ ++ \ ++ /* To avoid overflow... 
\ ++ * \ ++ * (Original) \ ++ * tmp12 = -2.613125930 * z10 + z5; \ ++ * \ ++ * (This implementation) \ ++ * tmp12 = (-1.613125930 - 1) * z10 + z5; \ ++ * = -1.613125930 * z10 - z10 + z5; \ ++ */ \ ++ \ ++ z5 = _mm_add_epi16(z10s, z12s); \ ++ z5 = _mm_mulhi_epi16(z5, pw_F1847); \ ++ \ ++ tmp10 = _mm_mulhi_epi16(z12s, pw_F1082); \ ++ tmp10 = _mm_sub_epi16(tmp10, z5); \ ++ tmp12 = _mm_add_epi16(_mm_mulhi_epi16(z10s, pw_MF1613), z5); \ ++ tmp12 = _mm_sub_epi16(tmp12, z10); \ ++ \ ++ tmp6 = _mm_sub_epi16(tmp12, tmp7); \ ++ tmp5 = _mm_sub_epi16(tmp11, tmp6); \ ++ tmp4 = _mm_add_epi16(tmp10, tmp5); \ ++ \ ++ out0 = _mm_add_epi16(tmp0, tmp7); \ ++ out1 = _mm_add_epi16(tmp1, tmp6); \ ++ out2 = _mm_add_epi16(tmp2, tmp5); \ ++ out3 = _mm_sub_epi16(tmp3, tmp4); \ ++ out4 = _mm_add_epi16(tmp3, tmp4); \ ++ out5 = _mm_sub_epi16(tmp2, tmp5); \ ++ out6 = _mm_sub_epi16(tmp1, tmp6); \ ++ out7 = _mm_sub_epi16(tmp0, tmp7); \ ++} ++ ++ ++void jsimd_idct_ifast_e2k(void *dct_table_, JCOEFPTR coef_block, ++ JSAMPARRAY output_buf, JDIMENSION output_col) ++{ ++ short *dct_table = (short *)dct_table_; ++ ++ __m128i row0, row1, row2, row3, row4, row5, row6, row7, ++ col0, col1, col2, col3, col4, col5, col6, col7, ++ quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7, ++ tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13, ++ z5, z10, z10s, z11, z12s, z13, ++ out0, out1, out2, out3, out4, out5, out6, out7; ++ ++ /* Constants */ ++ __m128i pw_F1414 = _mm_set1_epi16(F_1_414 << CONST_SHIFT), ++ pw_F1847 = _mm_set1_epi16(F_1_847 << CONST_SHIFT), ++ pw_MF1613 = _mm_set1_epi16(-F_1_613 << CONST_SHIFT), ++ pw_F1082 = _mm_set1_epi16(F_1_082 << CONST_SHIFT); ++ ++ /* Pass 1: process columns */ ++ ++ col0 = VEC_LD(coef_block + 0 * 8); ++ col1 = VEC_LD(coef_block + 1 * 8); ++ col2 = VEC_LD(coef_block + 2 * 8); ++ col3 = VEC_LD(coef_block + 3 * 8); ++ col4 = VEC_LD(coef_block + 4 * 8); ++ col5 = VEC_LD(coef_block + 5 * 8); ++ col6 = VEC_LD(coef_block + 6 * 8); ++ col7 = VEC_LD(coef_block + 7 * 8); ++ ++ tmp1 = _mm_or_si128(col1, col2); ++ tmp2 = _mm_or_si128(col3, col4); ++ tmp1 = _mm_or_si128(tmp1, tmp2); ++ tmp3 = _mm_or_si128(col5, col6); ++ tmp3 = _mm_or_si128(tmp3, col7); ++ tmp1 = _mm_or_si128(tmp1, tmp3); ++ ++ quant0 = VEC_LD(dct_table); ++ col0 = _mm_mullo_epi16(col0, quant0); ++ ++ if (VEC_ISZERO(tmp1)) { ++ /* AC terms all zero */ ++ ++ IDCT_SPLAT8(col0); ++ ++ } else { ++ ++ quant1 = VEC_LD(dct_table + 1 * 8); ++ quant2 = VEC_LD(dct_table + 2 * 8); ++ quant3 = VEC_LD(dct_table + 3 * 8); ++ quant4 = VEC_LD(dct_table + 4 * 8); ++ quant5 = VEC_LD(dct_table + 5 * 8); ++ quant6 = VEC_LD(dct_table + 6 * 8); ++ quant7 = VEC_LD(dct_table + 7 * 8); ++ ++ col1 = _mm_mullo_epi16(col1, quant1); ++ col2 = _mm_mullo_epi16(col2, quant2); ++ col3 = _mm_mullo_epi16(col3, quant3); ++ col4 = _mm_mullo_epi16(col4, quant4); ++ col5 = _mm_mullo_epi16(col5, quant5); ++ col6 = _mm_mullo_epi16(col6, quant6); ++ col7 = _mm_mullo_epi16(col7, quant7); ++ ++ DO_IDCT(col); ++ ++ TRANSPOSE(out, row); ++ } ++ ++ /* Pass 2: process rows */ ++ ++ DO_IDCT(row); ++ ++ out0 = _mm_srai_epi16(out0, PASS1_BITS + 3); ++ out1 = _mm_srai_epi16(out1, PASS1_BITS + 3); ++ out2 = _mm_srai_epi16(out2, PASS1_BITS + 3); ++ out3 = _mm_srai_epi16(out3, PASS1_BITS + 3); ++ out4 = _mm_srai_epi16(out4, PASS1_BITS + 3); ++ out5 = _mm_srai_epi16(out5, PASS1_BITS + 3); ++ out6 = _mm_srai_epi16(out6, PASS1_BITS + 3); ++ out7 = _mm_srai_epi16(out7, PASS1_BITS + 3); ++ ++ IDCT_SAVE(); ++} +diff --git a/simd/e2k/jidctint-e2k.c 
b/simd/e2k/jidctint-e2k.c +new file mode 100644 +index 0000000..7bb79c0 +--- /dev/null ++++ b/simd/e2k/jidctint-e2k.c +@@ -0,0 +1,294 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2014-2015, 2020, D. R. Commander. All Rights Reserved. ++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. ++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. ++ */ ++ ++/* ACCURATE INTEGER INVERSE DCT */ ++ ++#include "jsimd_e2k.h" ++ ++ ++#define F_0_298 2446 /* FIX(0.298631336) */ ++#define F_0_390 3196 /* FIX(0.390180644) */ ++#define F_0_541 4433 /* FIX(0.541196100) */ ++#define F_0_765 6270 /* FIX(0.765366865) */ ++#define F_0_899 7373 /* FIX(0.899976223) */ ++#define F_1_175 9633 /* FIX(1.175875602) */ ++#define F_1_501 12299 /* FIX(1.501321110) */ ++#define F_1_847 15137 /* FIX(1.847759065) */ ++#define F_1_961 16069 /* FIX(1.961570560) */ ++#define F_2_053 16819 /* FIX(2.053119869) */ ++#define F_2_562 20995 /* FIX(2.562915447) */ ++#define F_3_072 25172 /* FIX(3.072711026) */ ++ ++#define CONST_BITS 13 ++#define PASS1_BITS 2 ++#define DESCALE_P1 (CONST_BITS - PASS1_BITS) ++#define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3) ++ ++ ++#define DO_IDCT(in, PASS) { \ ++ /* Even part \ ++ * \ ++ * (Original) \ ++ * z1 = (z2 + z3) * 0.541196100; \ ++ * tmp2 = z1 + z3 * -1.847759065; \ ++ * tmp3 = z1 + z2 * 0.765366865; \ ++ * \ ++ * (This implementation) \ ++ * tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); \ ++ * tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; \ ++ */ \ ++ \ ++ in##26l = _mm_unpacklo_epi16(in##2, in##6); \ ++ in##26h = _mm_unpackhi_epi16(in##2, in##6); \ ++ \ ++ tmp3l = _mm_madd_epi16(in##26l, pw_f130_f054); \ ++ tmp3h = _mm_madd_epi16(in##26h, pw_f130_f054); \ ++ tmp2l = _mm_madd_epi16(in##26l, pw_f054_mf130); \ ++ tmp2h = _mm_madd_epi16(in##26h, pw_f054_mf130); \ ++ \ ++ tmp0 = _mm_add_epi16(in##0, in##4); \ ++ tmp1 = _mm_sub_epi16(in##0, in##4); \ ++ \ ++ tmp0l = _mm_unpacklo_epi16(pw_zero, tmp0); \ ++ tmp0h = _mm_unpackhi_epi16(pw_zero, tmp0); \ ++ tmp0l = _mm_srai_epi32(tmp0l, 16 - CONST_BITS); \ ++ tmp0h = _mm_srai_epi32(tmp0h, 16 - CONST_BITS); \ ++ tmp0l = _mm_add_epi32(tmp0l, pd_descale_p##PASS); \ ++ tmp0h = _mm_add_epi32(tmp0h, pd_descale_p##PASS); \ ++ \ ++ tmp10l = _mm_add_epi32(tmp0l, tmp3l); \ ++ tmp10h = _mm_add_epi32(tmp0h, tmp3h); \ ++ tmp13l = _mm_sub_epi32(tmp0l, tmp3l); \ ++ tmp13h = _mm_sub_epi32(tmp0h, tmp3h); \ ++ \ ++ tmp1l = _mm_unpacklo_epi16(pw_zero, tmp1); \ ++ tmp1h = _mm_unpackhi_epi16(pw_zero, tmp1); \ ++ tmp1l = _mm_srai_epi32(tmp1l, 16 - CONST_BITS); \ ++ tmp1h = _mm_srai_epi32(tmp1h, 16 - CONST_BITS); \ ++ tmp1l = _mm_add_epi32(tmp1l, pd_descale_p##PASS); \ ++ tmp1h = 
_mm_add_epi32(tmp1h, pd_descale_p##PASS); \ ++ \ ++ tmp11l = _mm_add_epi32(tmp1l, tmp2l); \ ++ tmp11h = _mm_add_epi32(tmp1h, tmp2h); \ ++ tmp12l = _mm_sub_epi32(tmp1l, tmp2l); \ ++ tmp12h = _mm_sub_epi32(tmp1h, tmp2h); \ ++ \ ++ /* Odd part */ \ ++ \ ++ z3 = _mm_add_epi16(in##3, in##7); \ ++ z4 = _mm_add_epi16(in##1, in##5); \ ++ \ ++ /* (Original) \ ++ * z5 = (z3 + z4) * 1.175875602; \ ++ * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; \ ++ * z3 += z5; z4 += z5; \ ++ * \ ++ * (This implementation) \ ++ * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \ ++ * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \ ++ */ \ ++ \ ++ z34l = _mm_unpacklo_epi16(z3, z4); \ ++ z34h = _mm_unpackhi_epi16(z3, z4); \ ++ \ ++ z3l = _mm_madd_epi16(z34l, pw_mf078_f117); \ ++ z3h = _mm_madd_epi16(z34h, pw_mf078_f117); \ ++ z4l = _mm_madd_epi16(z34l, pw_f117_f078); \ ++ z4h = _mm_madd_epi16(z34h, pw_f117_f078); \ ++ \ ++ /* (Original) \ ++ * z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; \ ++ * tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; \ ++ * tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; \ ++ * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; \ ++ * tmp0 += z1 + z3; tmp1 += z2 + z4; \ ++ * tmp2 += z2 + z3; tmp3 += z1 + z4; \ ++ * \ ++ * (This implementation) \ ++ * tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; \ ++ * tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; \ ++ * tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); \ ++ * tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); \ ++ * tmp0 += z3; tmp1 += z4; \ ++ * tmp2 += z3; tmp3 += z4; \ ++ */ \ ++ \ ++ in##71l = _mm_unpacklo_epi16(in##7, in##1); \ ++ in##71h = _mm_unpackhi_epi16(in##7, in##1); \ ++ \ ++ tmp0l = _mm_add_epi32(_mm_madd_epi16(in##71l, pw_mf060_mf089), z3l); \ ++ tmp0h = _mm_add_epi32(_mm_madd_epi16(in##71h, pw_mf060_mf089), z3h); \ ++ tmp3l = _mm_add_epi32(_mm_madd_epi16(in##71l, pw_mf089_f060), z4l); \ ++ tmp3h = _mm_add_epi32(_mm_madd_epi16(in##71h, pw_mf089_f060), z4h); \ ++ \ ++ in##53l = _mm_unpacklo_epi16(in##5, in##3); \ ++ in##53h = _mm_unpackhi_epi16(in##5, in##3); \ ++ \ ++ tmp1l = _mm_add_epi32(_mm_madd_epi16(in##53l, pw_mf050_mf256), z4l); \ ++ tmp1h = _mm_add_epi32(_mm_madd_epi16(in##53h, pw_mf050_mf256), z4h); \ ++ tmp2l = _mm_add_epi32(_mm_madd_epi16(in##53l, pw_mf256_f050), z3l); \ ++ tmp2h = _mm_add_epi32(_mm_madd_epi16(in##53h, pw_mf256_f050), z3h); \ ++ \ ++ /* Final output stage */ \ ++ \ ++ out0l = _mm_add_epi32(tmp10l, tmp3l); \ ++ out0h = _mm_add_epi32(tmp10h, tmp3h); \ ++ out7l = _mm_sub_epi32(tmp10l, tmp3l); \ ++ out7h = _mm_sub_epi32(tmp10h, tmp3h); \ ++ \ ++ out0l = _mm_srai_epi32(out0l, DESCALE_P##PASS); \ ++ out0h = _mm_srai_epi32(out0h, DESCALE_P##PASS); \ ++ out7l = _mm_srai_epi32(out7l, DESCALE_P##PASS); \ ++ out7h = _mm_srai_epi32(out7h, DESCALE_P##PASS); \ ++ \ ++ out0 = _mm_packs_epi32(out0l, out0h); \ ++ out7 = _mm_packs_epi32(out7l, out7h); \ ++ \ ++ out1l = _mm_add_epi32(tmp11l, tmp2l); \ ++ out1h = _mm_add_epi32(tmp11h, tmp2h); \ ++ out6l = _mm_sub_epi32(tmp11l, tmp2l); \ ++ out6h = _mm_sub_epi32(tmp11h, tmp2h); \ ++ \ ++ out1l = _mm_srai_epi32(out1l, DESCALE_P##PASS); \ ++ out1h = _mm_srai_epi32(out1h, DESCALE_P##PASS); \ ++ out6l = _mm_srai_epi32(out6l, DESCALE_P##PASS); \ ++ out6h = _mm_srai_epi32(out6h, DESCALE_P##PASS); \ ++ \ ++ out1 = _mm_packs_epi32(out1l, out1h); \ ++ out6 = _mm_packs_epi32(out6l, out6h); \ ++ \ ++ out2l = _mm_add_epi32(tmp12l, tmp1l); \ ++ out2h = _mm_add_epi32(tmp12h, tmp1h); \ ++ out5l 
= _mm_sub_epi32(tmp12l, tmp1l); \ ++ out5h = _mm_sub_epi32(tmp12h, tmp1h); \ ++ \ ++ out2l = _mm_srai_epi32(out2l, DESCALE_P##PASS); \ ++ out2h = _mm_srai_epi32(out2h, DESCALE_P##PASS); \ ++ out5l = _mm_srai_epi32(out5l, DESCALE_P##PASS); \ ++ out5h = _mm_srai_epi32(out5h, DESCALE_P##PASS); \ ++ \ ++ out2 = _mm_packs_epi32(out2l, out2h); \ ++ out5 = _mm_packs_epi32(out5l, out5h); \ ++ \ ++ out3l = _mm_add_epi32(tmp13l, tmp0l); \ ++ out3h = _mm_add_epi32(tmp13h, tmp0h); \ ++ out4l = _mm_sub_epi32(tmp13l, tmp0l); \ ++ out4h = _mm_sub_epi32(tmp13h, tmp0h); \ ++ \ ++ out3l = _mm_srai_epi32(out3l, DESCALE_P##PASS); \ ++ out3h = _mm_srai_epi32(out3h, DESCALE_P##PASS); \ ++ out4l = _mm_srai_epi32(out4l, DESCALE_P##PASS); \ ++ out4h = _mm_srai_epi32(out4h, DESCALE_P##PASS); \ ++ \ ++ out3 = _mm_packs_epi32(out3l, out3h); \ ++ out4 = _mm_packs_epi32(out4l, out4h); \ ++} ++ ++ ++void jsimd_idct_islow_e2k(void *dct_table_, JCOEFPTR coef_block, ++ JSAMPARRAY output_buf, JDIMENSION output_col) ++{ ++ short *dct_table = (short *)dct_table_; ++ ++ __m128i row0, row1, row2, row3, row4, row5, row6, row7, ++ col0, col1, col2, col3, col4, col5, col6, col7, ++ quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7, ++ tmp0, tmp1, tmp2, tmp3, z3, z4, ++ z34l, z34h, col71l, col71h, col26l, col26h, col53l, col53h, ++ row71l, row71h, row26l, row26h, row53l, row53h, ++ out0, out1, out2, out3, out4, out5, out6, out7; ++ __m128i tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h, ++ tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h, ++ z3l, z3h, z4l, z4h, ++ out0l, out0h, out1l, out1h, out2l, out2h, out3l, out3h, out4l, out4h, ++ out5l, out5h, out6l, out6h, out7l, out7h; ++ ++ /* Constants */ ++ __m128i pw_zero = _mm_setzero_si128(), ++ pw_f130_f054 = _mm_setr_epi16(__4X2(F_0_541 + F_0_765, F_0_541)), ++ pw_f054_mf130 = _mm_setr_epi16(__4X2(F_0_541, F_0_541 - F_1_847)), ++ pw_mf078_f117 = _mm_setr_epi16(__4X2(F_1_175 - F_1_961, F_1_175)), ++ pw_f117_f078 = _mm_setr_epi16(__4X2(F_1_175, F_1_175 - F_0_390)), ++ pw_mf060_mf089 = _mm_setr_epi16(__4X2(F_0_298 - F_0_899, -F_0_899)), ++ pw_mf089_f060 = _mm_setr_epi16(__4X2(-F_0_899, F_1_501 - F_0_899)), ++ pw_mf050_mf256 = _mm_setr_epi16(__4X2(F_2_053 - F_2_562, -F_2_562)), ++ pw_mf256_f050 = _mm_setr_epi16(__4X2(-F_2_562, F_3_072 - F_2_562)), ++ pd_descale_p1 = _mm_set1_epi32(1 << (DESCALE_P1 - 1)), ++ pd_descale_p2 = _mm_set1_epi32(1 << (DESCALE_P2 - 1)); ++ ++ /* Pass 1: process columns */ ++ ++ col0 = VEC_LD(coef_block + 0 * 8); ++ col1 = VEC_LD(coef_block + 1 * 8); ++ col2 = VEC_LD(coef_block + 2 * 8); ++ col3 = VEC_LD(coef_block + 3 * 8); ++ col4 = VEC_LD(coef_block + 4 * 8); ++ col5 = VEC_LD(coef_block + 5 * 8); ++ col6 = VEC_LD(coef_block + 6 * 8); ++ col7 = VEC_LD(coef_block + 7 * 8); ++ ++ tmp1 = _mm_or_si128(col1, col2); ++ tmp2 = _mm_or_si128(col3, col4); ++ tmp1 = _mm_or_si128(tmp1, tmp2); ++ tmp3 = _mm_or_si128(col5, col6); ++ tmp3 = _mm_or_si128(tmp3, col7); ++ tmp1 = _mm_or_si128(tmp1, tmp3); ++ ++ quant0 = VEC_LD(dct_table); ++ col0 = _mm_mullo_epi16(col0, quant0); ++ ++ if (VEC_ISZERO(tmp1)) { ++ /* AC terms all zero */ ++ ++ col0 = _mm_slli_epi16(col0, PASS1_BITS); ++ IDCT_SPLAT8(col0); ++ ++ } else { ++ ++ quant1 = VEC_LD(dct_table + 1 * 8); ++ quant2 = VEC_LD(dct_table + 2 * 8); ++ quant3 = VEC_LD(dct_table + 3 * 8); ++ quant4 = VEC_LD(dct_table + 4 * 8); ++ quant5 = VEC_LD(dct_table + 5 * 8); ++ quant6 = VEC_LD(dct_table + 6 * 8); ++ quant7 = VEC_LD(dct_table + 7 * 8); ++ ++ col1 = _mm_mullo_epi16(col1, quant1); ++ col2 = 
_mm_mullo_epi16(col2, quant2); ++ col3 = _mm_mullo_epi16(col3, quant3); ++ col4 = _mm_mullo_epi16(col4, quant4); ++ col5 = _mm_mullo_epi16(col5, quant5); ++ col6 = _mm_mullo_epi16(col6, quant6); ++ col7 = _mm_mullo_epi16(col7, quant7); ++ ++ DO_IDCT(col, 1); ++ ++ TRANSPOSE(out, row); ++ } ++ ++ /* Pass 2: process rows */ ++ ++ DO_IDCT(row, 2); ++ ++ IDCT_SAVE(); ++} +diff --git a/simd/e2k/jquantf-e2k.c b/simd/e2k/jquantf-e2k.c +new file mode 100644 +index 0000000..106e99a +--- /dev/null ++++ b/simd/e2k/jquantf-e2k.c +@@ -0,0 +1,121 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved. ++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. ++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. ++ */ ++ ++/* FLOAT QUANTIZATION AND SAMPLE CONVERSION */ ++ ++#include "jsimd_e2k.h" ++ ++#define LOAD_ROW(row) in##row = VEC_LD8(sample_data[row] + start_col) ++#define STORE_ROW(i) \ ++ in0 = _mm_unpacklo_epi16(out##i, pb_zero); \ ++ in1 = _mm_unpackhi_epi16(out##i, pb_zero); \ ++ in0 = _mm_sub_epi32(in0, pd_cj); \ ++ in1 = _mm_sub_epi32(in1, pd_cj); \ ++ _mm_storeu_ps(workspace + i * 8, _mm_cvtepi32_ps(in0)); \ ++ _mm_storeu_ps(workspace + i * 8 + 4, _mm_cvtepi32_ps(in1)); ++ ++void jsimd_convsamp_float_e2k(JSAMPARRAY sample_data, JDIMENSION start_col, ++ FAST_FLOAT *workspace) ++{ ++ __m128i in0, in1, in2, in3, in4, in5, in6, in7; ++ __m128i out0, out1, out2, out3, out4, out5, out6, out7; ++ ++ /* Constants */ ++ __m128i pd_cj = _mm_set1_epi32(CENTERJSAMPLE), ++ pb_zero = _mm_setzero_si128(); ++ ++ LOAD_ROW(0); ++ LOAD_ROW(1); ++ LOAD_ROW(2); ++ LOAD_ROW(3); ++ LOAD_ROW(4); ++ LOAD_ROW(5); ++ LOAD_ROW(6); ++ LOAD_ROW(7); ++ ++ out0 = _mm_unpacklo_epi8(in0, pb_zero); ++ out1 = _mm_unpacklo_epi8(in1, pb_zero); ++ out2 = _mm_unpacklo_epi8(in2, pb_zero); ++ out3 = _mm_unpacklo_epi8(in3, pb_zero); ++ out4 = _mm_unpacklo_epi8(in4, pb_zero); ++ out5 = _mm_unpacklo_epi8(in5, pb_zero); ++ out6 = _mm_unpacklo_epi8(in6, pb_zero); ++ out7 = _mm_unpacklo_epi8(in7, pb_zero); ++ ++ STORE_ROW(0) ++ STORE_ROW(1) ++ STORE_ROW(2) ++ STORE_ROW(3) ++ STORE_ROW(4) ++ STORE_ROW(5) ++ STORE_ROW(6) ++ STORE_ROW(7) ++} ++ ++void jsimd_quantize_float_e2k(JCOEFPTR coef_block, FAST_FLOAT *divisors, ++ FAST_FLOAT *workspace) ++{ ++ int i = 0; ++ __m128 row0, row1, row2, row3, recip0, recip1, recip2, recip3; ++ __m128i out0, out1; ++#ifdef JSIMD_SAME_ROUNDING ++ __m128 pd_f16k5 = _mm_set1_ps(16384.5f); ++ __m128i pw_m16k = _mm_set1_epi16(-16384); ++#endif ++ ++ PRAGMA_E2K("ivdep") ++ for (; i < 4; i++, workspace += 16, divisors += 16, coef_block += 16) { ++ row0 = _mm_loadu_ps(workspace + 0 * 4); ++ row1 = _mm_loadu_ps(workspace + 1 * 4); ++ 
row2 = _mm_loadu_ps(workspace + 2 * 4); ++ row3 = _mm_loadu_ps(workspace + 3 * 4); ++ ++ recip0 = _mm_loadu_ps(divisors + 0 * 4); ++ recip1 = _mm_loadu_ps(divisors + 1 * 4); ++ recip2 = _mm_loadu_ps(divisors + 2 * 4); ++ recip3 = _mm_loadu_ps(divisors + 3 * 4); ++ ++ row0 = _mm_mul_ps(row0, recip0); ++ row1 = _mm_mul_ps(row1, recip1); ++ row2 = _mm_mul_ps(row2, recip2); ++ row3 = _mm_mul_ps(row3, recip3); ++ ++#ifdef JSIMD_SAME_ROUNDING ++ row0 = _mm_add_ps(row0, pd_f16k5); ++ row1 = _mm_add_ps(row1, pd_f16k5); ++ row2 = _mm_add_ps(row2, pd_f16k5); ++ row3 = _mm_add_ps(row3, pd_f16k5); ++ ++ out0 = _mm_packs_epi32(_mm_cvttps_epi32(row0), _mm_cvttps_epi32(row1)); ++ out1 = _mm_packs_epi32(_mm_cvttps_epi32(row2), _mm_cvttps_epi32(row3)); ++ ++ out0 = _mm_add_epi16(out0, pw_m16k); ++ out1 = _mm_add_epi16(out1, pw_m16k); ++#else ++ out0 = _mm_packs_epi32(_mm_cvtps_epi32(row0), _mm_cvtps_epi32(row1)); ++ out1 = _mm_packs_epi32(_mm_cvtps_epi32(row2), _mm_cvtps_epi32(row3)); ++#endif ++ VEC_ST(coef_block, out0); ++ VEC_ST(coef_block + 8, out1); ++ } ++} +diff --git a/simd/e2k/jquanti-e2k.c b/simd/e2k/jquanti-e2k.c +new file mode 100644 +index 0000000..a3e1ff1 +--- /dev/null ++++ b/simd/e2k/jquanti-e2k.c +@@ -0,0 +1,178 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved. ++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. ++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. 
++ */
++
++/* INTEGER QUANTIZATION AND SAMPLE CONVERSION */
++
++#include "jsimd_e2k.h"
++
++#define LOAD_ROW(row) in##row = VEC_LD8(sample_data[row] + start_col)
++
++void jsimd_convsamp_e2k(JSAMPARRAY sample_data, JDIMENSION start_col,
++ DCTELEM *workspace)
++{
++ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
++ __m128i out0, out1, out2, out3, out4, out5, out6, out7;
++
++ /* Constants */
++ __m128i pw_cj = _mm_set1_epi16(CENTERJSAMPLE),
++ pb_zero = _mm_setzero_si128();
++
++ LOAD_ROW(0);
++ LOAD_ROW(1);
++ LOAD_ROW(2);
++ LOAD_ROW(3);
++ LOAD_ROW(4);
++ LOAD_ROW(5);
++ LOAD_ROW(6);
++ LOAD_ROW(7);
++
++ out0 = _mm_unpacklo_epi8(in0, pb_zero);
++ out1 = _mm_unpacklo_epi8(in1, pb_zero);
++ out2 = _mm_unpacklo_epi8(in2, pb_zero);
++ out3 = _mm_unpacklo_epi8(in3, pb_zero);
++ out4 = _mm_unpacklo_epi8(in4, pb_zero);
++ out5 = _mm_unpacklo_epi8(in5, pb_zero);
++ out6 = _mm_unpacklo_epi8(in6, pb_zero);
++ out7 = _mm_unpacklo_epi8(in7, pb_zero);
++
++ out0 = _mm_sub_epi16(out0, pw_cj);
++ out1 = _mm_sub_epi16(out1, pw_cj);
++ out2 = _mm_sub_epi16(out2, pw_cj);
++ out3 = _mm_sub_epi16(out3, pw_cj);
++ out4 = _mm_sub_epi16(out4, pw_cj);
++ out5 = _mm_sub_epi16(out5, pw_cj);
++ out6 = _mm_sub_epi16(out6, pw_cj);
++ out7 = _mm_sub_epi16(out7, pw_cj);
++
++ VEC_ST(workspace + 0 * 8, out0);
++ VEC_ST(workspace + 1 * 8, out1);
++ VEC_ST(workspace + 2 * 8, out2);
++ VEC_ST(workspace + 3 * 8, out3);
++ VEC_ST(workspace + 4 * 8, out4);
++ VEC_ST(workspace + 5 * 8, out5);
++ VEC_ST(workspace + 6 * 8, out6);
++ VEC_ST(workspace + 7 * 8, out7);
++}
++
++
++#define WORD_BIT 16
++#define MULTIPLY(vs0, vs1, out) out = _mm_mulhi_epu16(vs0, vs1)
++
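++/* The divisors table holds three consecutive DCTSIZE2 blocks: 16-bit
++ * reciprocals at [0, 63], pre-multiply corrections at [64, 127] and
++ * post-multiply scales at [128, 191]. Each coefficient is quantized as
++ *   sign(x) * ((((|x| + corr) * recip) >> 16) * scale >> 16),
++ * so division by the quantization step is carried out entirely with
++ * unsigned 16-bit high multiplies (_mm_mulhi_epu16), no per-lane divides.
++ */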
++void jsimd_quantize_e2k(JCOEFPTR coef_block, DCTELEM *divisors,
++ DCTELEM *workspace)
++{
++ __m128i row0, row1, row2, row3, row4, row5, row6, row7,
++ row0s, row1s, row2s, row3s, row4s, row5s, row6s, row7s,
++ corr0, corr1, corr2, corr3, corr4, corr5, corr6, corr7,
++ recip0, recip1, recip2, recip3, recip4, recip5, recip6, recip7,
++ scale0, scale1, scale2, scale3, scale4, scale5, scale6, scale7;
++
++ row0s = VEC_LD(workspace + 0 * 8);
++ row1s = VEC_LD(workspace + 1 * 8);
++ row2s = VEC_LD(workspace + 2 * 8);
++ row3s = VEC_LD(workspace + 3 * 8);
++ row4s = VEC_LD(workspace + 4 * 8);
++ row5s = VEC_LD(workspace + 5 * 8);
++ row6s = VEC_LD(workspace + 6 * 8);
++ row7s = VEC_LD(workspace + 7 * 8);
++ row0 = _mm_abs_epi16(row0s);
++ row1 = _mm_abs_epi16(row1s);
++ row2 = _mm_abs_epi16(row2s);
++ row3 = _mm_abs_epi16(row3s);
++ row4 = _mm_abs_epi16(row4s);
++ row5 = _mm_abs_epi16(row5s);
++ row6 = _mm_abs_epi16(row6s);
++ row7 = _mm_abs_epi16(row7s);
++
++ corr0 = VEC_LD(divisors + DCTSIZE2 + 0 * 8);
++ corr1 = VEC_LD(divisors + DCTSIZE2 + 1 * 8);
++ corr2 = VEC_LD(divisors + DCTSIZE2 + 2 * 8);
++ corr3 = VEC_LD(divisors + DCTSIZE2 + 3 * 8);
++ corr4 = VEC_LD(divisors + DCTSIZE2 + 4 * 8);
++ corr5 = VEC_LD(divisors + DCTSIZE2 + 5 * 8);
++ corr6 = VEC_LD(divisors + DCTSIZE2 + 6 * 8);
++ corr7 = VEC_LD(divisors + DCTSIZE2 + 7 * 8);
++
++ row0 = _mm_add_epi16(row0, corr0);
++ row1 = _mm_add_epi16(row1, corr1);
++ row2 = _mm_add_epi16(row2, corr2);
++ row3 = _mm_add_epi16(row3, corr3);
++ row4 = _mm_add_epi16(row4, corr4);
++ row5 = _mm_add_epi16(row5, corr5);
++ row6 = _mm_add_epi16(row6, corr6);
++ row7 = _mm_add_epi16(row7, corr7);
++
++ recip0 = VEC_LD(divisors + 0 * 8);
++ recip1 = VEC_LD(divisors + 1 * 8);
++ recip2 = VEC_LD(divisors + 2 * 8);
++ recip3 = VEC_LD(divisors + 3 * 8);
++ recip4 = VEC_LD(divisors + 4 * 8);
++ recip5 = VEC_LD(divisors + 5 * 8);
++ recip6 = VEC_LD(divisors + 6 * 8);
++ recip7 = VEC_LD(divisors + 7 * 8);
++
++ MULTIPLY(row0, recip0, row0);
++ MULTIPLY(row1, recip1, row1);
++ MULTIPLY(row2, recip2, row2);
++ MULTIPLY(row3, recip3, row3);
++ MULTIPLY(row4, recip4, row4);
++ MULTIPLY(row5, recip5, row5);
++ MULTIPLY(row6, recip6, row6);
++ MULTIPLY(row7, recip7, row7);
++
++ scale0 = VEC_LD(divisors + DCTSIZE2 * 2 + 0 * 8);
++ scale1 = VEC_LD(divisors + DCTSIZE2 * 2 + 1 * 8);
++ scale2 = VEC_LD(divisors + DCTSIZE2 * 2 + 2 * 8);
++ scale3 = VEC_LD(divisors + DCTSIZE2 * 2 + 3 * 8);
++ scale4 = VEC_LD(divisors + DCTSIZE2 * 2 + 4 * 8);
++ scale5 = VEC_LD(divisors + DCTSIZE2 * 2 + 5 * 8);
++ scale6 = VEC_LD(divisors + DCTSIZE2 * 2 + 6 * 8);
++ scale7 = VEC_LD(divisors + DCTSIZE2 * 2 + 7 * 8);
++
++ MULTIPLY(row0, scale0, row0);
++ MULTIPLY(row1, scale1, row1);
++ MULTIPLY(row2, scale2, row2);
++ MULTIPLY(row3, scale3, row3);
++ MULTIPLY(row4, scale4, row4);
++ MULTIPLY(row5, scale5, row5);
++ MULTIPLY(row6, scale6, row6);
++ MULTIPLY(row7, scale7, row7);
++
++ row0 = _mm_sign_epi16(row0, row0s);
++ row1 = _mm_sign_epi16(row1, row1s);
++ row2 = _mm_sign_epi16(row2, row2s);
++ row3 = _mm_sign_epi16(row3, row3s);
++ row4 = _mm_sign_epi16(row4, row4s);
++ row5 = _mm_sign_epi16(row5, row5s);
++ row6 = _mm_sign_epi16(row6, row6s);
++ row7 = _mm_sign_epi16(row7, row7s);
++
++ VEC_ST(coef_block + 0 * 8, row0);
++ VEC_ST(coef_block + 1 * 8, row1);
++ VEC_ST(coef_block + 2 * 8, row2);
++ VEC_ST(coef_block + 3 * 8, row3);
++ VEC_ST(coef_block + 4 * 8, row4);
++ VEC_ST(coef_block + 5 * 8, row5);
++ VEC_ST(coef_block + 6 * 8, row6);
++ VEC_ST(coef_block + 7 * 8, row7);
++}
+diff --git a/simd/e2k/jsimd.c b/simd/e2k/jsimd.c
+new file mode 100644
+index 0000000..f8c0465
+--- /dev/null
++++ b/simd/e2k/jsimd.c
+@@ -0,0 +1,761 @@
++/*
++ * jsimd_e2k.c
++ *
++ * Copyright 2009 Pierre Ossman for Cendio AB
++ * Copyright (C) 2009-2011, 2014-2016, 2018, D. R. Commander.
++ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd
++ *
++ * Based on the x86 SIMD extension for IJG JPEG library,
++ * Copyright (C) 1999-2006, MIYASAKA Masaru.
++ * For conditions of distribution and use, see copyright notice in jsimdext.inc
++ *
++ * This file contains the interface between the "normal" portions
++ * of the library and the SIMD implementations when running on the
++ * Elbrus (e2k) architecture.
++ */
++
++#define JPEG_INTERNALS
++#include "../../jinclude.h"
++#include "../../jpeglib.h"
++#include "../../jsimd.h"
++#include "../../jdct.h"
++#include "../../jsimddct.h"
++#include "../jsimd.h"
++#include "jsimd_api_e2k.h"
++
++static unsigned int simd_support = ~0;
++static unsigned int simd_huffman = 1;
++
++/*
++ * Check what SIMD accelerations are supported.
++ *
++ * FIXME: This code is racy under a multi-threaded environment.
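++ * Concurrent first calls may both observe simd_support == ~0U and run
++ * the initialization twice; since every writer stores the same values,
++ * the race is benign in practice, but a proper one-time guard would be
++ * more robust.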
++ */ ++LOCAL(void) ++init_simd(void) ++{ ++#ifndef NO_GETENV ++ char *env = NULL; ++#endif ++ ++ if (simd_support != ~0U) ++ return; ++ ++ simd_support = JSIMD_SSE2; ++ ++#ifndef NO_GETENV ++ /* Force different settings through environment variables */ ++ env = getenv("JSIMD_FORCENONE"); ++ if ((env != NULL) && (strcmp(env, "1") == 0)) ++ simd_support = 0; ++ env = getenv("JSIMD_NOHUFFENC"); ++ if ((env != NULL) && (strcmp(env, "1") == 0)) ++ simd_huffman = 0; ++#endif ++} ++ ++static inline int color_space_idx(J_COLOR_SPACE color_space) { ++ switch (color_space) { ++ case JCS_EXT_RGB: ++ return 1 + (EXT_RGB_PIXELSIZE != 3) * 16; ++ case JCS_EXT_RGBX: ++ case JCS_EXT_RGBA: ++ return 2 + (EXT_RGBX_PIXELSIZE != 3) * 16; ++ case JCS_EXT_BGR: ++ return 3 + (EXT_BGR_PIXELSIZE != 3) * 16; ++ case JCS_EXT_BGRX: ++ case JCS_EXT_BGRA: ++ return 4 + (EXT_BGRX_PIXELSIZE != 3) * 16; ++ case JCS_EXT_XBGR: ++ case JCS_EXT_ABGR: ++ return 5 + (EXT_XBGR_PIXELSIZE != 3) * 16; ++ case JCS_EXT_XRGB: ++ case JCS_EXT_ARGB: ++ return 6 + (EXT_XRGB_PIXELSIZE != 3) * 16; ++ default: ++ break; ++ } ++ return 0 + (RGB_PIXELSIZE != 3) * 16; ++} ++ ++GLOBAL(int) ++jsimd_can_rgb_ycc(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (BITS_IN_JSAMPLE != 8) ++ return 0; ++ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(int) ++jsimd_can_rgb_gray(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (BITS_IN_JSAMPLE != 8) ++ return 0; ++ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(int) ++jsimd_can_ycc_rgb(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (BITS_IN_JSAMPLE != 8) ++ return 0; ++ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(int) ++jsimd_can_ycc_rgb565(void) ++{ ++ return 0; ++} ++ ++GLOBAL(void) ++jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, ++ JSAMPIMAGE output_buf, JDIMENSION output_row, ++ int num_rows) ++{ ++ void (*e2kfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int, int); ++ int idx = color_space_idx(cinfo->in_color_space); ++ ++ e2kfct = idx < 16 ? jsimd_rgb3_ycc_convert_e2k : ++ jsimd_rgb4_ycc_convert_e2k; ++ idx &= 15; ++ ++ e2kfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows, idx); ++} ++ ++GLOBAL(void) ++jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, ++ JSAMPIMAGE output_buf, JDIMENSION output_row, ++ int num_rows) ++{ ++ void (*e2kfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int, int); ++ int idx = color_space_idx(cinfo->in_color_space); ++ ++ e2kfct = idx < 16 ? jsimd_rgb3_gray_convert_e2k : ++ jsimd_rgb4_gray_convert_e2k; ++ idx &= 15; ++ ++ e2kfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows, idx); ++} ++ ++GLOBAL(void) ++jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, ++ JDIMENSION input_row, JSAMPARRAY output_buf, ++ int num_rows) ++{ ++ void (*e2kfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int, int); ++ int idx = color_space_idx(cinfo->out_color_space); ++ ++ e2kfct = idx < 16 ? 
jsimd_ycc_rgb3_convert_e2k : ++ jsimd_ycc_rgb4_convert_e2k; ++ idx &= 15; ++ ++ e2kfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows, idx); ++} ++ ++GLOBAL(void) ++jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, ++ JDIMENSION input_row, JSAMPARRAY output_buf, ++ int num_rows) ++{ ++} ++ ++GLOBAL(int) ++jsimd_can_h2v2_downsample(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (BITS_IN_JSAMPLE != 8) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(int) ++jsimd_can_h2v1_downsample(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (BITS_IN_JSAMPLE != 8) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(void) ++jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr, ++ JSAMPARRAY input_data, JSAMPARRAY output_data) ++{ ++ jsimd_h2v2_downsample_e2k(cinfo->image_width, cinfo->max_v_samp_factor, ++ compptr->v_samp_factor, ++ compptr->width_in_blocks, input_data, ++ output_data); ++} ++ ++GLOBAL(void) ++jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr, ++ JSAMPARRAY input_data, JSAMPARRAY output_data) ++{ ++ jsimd_h2v1_downsample_e2k(cinfo->image_width, cinfo->max_v_samp_factor, ++ compptr->v_samp_factor, ++ compptr->width_in_blocks, input_data, ++ output_data); ++} ++ ++GLOBAL(int) ++jsimd_can_h2v2_upsample(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (BITS_IN_JSAMPLE != 8) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(int) ++jsimd_can_h2v1_upsample(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (BITS_IN_JSAMPLE != 8) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(void) ++jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, ++ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) ++{ ++ jsimd_h2v2_upsample_e2k(cinfo->max_v_samp_factor, cinfo->output_width, ++ input_data, output_data_ptr); ++} ++ ++GLOBAL(void) ++jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, ++ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) ++{ ++ jsimd_h2v1_upsample_e2k(cinfo->max_v_samp_factor, cinfo->output_width, ++ input_data, output_data_ptr); ++} ++ ++GLOBAL(int) ++jsimd_can_h2v2_fancy_upsample(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (BITS_IN_JSAMPLE != 8) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(int) ++jsimd_can_h2v1_fancy_upsample(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (BITS_IN_JSAMPLE != 8) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(void) ++jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, ++ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) ++{ ++ jsimd_h2v2_fancy_upsample_e2k(cinfo->max_v_samp_factor, ++ compptr->downsampled_width, input_data, ++ output_data_ptr); ++} ++ ++GLOBAL(void) ++jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, ++ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) ++{ ++ jsimd_h2v1_fancy_upsample_e2k(cinfo->max_v_samp_factor, ++ compptr->downsampled_width, input_data, ++ output_data_ptr); ++} ++ ++GLOBAL(int) ++jsimd_can_h2v2_merged_upsample(void) ++{ ++ 
init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (BITS_IN_JSAMPLE != 8) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(int) ++jsimd_can_h2v1_merged_upsample(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (BITS_IN_JSAMPLE != 8) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(void) ++jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, ++ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf) ++{ ++ void (*e2kfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JDIMENSION, JSAMPARRAY, int); ++ int idx = color_space_idx(cinfo->out_color_space); ++ ++ e2kfct = idx < 16 ? jsimd_ycc_rgb3_merged_upsample_e2k : ++ jsimd_ycc_rgb4_merged_upsample_e2k; ++ idx &= 15; ++ ++ e2kfct(cinfo->output_width, input_buf, in_row_group_ctr, ++ in_row_group_ctr * 2, output_buf, idx); ++ e2kfct(cinfo->output_width, input_buf, in_row_group_ctr, ++ in_row_group_ctr * 2 + 1, output_buf + 1, idx); ++} ++ ++GLOBAL(void) ++jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, ++ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf) ++{ ++ void (*e2kfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JDIMENSION, JSAMPARRAY, int); ++ int idx = color_space_idx(cinfo->out_color_space); ++ ++ e2kfct = idx < 16 ? jsimd_ycc_rgb3_merged_upsample_e2k : ++ jsimd_ycc_rgb4_merged_upsample_e2k; ++ idx &= 15; ++ ++ e2kfct(cinfo->output_width, input_buf, in_row_group_ctr, ++ in_row_group_ctr, output_buf, idx); ++} ++ ++GLOBAL(int) ++jsimd_can_convsamp(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (DCTSIZE != 8) ++ return 0; ++ if (BITS_IN_JSAMPLE != 8) ++ return 0; ++ if (sizeof(DCTELEM) != 2) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(int) ++jsimd_can_convsamp_float(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (DCTSIZE != 8) ++ return 0; ++ if (BITS_IN_JSAMPLE != 8) ++ return 0; ++ if (sizeof(FAST_FLOAT) != 4) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(void) ++jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col, ++ DCTELEM *workspace) ++{ ++ jsimd_convsamp_e2k(sample_data, start_col, workspace); ++} ++ ++GLOBAL(void) ++jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col, ++ FAST_FLOAT *workspace) ++{ ++ jsimd_convsamp_float_e2k(sample_data, start_col, workspace); ++} ++ ++GLOBAL(int) ++jsimd_can_fdct_islow(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (DCTSIZE != 8) ++ return 0; ++ if (sizeof(DCTELEM) != 2) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(int) ++jsimd_can_fdct_ifast(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (DCTSIZE != 8) ++ return 0; ++ if (sizeof(DCTELEM) != 2) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(int) ++jsimd_can_fdct_float(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (DCTSIZE != 8) ++ return 0; ++ if (sizeof(FAST_FLOAT) != 4) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(void) ++jsimd_fdct_islow(DCTELEM *data) ++{ ++ jsimd_fdct_islow_e2k(data); ++} ++ ++GLOBAL(void) ++jsimd_fdct_ifast(DCTELEM *data) ++{ ++ jsimd_fdct_ifast_e2k(data); ++} ++ ++GLOBAL(void) 
++jsimd_fdct_float(FAST_FLOAT *data) ++{ ++ jsimd_fdct_float_e2k(data); ++} ++ ++GLOBAL(int) ++jsimd_can_quantize(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (DCTSIZE != 8) ++ return 0; ++ if (sizeof(JCOEF) != 2) ++ return 0; ++ if (sizeof(DCTELEM) != 2) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(int) ++jsimd_can_quantize_float(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (DCTSIZE != 8) ++ return 0; ++ if (sizeof(JCOEF) != 2) ++ return 0; ++ if (sizeof(FAST_FLOAT) != 4) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(void) ++jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace) ++{ ++ jsimd_quantize_e2k(coef_block, divisors, workspace); ++} ++ ++GLOBAL(void) ++jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors, ++ FAST_FLOAT *workspace) ++{ ++ jsimd_quantize_float_e2k(coef_block, divisors, workspace); ++} ++ ++GLOBAL(int) ++jsimd_can_idct_2x2(void) ++{ ++ return 0; ++} ++ ++GLOBAL(int) ++jsimd_can_idct_4x4(void) ++{ ++ return 0; ++} ++ ++GLOBAL(void) ++jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr, ++ JCOEFPTR coef_block, JSAMPARRAY output_buf, ++ JDIMENSION output_col) ++{ ++} ++ ++GLOBAL(void) ++jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr, ++ JCOEFPTR coef_block, JSAMPARRAY output_buf, ++ JDIMENSION output_col) ++{ ++} ++ ++GLOBAL(int) ++jsimd_can_idct_islow(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (DCTSIZE != 8) ++ return 0; ++ if (sizeof(JCOEF) != 2) ++ return 0; ++ if (BITS_IN_JSAMPLE != 8) ++ return 0; ++ if (sizeof(ISLOW_MULT_TYPE) != 2) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(int) ++jsimd_can_idct_ifast(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (DCTSIZE != 8) ++ return 0; ++ if (sizeof(JCOEF) != 2) ++ return 0; ++ if (BITS_IN_JSAMPLE != 8) ++ return 0; ++ if (sizeof(IFAST_MULT_TYPE) != 2) ++ return 0; ++ if (IFAST_SCALE_BITS != 2) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(int) ++jsimd_can_idct_float(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (DCTSIZE != 8) ++ return 0; ++ if (sizeof(JCOEF) != 2) ++ return 0; ++ if (BITS_IN_JSAMPLE != 8) ++ return 0; ++ if (sizeof(FAST_FLOAT) != 4) ++ return 0; ++ if (sizeof(FLOAT_MULT_TYPE) != 4) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(void) ++jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr, ++ JCOEFPTR coef_block, JSAMPARRAY output_buf, ++ JDIMENSION output_col) ++{ ++ jsimd_idct_islow_e2k(compptr->dct_table, coef_block, output_buf, ++ output_col); ++} ++ ++GLOBAL(void) ++jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr, ++ JCOEFPTR coef_block, JSAMPARRAY output_buf, ++ JDIMENSION output_col) ++{ ++ jsimd_idct_ifast_e2k(compptr->dct_table, coef_block, output_buf, ++ output_col); ++} ++ ++GLOBAL(void) ++jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr, ++ JCOEFPTR coef_block, JSAMPARRAY output_buf, ++ JDIMENSION output_col) ++{ ++ jsimd_idct_float_e2k(compptr->dct_table, coef_block, output_buf, ++ output_col); ++} ++ ++GLOBAL(int) ++jsimd_can_huff_encode_one_block(void) ++{ ++ init_simd(); ++ ++ if (DCTSIZE != 8) ++ return 0; ++ 
if (sizeof(JCOEF) != 2) ++ return 0; ++ ++ if ((simd_support & JSIMD_SSE2) && simd_huffman) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(JOCTET *) ++jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block, ++ int last_dc_val, c_derived_tbl *dctbl, ++ c_derived_tbl *actbl) ++{ ++ return jsimd_huff_encode_one_block_e2k(state, buffer, block, last_dc_val, ++ dctbl, actbl); ++} ++ ++GLOBAL(int) ++jsimd_can_encode_mcu_AC_first_prepare(void) ++{ ++ init_simd(); ++ ++ if (DCTSIZE != 8) ++ return 0; ++ if (sizeof(JCOEF) != 2) ++ return 0; ++ if ((simd_support & JSIMD_SSE2) && simd_huffman) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(void) ++jsimd_encode_mcu_AC_first_prepare(const JCOEF *block, ++ const int *jpeg_natural_order_start, int Sl, ++ int Al, UJCOEF *values, size_t *zerobits) ++{ ++ jsimd_encode_mcu_AC_first_prepare_e2k(block, jpeg_natural_order_start, ++ Sl, Al, (JCOEF*)values, zerobits); ++} ++ ++GLOBAL(int) ++jsimd_can_encode_mcu_AC_refine_prepare(void) ++{ ++ init_simd(); ++ ++ if (DCTSIZE != 8) ++ return 0; ++ if (sizeof(JCOEF) != 2) ++ return 0; ++ if ((simd_support & JSIMD_SSE2) && simd_huffman) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(int) ++jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block, ++ const int *jpeg_natural_order_start, int Sl, ++ int Al, UJCOEF *absvalues, size_t *bits) ++{ ++ return jsimd_encode_mcu_AC_refine_prepare_e2k(block, ++ jpeg_natural_order_start, ++ Sl, Al, (JCOEF*)absvalues, bits); ++} +diff --git a/simd/e2k/jsimd_api_e2k.h b/simd/e2k/jsimd_api_e2k.h +new file mode 100644 +index 0000000..d857203 +--- /dev/null ++++ b/simd/e2k/jsimd_api_e2k.h +@@ -0,0 +1,94 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. ++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. 
++ */ ++ ++/* Function declarations */ ++ ++#define CONVERT_DECL(n) \ ++EXTERN(void) jsimd_rgb##n##_ycc_convert_e2k(JDIMENSION img_width, \ ++ JSAMPARRAY input_buf, JSAMPIMAGE output_buf, \ ++ JDIMENSION output_row, int num_rows, int shuf_idx); \ ++EXTERN(void) jsimd_rgb##n##_gray_convert_e2k(JDIMENSION img_width, \ ++ JSAMPARRAY input_buf, JSAMPIMAGE output_buf, \ ++ JDIMENSION output_row, int num_rows, int shuf_idx); \ ++EXTERN(void) jsimd_ycc_rgb##n##_convert_e2k(JDIMENSION out_width, \ ++ JSAMPIMAGE input_buf, JDIMENSION input_row, \ ++ JSAMPARRAY output_buf, int num_rows, int shuf_idx); \ ++EXTERN(void) jsimd_ycc_rgb##n##_convert_e2k(JDIMENSION out_width, \ ++ JSAMPIMAGE input_buf, JDIMENSION input_row, JSAMPARRAY output_buf, \ ++ int num_rows, int shuf_idx); \ ++EXTERN(void) jsimd_ycc_rgb##n##_merged_upsample_e2k(JDIMENSION out_width, \ ++ JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, \ ++ JDIMENSION in_row_group_ctr_y, JSAMPARRAY output_buf, int shuf_idx); \ ++ ++CONVERT_DECL(3) ++CONVERT_DECL(4) ++ ++EXTERN(void) jsimd_h2v1_downsample_e2k ++ (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor, ++ JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data); ++EXTERN(void) jsimd_h2v2_downsample_e2k ++ (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor, ++ JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data); ++ ++#define UPSAMPLE_DECL(name) \ ++EXTERN(void) jsimd_##name##_upsample_e2k \ ++ (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data, \ ++ JSAMPARRAY *output_data_ptr); ++ ++UPSAMPLE_DECL(h2v1) ++UPSAMPLE_DECL(h2v2) ++UPSAMPLE_DECL(h2v1_fancy) ++UPSAMPLE_DECL(h2v2_fancy) ++ ++EXTERN(void) jsimd_convsamp_e2k ++ (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace); ++EXTERN(void) jsimd_convsamp_float_e2k ++ (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace); ++ ++EXTERN(void) jsimd_fdct_islow_e2k(DCTELEM *data); ++EXTERN(void) jsimd_fdct_ifast_e2k(DCTELEM *data); ++EXTERN(void) jsimd_fdct_float_e2k(FAST_FLOAT *data); ++EXTERN(void) jsimd_quantize_e2k ++ (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace); ++EXTERN(void) jsimd_quantize_float_e2k ++ (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace); ++EXTERN(void) jsimd_idct_islow_e2k ++ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, ++ JDIMENSION output_col); ++EXTERN(void) jsimd_idct_ifast_e2k ++ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, ++ JDIMENSION output_col); ++EXTERN(void) jsimd_idct_float_e2k ++ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, ++ JDIMENSION output_col); ++ ++EXTERN(JOCTET *) jsimd_huff_encode_one_block_e2k ++ (void *state, JOCTET *buffer, JCOEFPTR block, ++ int last_dc_val, c_derived_tbl *dctbl, c_derived_tbl *actbl); ++ ++EXTERN(void) jsimd_encode_mcu_AC_first_prepare_e2k ++ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al, ++ JCOEF *values, size_t *zerobits); ++ ++EXTERN(int) jsimd_encode_mcu_AC_refine_prepare_e2k ++ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al, ++ JCOEF *absvalues, size_t *bits); +diff --git a/simd/e2k/jsimd_e2k.h b/simd/e2k/jsimd_e2k.h +new file mode 100644 +index 0000000..15d6262 +--- /dev/null ++++ b/simd/e2k/jsimd_e2k.h +@@ -0,0 +1,207 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved. 
++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd
++ *
++ * This software is provided 'as-is', without any express or implied
++ * warranty. In no event will the authors be held liable for any damages
++ * arising from the use of this software.
++ *
++ * Permission is granted to anyone to use this software for any purpose,
++ * including commercial applications, and to alter it and redistribute it
++ * freely, subject to the following restrictions:
++ *
++ * 1. The origin of this software must not be misrepresented; you must not
++ * claim that you wrote the original software. If you use this software
++ * in a product, an acknowledgment in the product documentation would be
++ * appreciated but is not required.
++ * 2. Altered source versions must be plainly marked as such, and must not be
++ * misrepresented as being the original software.
++ * 3. This notice may not be removed or altered from any source distribution.
++ */
++
++#define JPEG_INTERNALS
++#include "../../jinclude.h"
++#include "../../jpeglib.h"
++#include "../../jsimd.h"
++#include "../../jdct.h"
++#include "../../jsimddct.h"
++#include "../jsimd.h"
++#include "jsimd_api_e2k.h"
++#include <stdint.h>
++#include <smmintrin.h> /* SSE4.1 */
++
++
++/* Common code */
++
++#define __4X2(a, b) a, b, a, b, a, b, a, b
++#define ALWAYS_INLINE __attribute__((__always_inline__)) inline
++
++#ifdef __e2k__
++#define PRAGMA_E2K _Pragma
++#define _mm_shuffle2_pi8(a, b, c) \
++ ((__m64)__builtin_e2k_pshufb((uint64_t)(b), (uint64_t)(a), (uint64_t)(c)))
++#define _mm_shuffle2_epi8(a, b, c) \
++ ((__m128i)__builtin_e2k_qppermb((__v2di)(b), (__v2di)(a), (__v2di)(c)))
++#define _mm_blendv_pi8(a, b, c) \
++ ((__m64)__builtin_e2k_pmerge((uint64_t)(a), (uint64_t)(b), (uint64_t)(c)))
++#else
++#define PRAGMA_E2K(x)
++#define _mm_shuffle2_pi8(a, b, c) \
++ _mm_movepi64_pi64(_mm_shuffle_epi8(_mm_unpacklo_epi64( \
++ _mm_movpi64_epi64(a), _mm_movpi64_epi64(b)), _mm_movpi64_epi64(c)))
++#define _mm_shuffle2_epi8(a, b, c) \
++ _mm_blendv_epi8(_mm_shuffle_epi8(a, c), _mm_shuffle_epi8(b, c), \
++ _mm_slli_epi16(c, 3))
++#define _mm_blendv_pi8(a, b, c) \
++ _mm_movepi64_pi64(_mm_blendv_epi8(_mm_movpi64_epi64(a), \
++ _mm_movpi64_epi64(b), _mm_movpi64_epi64(c)))
++
++#define BITREV_ROUND(c, i) a = (a & c) << i | (a >> i & c);
++static ALWAYS_INLINE uint64_t __builtin_e2k_bitrevd(uint64_t a) {
++ BITREV_ROUND(0x5555555555555555ll, 1)
++ BITREV_ROUND(0x3333333333333333ll, 2)
++ BITREV_ROUND(0x0F0F0F0F0F0F0F0Fll, 4)
++ BITREV_ROUND(0x00FF00FF00FF00FFll, 8)
++ BITREV_ROUND(0x0000FFFF0000FFFFll, 16)
++ return a >> 32 | a << 32;
++}
++
++static ALWAYS_INLINE uint64_t __builtin_e2k_insfd(uint64_t a, uint64_t b, uint64_t c) {
++ int n = b & 63;
++ a = a >> n | a << (64 - n);
++ return c ^ ((a ^ c) & (~0ll << (b >> 6 & 63)));
++}
++#endif
++
++#if defined(__iset__) && __iset__ >= 5
++static ALWAYS_INLINE __m128i _mm_packhi_epi32(__m128i a, __m128i b) {
++ __m128i index = _mm_setr_epi8(
++ 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31);
++ return _mm_shuffle2_epi8(a, b, index);
++}
++
++#define VEC_ISZERO(a) !_mm_cvtsi128_si64(_mm_packs_epi16(a, a))
++#else
++static ALWAYS_INLINE __m128i _mm_packhi_epi32(__m128i a, __m128i b) {
++ union { __m128i v; __m64 d[2]; } l = { a }, h = { b }, x;
++ __m64 index = _mm_setr_pi8(2, 3, 6, 7, 10, 11, 14, 15);
++ x.d[0] = _mm_shuffle2_pi8(l.d[0], l.d[1], index);
++ x.d[1] = _mm_shuffle2_pi8(h.d[0], h.d[1], index);
++ return x.v;
++}
++
++static ALWAYS_INLINE uint64_t vec_isnonzero(__m128i a) {
++ __v2di x = (__v2di)a;
++ return x[0] | 
x[1]; ++} ++ ++#define VEC_ISZERO(a) !vec_isnonzero(a) ++#endif ++ ++#define VEC_ALIGNR8(a, b) _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(b), _mm_castsi128_pd(a), 1)) ++ ++#define TRANSPOSE_FLOAT(a, b, c, d, e, f, g, h) \ ++ tmp0 = _mm_unpacklo_ps(a, b); \ ++ tmp1 = _mm_unpackhi_ps(a, b); \ ++ tmp2 = _mm_unpacklo_ps(c, d); \ ++ tmp3 = _mm_unpackhi_ps(c, d); \ ++ e = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); \ ++ f = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); \ ++ g = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3))); \ ++ h = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3))); ++ ++#define TRANSPOSE8(a, b) \ ++ b##0 = _mm_unpacklo_epi8(a##0, a##2); \ ++ b##1 = _mm_unpackhi_epi8(a##0, a##2); \ ++ b##2 = _mm_unpacklo_epi8(a##1, a##3); \ ++ b##3 = _mm_unpackhi_epi8(a##1, a##3); ++ ++#define TRANSPOSE16(a, b) \ ++ b##0 = _mm_unpacklo_epi16(a##0, a##2); \ ++ b##1 = _mm_unpackhi_epi16(a##0, a##2); \ ++ b##2 = _mm_unpacklo_epi16(a##1, a##3); \ ++ b##3 = _mm_unpackhi_epi16(a##1, a##3); \ ++ b##4 = _mm_unpacklo_epi16(a##4, a##6); \ ++ b##5 = _mm_unpackhi_epi16(a##4, a##6); \ ++ b##6 = _mm_unpacklo_epi16(a##5, a##7); \ ++ b##7 = _mm_unpackhi_epi16(a##5, a##7); ++ ++#define TRANSPOSE(a, b) \ ++ TRANSPOSE16(a, b) TRANSPOSE16(b, a) \ ++ b##0 = _mm_unpacklo_epi64(a##0, a##4); \ ++ b##1 = _mm_unpackhi_epi64(a##0, a##4); \ ++ b##2 = _mm_unpacklo_epi64(a##1, a##5); \ ++ b##3 = _mm_unpackhi_epi64(a##1, a##5); \ ++ b##4 = _mm_unpacklo_epi64(a##2, a##6); \ ++ b##5 = _mm_unpackhi_epi64(a##2, a##6); \ ++ b##6 = _mm_unpacklo_epi64(a##3, a##7); \ ++ b##7 = _mm_unpackhi_epi64(a##3, a##7); ++ ++#define IDCT_SAVE() { \ ++ __m128i pb_cj = _mm_set1_epi8((int8_t)CENTERJSAMPLE); \ ++ \ ++ row0 = _mm_xor_si128(_mm_packs_epi16(out0, out1), pb_cj); \ ++ row1 = _mm_xor_si128(_mm_packs_epi16(out2, out3), pb_cj); \ ++ row2 = _mm_xor_si128(_mm_packs_epi16(out4, out5), pb_cj); \ ++ row3 = _mm_xor_si128(_mm_packs_epi16(out6, out7), pb_cj); \ ++ \ ++ TRANSPOSE8(row, col) TRANSPOSE8(col, row) TRANSPOSE8(row, col) \ ++ \ ++ VEC_STL(output_buf[0] + output_col, col0); \ ++ VEC_STH(output_buf[1] + output_col, col0); \ ++ VEC_STL(output_buf[2] + output_col, col1); \ ++ VEC_STH(output_buf[3] + output_col, col1); \ ++ VEC_STL(output_buf[4] + output_col, col2); \ ++ VEC_STH(output_buf[5] + output_col, col2); \ ++ VEC_STL(output_buf[6] + output_col, col3); \ ++ VEC_STH(output_buf[7] + output_col, col3); \ ++} ++ ++#define IDCT_SPLAT8(col0) { \ ++ row3 = _mm_unpacklo_epi16(col0, col0); \ ++ row7 = _mm_unpackhi_epi16(col0, col0); \ ++ row1 = _mm_unpacklo_epi16(row3, row3); \ ++ row3 = _mm_unpackhi_epi16(row3, row3); \ ++ row5 = _mm_unpacklo_epi16(row7, row7); \ ++ row7 = _mm_unpackhi_epi16(row7, row7); \ ++ row0 = _mm_unpacklo_epi64(row1, row1); \ ++ row1 = _mm_unpackhi_epi64(row1, row1); \ ++ row2 = _mm_unpacklo_epi64(row3, row3); \ ++ row3 = _mm_unpackhi_epi64(row3, row3); \ ++ row4 = _mm_unpacklo_epi64(row5, row5); \ ++ row5 = _mm_unpackhi_epi64(row5, row5); \ ++ row6 = _mm_unpacklo_epi64(row7, row7); \ ++ row7 = _mm_unpackhi_epi64(row7, row7); \ ++} ++ ++#ifndef min ++#define min(a, b) ((a) < (b) ? 
(a) : (b)) ++#endif ++ ++#define VEC_LD(a) _mm_loadu_si128((const __m128i*)(a)) ++#define VEC_ST(a, b) _mm_storeu_si128((__m128i*)(a), b) ++#define VEC_LD8(a) _mm_loadl_epi64((const __m128i*)(a)) ++#define VEC_STL(a, b) _mm_storel_epi64((__m128i*)(a), b) ++#define VEC_STH(a, b) _mm_storeh_pd((double*)(a), _mm_castsi128_pd(b)); ++#define VEC_SPLAT(v, i) _mm_shuffle_epi8(v, _mm_set1_epi16((i) * 2 | ((i) * 2 + 1) << 8)) ++ ++#if !defined(__iset__) || __iset__ < 5 ++#define NEED_ALIGN8 ++#define ALIGN8_COMMON uint64_t src_shr; __m64 src_tmp0, src_tmp1; ++#define ALIGN8_VARS(src) __m64 *src##_ptr, src##_next, src##_index; ++#define ALIGN8_START(ptr, src) \ ++ src_shr = (intptr_t)(ptr - 1) & 7; \ ++ src##_ptr = (__m64*)((intptr_t)(ptr - 1) & -8); \ ++ src##_next = src##_ptr[src_shr == 7]; \ ++ src##_index = _mm_add_pi8(_mm_set1_pi8(src_shr), \ ++ _mm_setr_pi8(1, 2, 3, 4, 5, 6, 7, 8)); ++#define ALIGN8_READ16(v0, src, i) \ ++ src_tmp1 = src##_ptr[i * 2 + 1]; \ ++ src_tmp0 = _mm_shuffle2_pi8(src##_next, src_tmp1, src##_index); \ ++ src##_next = src##_ptr[i * 2 + 2]; \ ++ src_tmp1 = _mm_shuffle2_pi8(src_tmp1, src##_next, src##_index); \ ++ v0 = _mm_setr_epi64(src_tmp0, src_tmp1); ++#endif ++ +-- +2.45.0 + From a60a5fbd8c5a917d250db1cd70077235a38aa48b Mon Sep 17 00:00:00 2001 From: Mikhail Novosyolov Date: Sun, 28 Jul 2024 13:35:14 +0300 Subject: [PATCH 30/32] better url --- .abf.yml | 2 +- libjpeg-turbo.spec | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.abf.yml b/.abf.yml index 3b105b8..9ac1e2d 100644 --- a/.abf.yml +++ b/.abf.yml @@ -1,2 +1,2 @@ sources: - 3.0.3.tar.gz: 397a31222105129c9e798efce2459c445048546e + libjpeg-turbo-3.0.3.tar.gz: 397a31222105129c9e798efce2459c445048546e diff --git a/libjpeg-turbo.spec b/libjpeg-turbo.spec index ebcd6c1..57137d1 100644 --- a/libjpeg-turbo.spec +++ b/libjpeg-turbo.spec @@ -13,11 +13,11 @@ Summary: A MMX/SSE2 accelerated library for manipulating JPEG image files Name: libjpeg-turbo Epoch: 1 Version: 3.0.3 -Release: 1 +Release: 2 License: wxWindows Library License Group: Graphics Url: http://www.libjpeg-turbo.org -Source0: https://github.com/libjpeg-turbo/libjpeg-turbo/archive/refs/tags/%{version}.tar.gz +Source0: https://github.com/libjpeg-turbo/libjpeg-turbo/archive/refs/tags/%{version}.tar.gz?/%{name}-%{version}.tar.gz # These two allow automatic lossless rotation of JPEG images from a digital # camera which have orientation markings in the EXIF data. After rotation # the orientation markings are reset to avoid duplicate rotation when From 51d9a93eb1cc714986d39185d21da25c2948dca1 Mon Sep 17 00:00:00 2001 From: Mikhail Novosyolov Date: Sun, 28 Jul 2024 13:38:19 +0300 Subject: [PATCH 31/32] clean up metadata --- libjpeg-turbo.spec | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libjpeg-turbo.spec b/libjpeg-turbo.spec index 57137d1..4cc0c51 100644 --- a/libjpeg-turbo.spec +++ b/libjpeg-turbo.spec @@ -134,8 +134,8 @@ library. 
%package -n jpeg-progs
Summary: Programs for manipulating JPEG format image files
Group: Graphics
-%rename libjpeg-progs
-%rename jpeg6-progs
+Provides: libjpeg-progs = %{EVRD}
+Provides: jpeg6-progs = %{EVRD}

%description -n jpeg-progs
This package contains simple client programs for accessing the
From f22ee2c14cd35b0de1815195746ed7dc9c7cd917 Mon Sep 17 00:00:00 2001
From: Mikhail Novosyolov
Date: Sun, 28 Jul 2024 13:44:03 +0300
Subject: [PATCH 32/32] compat with Fedora/RH

---
libjpeg-turbo.spec | 12 +++++++++++-
1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/libjpeg-turbo.spec b/libjpeg-turbo.spec
index 4cc0c51..a549330 100644
--- a/libjpeg-turbo.spec
+++ b/libjpeg-turbo.spec
@@ -32,13 +32,23 @@ BuildRequires: libtool >= 1.4
BuildRequires: nasm
%endif

+# meta package
+Recommends: %{libname} = %{EVRD}
+Recommends: %{libturbo} = %{EVRD}
+Recommends: %{libname62} = %{EVRD}
+
%description
-This package contains a library of functions for manipulating JPEG images.
+This meta package pulls in the packages that provide libraries of functions for manipulating JPEG images.
It is a high-speed, libjpeg-compatible version for x86 and x86-64 processors
which uses SIMD instructions (MMX, SSE2, etc.) to accelerate baseline JPEG
compression and decompression. It is generally 2-4x as fast as the unmodified
version of libjpeg, all else being equal.

+%files
+# empty, meta package
+# exists for compatibility with Fedora/RH
+# cnrdrvcups-ufr2-uk (Canon driver) depends on it
+
#----------------------------------------------------------------------------

%package -n %{libname}