From ff8aca6fe38933d063cae3cbda1de5ce69424aeb Mon Sep 17 00:00:00 2001 From: Giovanni Mariani Date: Mon, 5 Mar 2018 12:13:56 +0100 Subject: [PATCH 01/32] Updated to release 1.5.3 and added P0 to fix CVE-2017-15232 --- .abf.yml | 2 +- libjpeg-turbo-1.5.2-CVE-2017-15232.patch | 53 ++++++++++++++++++++++++ libjpeg-turbo.spec | 29 ++++++++----- 3 files changed, 72 insertions(+), 12 deletions(-) create mode 100644 libjpeg-turbo-1.5.2-CVE-2017-15232.patch diff --git a/.abf.yml b/.abf.yml index 08f063d..ce19d84 100644 --- a/.abf.yml +++ b/.abf.yml @@ -1,2 +1,2 @@ sources: - libjpeg-turbo-1.5.0.tar.gz: 9adc21b927e48e4c6889e77079f6c1f3eecf98ab + libjpeg-turbo-1.5.3.tar.gz: 87ebf4cab2bb27fcb8e7ccb18ec4eb680e1f2c2d diff --git a/libjpeg-turbo-1.5.2-CVE-2017-15232.patch b/libjpeg-turbo-1.5.2-CVE-2017-15232.patch new file mode 100644 index 0000000..f67b1bf --- /dev/null +++ b/libjpeg-turbo-1.5.2-CVE-2017-15232.patch @@ -0,0 +1,53 @@ +From 1ecd9a5729d78518397889a630e3534bd9d963a8 Mon Sep 17 00:00:00 2001 +From: Kornel +Date: Sat, 30 Sep 2017 12:05:53 +0100 +Subject: [PATCH] Handle NULL buffer when discarding rows + +--- + jdpostct.c | 5 +++++ + jquant1.c | 4 ++++ + 2 files changed, 9 insertions(+) + +diff --git a/jdpostct.c b/jdpostct.c +index 601fc2a79..a24202ca9 100644 +--- a/jdpostct.c ++++ b/jdpostct.c +@@ -132,6 +132,11 @@ post_process_1pass (j_decompress_ptr cinfo, + my_post_ptr post = (my_post_ptr) cinfo->post; + JDIMENSION num_rows, max_rows; + ++ /* read_and_discard_scanlines may call it with rows "available", but no buffer */ ++ if (output_buf == NULL) { ++ return; ++ } ++ + /* Fill the buffer, but not more than what we can dump out in one go. */ + /* Note we rely on the upsampler to detect bottom of image. */ + max_rows = out_rows_avail - *out_row_ctr; +diff --git a/jquant1.c b/jquant1.c +index e7814815e..ba2ea9b80 100644 +--- a/jquant1.c ++++ b/jquant1.c +@@ -531,6 +531,10 @@ quantize_ord_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf, + JDIMENSION col; + JDIMENSION width = cinfo->output_width; + ++ if (output_buf == NULL && num_rows) { ++ ERREXIT(cinfo, JERR_BAD_PARAM); ++ } ++ + for (row = 0; row < num_rows; row++) { + /* Initialize output values to 0 so can process components separately */ + jzero_far((void *) output_buf[row], (size_t) (width * sizeof(JSAMPLE))); +Index: libjpeg-turbo-1.5.2/jerror.h +=================================================================== +--- libjpeg-turbo-1.5.2.orig/jerror.h 2017-07-07 22:31:10.000000000 +0200 ++++ libjpeg-turbo-1.5.2/jerror.h 2017-10-12 13:24:01.349954012 +0200 +@@ -208,6 +208,7 @@ JMESSAGE(JERR_NO_ARITH_TABLE, "Arithmeti + JMESSAGE(JWRN_ARITH_BAD_CODE, "Corrupt JPEG data: bad arithmetic code") + #endif + #endif ++JMESSAGE(JERR_BAD_PARAM, "Bogus parameter") + + #ifdef JMAKE_ENUM_LIST + diff --git a/libjpeg-turbo.spec b/libjpeg-turbo.spec index a9c528a..e2d9ff5 100644 --- a/libjpeg-turbo.spec +++ b/libjpeg-turbo.spec @@ -1,20 +1,20 @@ -%define major 8 -%define libname %mklibname jpeg %{major} -%define devname %mklibname jpeg -d -%define sdevname %mklibname jpeg -d -s +%define major 8 +%define libname %mklibname jpeg %{major} +%define devname %mklibname jpeg -d +%define sdevname %mklibname jpeg -d -s -%define majorturbo 0 -%define libturbo %mklibname turbojpeg %{majorturbo} +%define majorturbo 0 +%define libturbo %mklibname turbojpeg %{majorturbo} -%define major62 62 -%define libname62 %mklibname jpeg %{major62} +%define major62 62 +%define libname62 %mklibname jpeg %{major62} Summary: A MMX/SSE2 accelerated library for manipulating JPEG 
image files Name: libjpeg-turbo Epoch: 1 -Version: 1.5.0 -Release: 3 -License: wxWidgets Library License +Version: 1.5.3 +Release: 1 +License: wxWindows Library License Group: System/Libraries Url: http://www.libjpeg-turbo.org Source0: https://sourceforge.net/projects/libjpeg-turbo/files/%{version}/%{name}-%{version}.tar.gz @@ -25,6 +25,7 @@ Source0: https://sourceforge.net/projects/libjpeg-turbo/files/%{version}/%{name} Source2: http://jpegclub.org/jpegexiforient.c Source3: http://jpegclub.org/exifautotran.txt Patch0: jpeg-6b-c++fixes.patch +Patch1: %{name}-1.5.2-CVE-2017-15232.patch BuildRequires: libtool >= 1.4 %ifarch %{ix86} x86_64 BuildRequires: nasm @@ -62,6 +63,7 @@ This package contains the library needed to run programs dynamically linked with libjpeg. %files -n %{libname62} +%doc LICENSE.md %{_libdir}/libjpeg.so.%{major62}* #---------------------------------------------------------------------------- @@ -75,6 +77,7 @@ This package contains the library needed to run programs dynamically linked with libturbojpeg. %files -n %{libturbo} +%doc LICENSE.md %{_libdir}/libturbojpeg.so.%{majorturbo}* #---------------------------------------------------------------------------- @@ -120,6 +123,7 @@ for developing programs which will manipulate JPEG files using the libjpeg library. %files -n %{sdevname} +%doc LICENSE.md %{_libdir}/libjpeg.a %{_libdir}/libturbojpeg.a @@ -154,10 +158,12 @@ have orientation markings in the EXIF data. %prep %setup -q %patch0 -p0 +%patch1 -p1 cp %{SOURCE2} jpegexiforient.c cp %{SOURCE3} exifautotran + %build CONFIGURE_TOP="$PWD" @@ -182,6 +188,7 @@ popd %__cc %{optflags} %{ldflags} -o jpegexiforient jpegexiforient.c + %install make install-libLTLIBRARIES DESTDIR=%{buildroot} -C jpeg62 %makeinstall_std -C jpeg8 From d96ac9ad59604eba10ab63900fd05550cd0a9d9f Mon Sep 17 00:00:00 2001 From: Giovanni Mariani Date: Wed, 1 Aug 2018 16:27:39 +0200 Subject: [PATCH 02/32] Updated to release 2.0.0, rediffed P1 and added P2 to fix library path, fixed file lists --- .abf.yml | 2 +- libjpeg-turbo-1.5.2-CVE-2017-15232.patch | 53 ------------------------ libjpeg-turbo-2.0.0-CVE-2017-15232.patch | 40 ++++++++++++++++++ libjpeg-turbo-2.0.0-fix-lib-path.patch | 12 ++++++ libjpeg-turbo.spec | 35 +++++++++------- 5 files changed, 73 insertions(+), 69 deletions(-) delete mode 100644 libjpeg-turbo-1.5.2-CVE-2017-15232.patch create mode 100644 libjpeg-turbo-2.0.0-CVE-2017-15232.patch create mode 100644 libjpeg-turbo-2.0.0-fix-lib-path.patch diff --git a/.abf.yml b/.abf.yml index ce19d84..51872f1 100644 --- a/.abf.yml +++ b/.abf.yml @@ -1,2 +1,2 @@ sources: - libjpeg-turbo-1.5.3.tar.gz: 87ebf4cab2bb27fcb8e7ccb18ec4eb680e1f2c2d + libjpeg-turbo-2.0.0.tar.gz: fe49aea935617748c21ecbe46c986d6c1b98f39b diff --git a/libjpeg-turbo-1.5.2-CVE-2017-15232.patch b/libjpeg-turbo-1.5.2-CVE-2017-15232.patch deleted file mode 100644 index f67b1bf..0000000 --- a/libjpeg-turbo-1.5.2-CVE-2017-15232.patch +++ /dev/null @@ -1,53 +0,0 @@ -From 1ecd9a5729d78518397889a630e3534bd9d963a8 Mon Sep 17 00:00:00 2001 -From: Kornel -Date: Sat, 30 Sep 2017 12:05:53 +0100 -Subject: [PATCH] Handle NULL buffer when discarding rows - ---- - jdpostct.c | 5 +++++ - jquant1.c | 4 ++++ - 2 files changed, 9 insertions(+) - -diff --git a/jdpostct.c b/jdpostct.c -index 601fc2a79..a24202ca9 100644 ---- a/jdpostct.c -+++ b/jdpostct.c -@@ -132,6 +132,11 @@ post_process_1pass (j_decompress_ptr cinfo, - my_post_ptr post = (my_post_ptr) cinfo->post; - JDIMENSION num_rows, max_rows; - -+ /* read_and_discard_scanlines may call it with rows 
"available", but no buffer */ -+ if (output_buf == NULL) { -+ return; -+ } -+ - /* Fill the buffer, but not more than what we can dump out in one go. */ - /* Note we rely on the upsampler to detect bottom of image. */ - max_rows = out_rows_avail - *out_row_ctr; -diff --git a/jquant1.c b/jquant1.c -index e7814815e..ba2ea9b80 100644 ---- a/jquant1.c -+++ b/jquant1.c -@@ -531,6 +531,10 @@ quantize_ord_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf, - JDIMENSION col; - JDIMENSION width = cinfo->output_width; - -+ if (output_buf == NULL && num_rows) { -+ ERREXIT(cinfo, JERR_BAD_PARAM); -+ } -+ - for (row = 0; row < num_rows; row++) { - /* Initialize output values to 0 so can process components separately */ - jzero_far((void *) output_buf[row], (size_t) (width * sizeof(JSAMPLE))); -Index: libjpeg-turbo-1.5.2/jerror.h -=================================================================== ---- libjpeg-turbo-1.5.2.orig/jerror.h 2017-07-07 22:31:10.000000000 +0200 -+++ libjpeg-turbo-1.5.2/jerror.h 2017-10-12 13:24:01.349954012 +0200 -@@ -208,6 +208,7 @@ JMESSAGE(JERR_NO_ARITH_TABLE, "Arithmeti - JMESSAGE(JWRN_ARITH_BAD_CODE, "Corrupt JPEG data: bad arithmetic code") - #endif - #endif -+JMESSAGE(JERR_BAD_PARAM, "Bogus parameter") - - #ifdef JMAKE_ENUM_LIST - diff --git a/libjpeg-turbo-2.0.0-CVE-2017-15232.patch b/libjpeg-turbo-2.0.0-CVE-2017-15232.patch new file mode 100644 index 0000000..22cb61b --- /dev/null +++ b/libjpeg-turbo-2.0.0-CVE-2017-15232.patch @@ -0,0 +1,40 @@ +diff -rupN libjpeg-turbo-2.0.0.old/jdpostct.c libjpeg-turbo-2.0.0/jdpostct.c +--- libjpeg-turbo-2.0.0.old/jdpostct.c 2018-07-27 18:47:48.000000000 +0200 ++++ libjpeg-turbo-2.0.0/jdpostct.c 2018-08-01 16:10:01.647692751 +0200 +@@ -137,6 +137,11 @@ post_process_1pass(j_decompress_ptr cinf + my_post_ptr post = (my_post_ptr)cinfo->post; + JDIMENSION num_rows, max_rows; + ++ /* read_and_discard_scanlines may call it with rows "available", but no buffer */ ++ if (output_buf == NULL) { ++ return; ++ } ++ + /* Fill the buffer, but not more than what we can dump out in one go. */ + /* Note we rely on the upsampler to detect bottom of image. 
*/ + max_rows = out_rows_avail - *out_row_ctr; +diff -rupN libjpeg-turbo-2.0.0.old/jerror.h libjpeg-turbo-2.0.0/jerror.h +--- libjpeg-turbo-2.0.0.old/jerror.h 2018-07-27 18:47:48.000000000 +0200 ++++ libjpeg-turbo-2.0.0/jerror.h 2018-08-01 16:11:31.554054434 +0200 +@@ -207,6 +207,7 @@ JMESSAGE(JWRN_ARITH_BAD_CODE, "Corrupt J + #endif + #endif + JMESSAGE(JWRN_BOGUS_ICC, "Corrupt JPEG data: bad ICC marker") ++JMESSAGE(JERR_BAD_PARAM, "Bogus parameter") + + #ifdef JMAKE_ENUM_LIST + +diff -rupN libjpeg-turbo-2.0.0.old/jquant1.c libjpeg-turbo-2.0.0/jquant1.c +--- libjpeg-turbo-2.0.0.old/jquant1.c 2018-07-27 18:47:48.000000000 +0200 ++++ libjpeg-turbo-2.0.0/jquant1.c 2018-08-01 16:12:44.437913930 +0200 +@@ -532,6 +532,10 @@ quantize_ord_dither(j_decompress_ptr cin + JDIMENSION col; + JDIMENSION width = cinfo->output_width; + ++ if (output_buf == NULL && num_rows) { ++ ERREXIT(cinfo, JERR_BAD_PARAM); ++ } ++ + for (row = 0; row < num_rows; row++) { + /* Initialize output values to 0 so can process components separately */ + jzero_far((void *)output_buf[row], (size_t)(width * sizeof(JSAMPLE))); diff --git a/libjpeg-turbo-2.0.0-fix-lib-path.patch b/libjpeg-turbo-2.0.0-fix-lib-path.patch new file mode 100644 index 0000000..d0ba187 --- /dev/null +++ b/libjpeg-turbo-2.0.0-fix-lib-path.patch @@ -0,0 +1,12 @@ +diff -rupN libjpeg-turbo-2.0.0.old/CMakeLists.txt libjpeg-turbo-2.0.0/CMakeLists.txt +--- libjpeg-turbo-2.0.0.old/CMakeLists.txt 2018-07-27 18:47:48.000000000 +0200 ++++ libjpeg-turbo-2.0.0/CMakeLists.txt 2018-08-01 16:24:47.625312813 +0200 +@@ -102,7 +102,7 @@ if(CMAKE_INSTALL_PREFIX STREQUAL "${CMAK + if(BITS EQUAL 64) + set(CMAKE_INSTALL_DEFAULT_LIBDIR "lib64") + else() +- set(CMAKE_INSTALL_DEFAULT_LIBDIR "lib32") ++ set(CMAKE_INSTALL_DEFAULT_LIBDIR "lib") + endif() + endif() + endif() diff --git a/libjpeg-turbo.spec b/libjpeg-turbo.spec index e2d9ff5..9e12857 100644 --- a/libjpeg-turbo.spec +++ b/libjpeg-turbo.spec @@ -12,7 +12,7 @@ Summary: A MMX/SSE2 accelerated library for manipulating JPEG image files Name: libjpeg-turbo Epoch: 1 -Version: 1.5.3 +Version: 2.0.0 Release: 1 License: wxWindows Library License Group: System/Libraries @@ -25,7 +25,9 @@ Source0: https://sourceforge.net/projects/libjpeg-turbo/files/%{version}/%{name} Source2: http://jpegclub.org/jpegexiforient.c Source3: http://jpegclub.org/exifautotran.txt Patch0: jpeg-6b-c++fixes.patch -Patch1: %{name}-1.5.2-CVE-2017-15232.patch +Patch1: %{name}-2.0.0-CVE-2017-15232.patch +Patch2: %{name}-2.0.0-fix-lib-path.patch +BuildRequires: cmake BuildRequires: libtool >= 1.4 %ifarch %{ix86} x86_64 BuildRequires: nasm @@ -99,7 +101,7 @@ developing programs which will manipulate JPEG files using the libjpeg library. %files -n %{devname} -%doc coderules.txt example.c jconfig.txt libjpeg.txt structure.txt +%doc coderules.txt jconfig.txt libjpeg.txt structure.txt %{_libdir}/libjpeg.so %{_libdir}/libturbojpeg.so %{_includedir}/*.h @@ -159,39 +161,42 @@ have orientation markings in the EXIF data. %setup -q %patch0 -p0 %patch1 -p1 +%patch2 -p1 cp %{SOURCE2} jpegexiforient.c cp %{SOURCE3} exifautotran %build -CONFIGURE_TOP="$PWD" - +# Prepare build dirs mkdir -p jpeg8 +mkdir -p jpeg62 + +# Build jpeg v8 API pushd jpeg8 CFLAGS="%{optflags} -Ofast -funroll-loops" \ -%configure2_5x \ - --enable-shared \ - --enable-static \ - --with-jpeg8 +%cmake ../.. -DWITH_JPEG8="True" + %make popd -mkdir -p jpeg62 +# Build jpeg v6.2 API pushd jpeg62 CFLAGS="%{optflags} -Ofast -funroll-loops" \ -%configure2_5x \ - --enable-shared \ - --disable-static +%cmake ../.. 
\ + -DWITH_ARITH_DEC="True" \ + -DWITH_ARITH_ENC="True" + %make popd +# Build jpegexiforient binary %__cc %{optflags} %{ldflags} -o jpegexiforient jpegexiforient.c %install -make install-libLTLIBRARIES DESTDIR=%{buildroot} -C jpeg62 -%makeinstall_std -C jpeg8 +%makeinstall_std -C jpeg8/build +%makeinstall_std -C jpeg62/build install -m755 jpegexiforient -D %{buildroot}%{_bindir}/jpegexiforient install -m755 exifautotran -D %{buildroot}%{_bindir}/exifautotran From 9b6ee512d6232b9ea08f6210fec8240984a854b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BB=D0=B5=D0=BA=D1=81=D0=B5=D0=B9-=D0=97?= Date: Sun, 12 Aug 2018 11:57:01 +0300 Subject: [PATCH 03/32] Added requires %{libname62} in %{devname} --- libjpeg-turbo.spec | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libjpeg-turbo.spec b/libjpeg-turbo.spec index 9e12857..9a3e9cc 100644 --- a/libjpeg-turbo.spec +++ b/libjpeg-turbo.spec @@ -13,7 +13,7 @@ Summary: A MMX/SSE2 accelerated library for manipulating JPEG image files Name: libjpeg-turbo Epoch: 1 Version: 2.0.0 -Release: 1 +Release: 2 License: wxWindows Library License Group: System/Libraries Url: http://www.libjpeg-turbo.org @@ -88,12 +88,12 @@ with libturbojpeg. Summary: Development tools for programs which will use the libjpeg library Group: Development/C Requires: %{libname} = %{EVRD} +Requires: %{libname62} = %{EVRD} Requires: %{libturbo} = %{EVRD} Provides: jpeg-devel = %{EVRD} Conflicts: jpeg6-devel Conflicts: %{_lib}turbojpeg < 1:1.3.0 Obsoletes: %{_lib}turbojpeg < 1:1.3.0 -Obsoletes: %{mklibname jpeg 62 -d} < 6b-45 %description -n %{devname} The libjpeg-turbo devel package includes the header files necessary for From 10ceb7e23e0e8acc7c970dcb96b498ce1d4adc85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BB=D0=B7=D0=B8=D0=BC?= Date: Tue, 13 Nov 2018 00:45:03 +0300 Subject: [PATCH 04/32] Updated to 2.0.1 --- .abf.yml | 2 +- libjpeg-turbo.spec | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.abf.yml b/.abf.yml index 51872f1..ebd867b 100644 --- a/.abf.yml +++ b/.abf.yml @@ -1,2 +1,2 @@ sources: - libjpeg-turbo-2.0.0.tar.gz: fe49aea935617748c21ecbe46c986d6c1b98f39b + libjpeg-turbo-2.0.1.tar.gz: 7ea4a288bccbb5a2d5bfad5fb328d4a839853f4e diff --git a/libjpeg-turbo.spec b/libjpeg-turbo.spec index 9a3e9cc..1224f2d 100644 --- a/libjpeg-turbo.spec +++ b/libjpeg-turbo.spec @@ -1,7 +1,7 @@ -%define major 8 -%define libname %mklibname jpeg %{major} -%define devname %mklibname jpeg -d -%define sdevname %mklibname jpeg -d -s +%define major 8 +%define libname %mklibname jpeg %{major} +%define devname %mklibname jpeg -d +%define sdevname %mklibname jpeg -d -s %define majorturbo 0 %define libturbo %mklibname turbojpeg %{majorturbo} @@ -12,8 +12,8 @@ Summary: A MMX/SSE2 accelerated library for manipulating JPEG image files Name: libjpeg-turbo Epoch: 1 -Version: 2.0.0 -Release: 2 +Version: 2.0.1 +Release: 1 License: wxWindows Library License Group: System/Libraries Url: http://www.libjpeg-turbo.org @@ -177,7 +177,7 @@ pushd jpeg8 CFLAGS="%{optflags} -Ofast -funroll-loops" \ %cmake ../.. 
-DWITH_JPEG8="True" -%make +%make -s popd # Build jpeg v6.2 API @@ -187,7 +187,7 @@ CFLAGS="%{optflags} -Ofast -funroll-loops" \ -DWITH_ARITH_DEC="True" \ -DWITH_ARITH_ENC="True" -%make +%make -s popd # Build jpegexiforient binary From 79a3e80f5ad4030b12293a0e0cc0854f5622aedf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BB=D0=B7=D0=B8=D0=BC?= Date: Thu, 14 Feb 2019 13:20:22 +0300 Subject: [PATCH 05/32] Updated to 2.0.2 --- .abf.yml | 4 +++- libjpeg-turbo.spec | 6 +++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/.abf.yml b/.abf.yml index ebd867b..55b3dbf 100644 --- a/.abf.yml +++ b/.abf.yml @@ -1,2 +1,4 @@ -sources: +removed_sources: libjpeg-turbo-2.0.1.tar.gz: 7ea4a288bccbb5a2d5bfad5fb328d4a839853f4e +sources: + libjpeg-turbo-2.0.2.tar.gz: 1cff52d50b81755d0bdcf9055eb22157f39a1695 diff --git a/libjpeg-turbo.spec b/libjpeg-turbo.spec index 1224f2d..cc00bb8 100644 --- a/libjpeg-turbo.spec +++ b/libjpeg-turbo.spec @@ -12,7 +12,7 @@ Summary: A MMX/SSE2 accelerated library for manipulating JPEG image files Name: libjpeg-turbo Epoch: 1 -Version: 2.0.1 +Version: 2.0.2 Release: 1 License: wxWindows Library License Group: System/Libraries @@ -184,8 +184,8 @@ popd pushd jpeg62 CFLAGS="%{optflags} -Ofast -funroll-loops" \ %cmake ../.. \ - -DWITH_ARITH_DEC="True" \ - -DWITH_ARITH_ENC="True" + -DWITH_ARITH_DEC="True" \ + -DWITH_ARITH_ENC="True" %make -s popd From 2c3dd24fe0bc4a0bc8d5d08f1e6fd9513e0ee885 Mon Sep 17 00:00:00 2001 From: Andrey Bondrov Date: Thu, 4 Apr 2019 17:15:44 +1000 Subject: [PATCH 06/32] Bump release --- libjpeg-turbo.spec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libjpeg-turbo.spec b/libjpeg-turbo.spec index cc00bb8..95de94c 100644 --- a/libjpeg-turbo.spec +++ b/libjpeg-turbo.spec @@ -13,7 +13,7 @@ Summary: A MMX/SSE2 accelerated library for manipulating JPEG image files Name: libjpeg-turbo Epoch: 1 Version: 2.0.2 -Release: 1 +Release: 2 License: wxWindows Library License Group: System/Libraries Url: http://www.libjpeg-turbo.org From fe3e4ab0bc9ebac814db1af7305660a1ee4d5172 Mon Sep 17 00:00:00 2001 From: Andrey Bondrov Date: Thu, 4 Apr 2019 20:56:28 +1000 Subject: [PATCH 07/32] Bump release --- libjpeg-turbo.spec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libjpeg-turbo.spec b/libjpeg-turbo.spec index 95de94c..d5163de 100644 --- a/libjpeg-turbo.spec +++ b/libjpeg-turbo.spec @@ -13,7 +13,7 @@ Summary: A MMX/SSE2 accelerated library for manipulating JPEG image files Name: libjpeg-turbo Epoch: 1 Version: 2.0.2 -Release: 2 +Release: 3 License: wxWindows Library License Group: System/Libraries Url: http://www.libjpeg-turbo.org From efdc76f37fa713f3a757e163419e2a747a672283 Mon Sep 17 00:00:00 2001 From: Mikhail Novosyolov Date: Sat, 6 Apr 2019 23:53:10 +0300 Subject: [PATCH 08/32] Rebuild with rpm-5.4.10-86 where pkgconfig private dependencies are parsed (RB#9569) (rebuilding dependency chain of qt4) --- libjpeg-turbo.spec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libjpeg-turbo.spec b/libjpeg-turbo.spec index d5163de..90717e9 100644 --- a/libjpeg-turbo.spec +++ b/libjpeg-turbo.spec @@ -13,7 +13,7 @@ Summary: A MMX/SSE2 accelerated library for manipulating JPEG image files Name: libjpeg-turbo Epoch: 1 Version: 2.0.2 -Release: 3 +Release: 4 License: wxWindows Library License Group: System/Libraries Url: http://www.libjpeg-turbo.org From ee29827a05f3429c5e071398fae27458761c6e98 Mon Sep 17 00:00:00 2001 From: Date: Sun, 7 Apr 2019 10:28:09 +0000 Subject: [PATCH 09/32] MassBuild#1666: Increase 
release tag --- libjpeg-turbo.spec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libjpeg-turbo.spec b/libjpeg-turbo.spec index 90717e9..567ff2d 100644 --- a/libjpeg-turbo.spec +++ b/libjpeg-turbo.spec @@ -13,7 +13,7 @@ Summary: A MMX/SSE2 accelerated library for manipulating JPEG image files Name: libjpeg-turbo Epoch: 1 Version: 2.0.2 -Release: 4 +Release: 5 License: wxWindows Library License Group: System/Libraries Url: http://www.libjpeg-turbo.org From daad40f0943c37257eb4b5a47660b815b25ffccf Mon Sep 17 00:00:00 2001 From: Andrey Bondrov Date: Sun, 14 Apr 2019 01:46:46 +0000 Subject: [PATCH 10/32] MassBuild#1671: Increase release tag --- libjpeg-turbo.spec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libjpeg-turbo.spec b/libjpeg-turbo.spec index 567ff2d..1a09bed 100644 --- a/libjpeg-turbo.spec +++ b/libjpeg-turbo.spec @@ -13,7 +13,7 @@ Summary: A MMX/SSE2 accelerated library for manipulating JPEG image files Name: libjpeg-turbo Epoch: 1 Version: 2.0.2 -Release: 5 +Release: 6 License: wxWindows Library License Group: System/Libraries Url: http://www.libjpeg-turbo.org From 06a4c164ab9d6a354d85303b0d9994b0cbd11d46 Mon Sep 17 00:00:00 2001 From: Date: Sat, 6 Jul 2019 18:20:51 +0000 Subject: [PATCH 11/32] MassBuild#1715: Increase release tag --- libjpeg-turbo.spec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libjpeg-turbo.spec b/libjpeg-turbo.spec index 1a09bed..19a5028 100644 --- a/libjpeg-turbo.spec +++ b/libjpeg-turbo.spec @@ -13,7 +13,7 @@ Summary: A MMX/SSE2 accelerated library for manipulating JPEG image files Name: libjpeg-turbo Epoch: 1 Version: 2.0.2 -Release: 6 +Release: 7 License: wxWindows Library License Group: System/Libraries Url: http://www.libjpeg-turbo.org From b5784d7862cb268d45236ba747543283b0cbbeeb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BB=D0=B7=D0=B8=D0=BC?= Date: Mon, 9 Sep 2019 22:32:08 +0300 Subject: [PATCH 12/32] Updated to 2.0.3 --- .abf.yml | 4 +--- libjpeg-turbo.spec | 4 ++-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/.abf.yml b/.abf.yml index 55b3dbf..b87805f 100644 --- a/.abf.yml +++ b/.abf.yml @@ -1,4 +1,2 @@ -removed_sources: - libjpeg-turbo-2.0.1.tar.gz: 7ea4a288bccbb5a2d5bfad5fb328d4a839853f4e sources: - libjpeg-turbo-2.0.2.tar.gz: 1cff52d50b81755d0bdcf9055eb22157f39a1695 + libjpeg-turbo-2.0.3.tar.gz: 539363a444f92421c098a1a3e7cebfda48d4cfb3 diff --git a/libjpeg-turbo.spec b/libjpeg-turbo.spec index 19a5028..a22e3ab 100644 --- a/libjpeg-turbo.spec +++ b/libjpeg-turbo.spec @@ -12,8 +12,8 @@ Summary: A MMX/SSE2 accelerated library for manipulating JPEG image files Name: libjpeg-turbo Epoch: 1 -Version: 2.0.2 -Release: 7 +Version: 2.0.3 +Release: 1 License: wxWindows Library License Group: System/Libraries Url: http://www.libjpeg-turbo.org From 68b40169300fb55b3808714f69926cc10ea02162 Mon Sep 17 00:00:00 2001 From: Andrey Bondrov Date: Thu, 14 Nov 2019 16:18:23 +1000 Subject: [PATCH 13/32] Add patch to fix CVE-2019-2201 --- libjpeg-turbo-2.0.3-CVE-2019-2201.patch | 54 +++++++++++++++++++++++++ libjpeg-turbo.spec | 8 ++-- 2 files changed, 59 insertions(+), 3 deletions(-) create mode 100644 libjpeg-turbo-2.0.3-CVE-2019-2201.patch diff --git a/libjpeg-turbo-2.0.3-CVE-2019-2201.patch b/libjpeg-turbo-2.0.3-CVE-2019-2201.patch new file mode 100644 index 0000000..78483a7 --- /dev/null +++ b/libjpeg-turbo-2.0.3-CVE-2019-2201.patch @@ -0,0 +1,54 @@ +From 6548377e7ed7653e555dac0b9faa4152b00ecbaf Mon Sep 17 00:00:00 2001 +From: DRC +Date: Tue, 12 Nov 2019 12:11:54 -0800 +Subject: 
[PATCH] 64-bit tjbench: Fix signed int overflow/segfault + +... that occurred when attempting to decompress images with more than +715827882 (2048*1024*1024 / 3) pixels. + +Fixes #388 +--- + ChangeLog.md | 4 ++++ + tjbench.c | 4 ++-- + 2 files changed, 6 insertions(+), 2 deletions(-) + +diff --git a/ChangeLog.md b/ChangeLog.md +index f5fe44b..4e501c5 100644 +--- a/ChangeLog.md ++++ b/ChangeLog.md +@@ -41,6 +41,10 @@ end of a single-scan (non-progressive) image, subsequent calls to + JPEG images that were compressed with a sampling factor other than 1 (for + instance, with `cjpeg -grayscale -sample 2x2`). + ++2. Fixed a signed integer overflow and subsequent segfault that occurred when ++attempting to decompress images with more than 715827882 pixels using the ++64-bit C version of TJBench. ++ + + 1.5.2 + ===== +diff --git a/tjbench.c b/tjbench.c +index 76b61cd..393c014 100644 +--- a/tjbench.c ++++ b/tjbench.c +@@ -133,7 +133,7 @@ int decomp(unsigned char *srcbuf, unsigned char **jpegbuf, + } + /* Set the destination buffer to gray so we know whether the decompressor + attempted to write to it */ +- memset(dstbuf, 127, pitch*scaledh); ++ memset(dstbuf, 127, (size_t)pitch * scaledh); + + if(doyuv) + { +@@ -152,7 +152,7 @@ int decomp(unsigned char *srcbuf, unsigned char **jpegbuf, + { + int tile=0; + double start=gettime(); +- for(row=0, dstptr=dstbuf; row= 1.4 %ifarch %{ix86} x86_64 @@ -162,6 +163,7 @@ have orientation markings in the EXIF data. %patch0 -p0 %patch1 -p1 %patch2 -p1 +%patch3 -p1 cp %{SOURCE2} jpegexiforient.c cp %{SOURCE3} exifautotran From a6178c6cc27cdc59b50d784c8693b2cafd775651 Mon Sep 17 00:00:00 2001 From: Andrey Bondrov Date: Thu, 14 Nov 2019 16:30:23 +1000 Subject: [PATCH 14/32] Use fix for CVE-2019-2201 for this version, not for 1.5.3 --- libjpeg-turbo-2.0.3-CVE-2019-2201.patch | 53 ++++++++++++------------- 1 file changed, 25 insertions(+), 28 deletions(-) diff --git a/libjpeg-turbo-2.0.3-CVE-2019-2201.patch b/libjpeg-turbo-2.0.3-CVE-2019-2201.patch index 78483a7..6cc70b7 100644 --- a/libjpeg-turbo-2.0.3-CVE-2019-2201.patch +++ b/libjpeg-turbo-2.0.3-CVE-2019-2201.patch @@ -1,6 +1,6 @@ -From 6548377e7ed7653e555dac0b9faa4152b00ecbaf Mon Sep 17 00:00:00 2001 +From c30b1e72dac76343ef9029833d1561de07d29bad Mon Sep 17 00:00:00 2001 From: DRC -Date: Tue, 12 Nov 2019 12:11:54 -0800 +Date: Tue, 12 Nov 2019 12:27:22 -0600 Subject: [PATCH] 64-bit tjbench: Fix signed int overflow/segfault ... that occurred when attempting to decompress images with more than @@ -13,42 +13,39 @@ Fixes #388 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/ChangeLog.md b/ChangeLog.md -index f5fe44b..4e501c5 100644 +index 4d6960f3d..da160f0f1 100644 --- a/ChangeLog.md +++ b/ChangeLog.md -@@ -41,6 +41,10 @@ end of a single-scan (non-progressive) image, subsequent calls to - JPEG images that were compressed with a sampling factor other than 1 (for - instance, with `cjpeg -grayscale -sample 2x2`). +@@ -8,6 +8,10 @@ + 64-bit libjpeg-turbo SDK for Visual C++ were installed on the same system, only + one of them could be uninstalled. +2. Fixed a signed integer overflow and subsequent segfault that occurred when +attempting to decompress images with more than 715827882 pixels using the +64-bit C version of TJBench. 
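
Annotation, not part of either patch: the 715827882 threshold follows directly from 32-bit arithmetic. With 3-byte RGB pixels, `pitch * scaledh` is 3 * width * height evaluated in a signed int, and INT_MAX / 3 = 715827882. A minimal sketch of the overflow the `(size_t)` casts avoid, with made-up image dimensions:

#include <limits.h>
#include <stddef.h>
#include <stdio.h>

int main(void)
{
  /* 26755 * 26755 = 715830025 pixels, just over INT_MAX / 3 */
  int pitch = 3 * 26755, scaledh = 26755;
  size_t bad  = pitch * scaledh;          /* product computed in 32-bit int:
                                             signed overflow (undefined
                                             behavior; wraps in practice) */
  size_t good = (size_t)pitch * scaledh;  /* operand widened first, so the
                                             multiply happens in 64-bit */
  printf("%zu vs %zu\n", bad, good);
  return 0;
}
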
+ - 1.5.2 + 2.0.3 ===== diff --git a/tjbench.c b/tjbench.c -index 76b61cd..393c014 100644 +index a7d397318..13a5bde62 100644 --- a/tjbench.c +++ b/tjbench.c -@@ -133,7 +133,7 @@ int decomp(unsigned char *srcbuf, unsigned char **jpegbuf, - } - /* Set the destination buffer to gray so we know whether the decompressor - attempted to write to it */ -- memset(dstbuf, 127, pitch*scaledh); -+ memset(dstbuf, 127, (size_t)pitch * scaledh); +@@ -171,7 +171,7 @@ static int decomp(unsigned char *srcBuf, unsigned char **jpegBuf, + } + /* Set the destination buffer to gray so we know whether the decompressor + attempted to write to it */ +- memset(dstBuf, 127, pitch * scaledh); ++ memset(dstBuf, 127, (size_t)pitch * scaledh); - if(doyuv) - { -@@ -152,7 +152,7 @@ int decomp(unsigned char *srcbuf, unsigned char **jpegbuf, - { - int tile=0; - double start=gettime(); -- for(row=0, dstptr=dstbuf; row Date: Sat, 4 Jan 2020 04:18:41 +0300 Subject: [PATCH 15/32] bot: rpm5 -> rpm4 (1) --- libjpeg-turbo.spec | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/libjpeg-turbo.spec b/libjpeg-turbo.spec index 0da4f1d..88dbdee 100644 --- a/libjpeg-turbo.spec +++ b/libjpeg-turbo.spec @@ -179,7 +179,7 @@ pushd jpeg8 CFLAGS="%{optflags} -Ofast -funroll-loops" \ %cmake ../.. -DWITH_JPEG8="True" -%make -s +%make_build -s popd # Build jpeg v6.2 API @@ -189,7 +189,7 @@ CFLAGS="%{optflags} -Ofast -funroll-loops" \ -DWITH_ARITH_DEC="True" \ -DWITH_ARITH_ENC="True" -%make -s +%make_build -s popd # Build jpegexiforient binary @@ -197,8 +197,8 @@ popd %install -%makeinstall_std -C jpeg8/build -%makeinstall_std -C jpeg62/build +%make_install -C jpeg8/build +%make_install -C jpeg62/build install -m755 jpegexiforient -D %{buildroot}%{_bindir}/jpegexiforient install -m755 exifautotran -D %{buildroot}%{_bindir}/exifautotran From 1468df33daddd7b3e93fe656de3a7b86bc79f95a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BB=D0=B7=D0=B8=D0=BC?= Date: Tue, 7 Jan 2020 21:48:40 +0300 Subject: [PATCH 16/32] Updated to 2.0.4 --- .abf.yml | 2 +- libjpeg-turbo-2.0.3-CVE-2019-2201.patch | 51 ------------------------- libjpeg-turbo.spec | 6 +-- 3 files changed, 3 insertions(+), 56 deletions(-) delete mode 100644 libjpeg-turbo-2.0.3-CVE-2019-2201.patch diff --git a/.abf.yml b/.abf.yml index b87805f..67c52a0 100644 --- a/.abf.yml +++ b/.abf.yml @@ -1,2 +1,2 @@ sources: - libjpeg-turbo-2.0.3.tar.gz: 539363a444f92421c098a1a3e7cebfda48d4cfb3 + libjpeg-turbo-2.0.4.tar.gz: 163d8f96d0999526a117de0388624241b54dcd67 diff --git a/libjpeg-turbo-2.0.3-CVE-2019-2201.patch b/libjpeg-turbo-2.0.3-CVE-2019-2201.patch deleted file mode 100644 index 6cc70b7..0000000 --- a/libjpeg-turbo-2.0.3-CVE-2019-2201.patch +++ /dev/null @@ -1,51 +0,0 @@ -From c30b1e72dac76343ef9029833d1561de07d29bad Mon Sep 17 00:00:00 2001 -From: DRC -Date: Tue, 12 Nov 2019 12:27:22 -0600 -Subject: [PATCH] 64-bit tjbench: Fix signed int overflow/segfault - -... that occurred when attempting to decompress images with more than -715827882 (2048*1024*1024 / 3) pixels. - -Fixes #388 ---- - ChangeLog.md | 4 ++++ - tjbench.c | 4 ++-- - 2 files changed, 6 insertions(+), 2 deletions(-) - -diff --git a/ChangeLog.md b/ChangeLog.md -index 4d6960f3d..da160f0f1 100644 ---- a/ChangeLog.md -+++ b/ChangeLog.md -@@ -8,6 +8,10 @@ - 64-bit libjpeg-turbo SDK for Visual C++ were installed on the same system, only - one of them could be uninstalled. - -+2. 
Fixed a signed integer overflow and subsequent segfault that occurred when -+attempting to decompress images with more than 715827882 pixels using the -+64-bit C version of TJBench. -+ - - 2.0.3 - ===== -diff --git a/tjbench.c b/tjbench.c -index a7d397318..13a5bde62 100644 ---- a/tjbench.c -+++ b/tjbench.c -@@ -171,7 +171,7 @@ static int decomp(unsigned char *srcBuf, unsigned char **jpegBuf, - } - /* Set the destination buffer to gray so we know whether the decompressor - attempted to write to it */ -- memset(dstBuf, 127, pitch * scaledh); -+ memset(dstBuf, 127, (size_t)pitch * scaledh); - - if (doYUV) { - int width = doTile ? tilew : scaledw; -@@ -193,7 +193,7 @@ static int decomp(unsigned char *srcBuf, unsigned char **jpegBuf, - double start = getTime(); - - for (row = 0, dstPtr = dstBuf; row < ntilesh; -- row++, dstPtr += pitch * tileh) { -+ row++, dstPtr += (size_t)pitch * tileh) { - for (col = 0, dstPtr2 = dstPtr; col < ntilesw; - col++, tile++, dstPtr2 += ps * tilew) { - int width = doTile ? min(tilew, w - col * tilew) : scaledw; diff --git a/libjpeg-turbo.spec b/libjpeg-turbo.spec index 88dbdee..2818af0 100644 --- a/libjpeg-turbo.spec +++ b/libjpeg-turbo.spec @@ -12,8 +12,8 @@ Summary: A MMX/SSE2 accelerated library for manipulating JPEG image files Name: libjpeg-turbo Epoch: 1 -Version: 2.0.3 -Release: 2 +Version: 2.0.4 +Release: 1 License: wxWindows Library License Group: System/Libraries Url: http://www.libjpeg-turbo.org @@ -27,7 +27,6 @@ Source3: http://jpegclub.org/exifautotran.txt Patch0: jpeg-6b-c++fixes.patch Patch1: libjpeg-turbo-2.0.0-CVE-2017-15232.patch Patch2: libjpeg-turbo-2.0.0-fix-lib-path.patch -Patch3: libjpeg-turbo-2.0.3-CVE-2019-2201.patch BuildRequires: cmake BuildRequires: libtool >= 1.4 %ifarch %{ix86} x86_64 @@ -163,7 +162,6 @@ have orientation markings in the EXIF data. %patch0 -p0 %patch1 -p1 %patch2 -p1 -%patch3 -p1 cp %{SOURCE2} jpegexiforient.c cp %{SOURCE3} exifautotran From 7196447d0787551b13098c5ad51a3c66fc78af6e Mon Sep 17 00:00:00 2001 From: NixTux Commit Bot Date: Mon, 13 Jan 2020 15:46:22 +0300 Subject: [PATCH 17/32] bot: rpm5 -> rpm4 (8) --- libjpeg-turbo.spec | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/libjpeg-turbo.spec b/libjpeg-turbo.spec index 2818af0..872cce0 100644 --- a/libjpeg-turbo.spec +++ b/libjpeg-turbo.spec @@ -177,7 +177,7 @@ pushd jpeg8 CFLAGS="%{optflags} -Ofast -funroll-loops" \ %cmake ../.. 
-DWITH_JPEG8="True" -%make_build -s +%make -s popd # Build jpeg v6.2 API @@ -187,7 +187,7 @@ CFLAGS="%{optflags} -Ofast -funroll-loops" \ -DWITH_ARITH_DEC="True" \ -DWITH_ARITH_ENC="True" -%make_build -s +%make -s popd # Build jpegexiforient binary @@ -195,8 +195,8 @@ popd %install -%make_install -C jpeg8/build -%make_install -C jpeg62/build +%makeinstall_std -C jpeg8/build +%makeinstall_std -C jpeg62/build install -m755 jpegexiforient -D %{buildroot}%{_bindir}/jpegexiforient install -m755 exifautotran -D %{buildroot}%{_bindir}/exifautotran From 19d61b646de522fbf20f74d4ce679fad6b5b4dbe Mon Sep 17 00:00:00 2001 From: Alzim Date: Wed, 24 Jun 2020 10:08:57 +0300 Subject: [PATCH 18/32] Updated to 2.0.5 --- .abf.yml | 2 +- libjpeg-turbo.spec | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.abf.yml b/.abf.yml index 67c52a0..952eb71 100644 --- a/.abf.yml +++ b/.abf.yml @@ -1,2 +1,2 @@ sources: - libjpeg-turbo-2.0.4.tar.gz: 163d8f96d0999526a117de0388624241b54dcd67 + libjpeg-turbo-2.0.5.tar.gz: 9d4c565d402b2f5661be78d76098073ec7e30f10 diff --git a/libjpeg-turbo.spec b/libjpeg-turbo.spec index 872cce0..d01e602 100644 --- a/libjpeg-turbo.spec +++ b/libjpeg-turbo.spec @@ -12,7 +12,7 @@ Summary: A MMX/SSE2 accelerated library for manipulating JPEG image files Name: libjpeg-turbo Epoch: 1 -Version: 2.0.4 +Version: 2.0.5 Release: 1 License: wxWindows Library License Group: System/Libraries From eaf934059314f191eb157e704b512f91f288d5e0 Mon Sep 17 00:00:00 2001 From: Alexander Stefanov Date: Wed, 18 Nov 2020 20:32:40 +0000 Subject: [PATCH 19/32] version autoupdate [2.0.6] --- .abf.yml | 2 +- libjpeg-turbo.spec | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.abf.yml b/.abf.yml index 952eb71..41506b4 100644 --- a/.abf.yml +++ b/.abf.yml @@ -1,2 +1,2 @@ sources: - libjpeg-turbo-2.0.5.tar.gz: 9d4c565d402b2f5661be78d76098073ec7e30f10 + libjpeg-turbo-2.0.6.tar.gz: 5406c7676d7df89fb4da791ad5af51202910fb25 diff --git a/libjpeg-turbo.spec b/libjpeg-turbo.spec index d01e602..9575536 100644 --- a/libjpeg-turbo.spec +++ b/libjpeg-turbo.spec @@ -12,7 +12,7 @@ Summary: A MMX/SSE2 accelerated library for manipulating JPEG image files Name: libjpeg-turbo Epoch: 1 -Version: 2.0.5 +Version: 2.0.6 Release: 1 License: wxWindows Library License Group: System/Libraries From c4ad17884483776110cb6ddfec22dbb48d1096ed Mon Sep 17 00:00:00 2001 From: Alexander Stefanov Date: Sun, 25 Apr 2021 15:15:23 +0000 Subject: [PATCH 20/32] 2.1.0 --- .abf.yml | 2 +- libjpeg-turbo-2.0.0-CVE-2017-15232.patch | 40 ------------------------ libjpeg-turbo-2.0.0-fix-lib-path.patch | 12 ------- libjpeg-turbo.spec | 7 ++--- 4 files changed, 3 insertions(+), 58 deletions(-) delete mode 100644 libjpeg-turbo-2.0.0-CVE-2017-15232.patch delete mode 100644 libjpeg-turbo-2.0.0-fix-lib-path.patch diff --git a/.abf.yml b/.abf.yml index 41506b4..6ecbf0c 100644 --- a/.abf.yml +++ b/.abf.yml @@ -1,2 +1,2 @@ sources: - libjpeg-turbo-2.0.6.tar.gz: 5406c7676d7df89fb4da791ad5af51202910fb25 + libjpeg-turbo-2.1.0.tar.gz: 6bf63c869105d341011cd4915816de888338231a diff --git a/libjpeg-turbo-2.0.0-CVE-2017-15232.patch b/libjpeg-turbo-2.0.0-CVE-2017-15232.patch deleted file mode 100644 index 22cb61b..0000000 --- a/libjpeg-turbo-2.0.0-CVE-2017-15232.patch +++ /dev/null @@ -1,40 +0,0 @@ -diff -rupN libjpeg-turbo-2.0.0.old/jdpostct.c libjpeg-turbo-2.0.0/jdpostct.c ---- libjpeg-turbo-2.0.0.old/jdpostct.c 2018-07-27 18:47:48.000000000 +0200 -+++ libjpeg-turbo-2.0.0/jdpostct.c 2018-08-01 16:10:01.647692751 +0200 -@@ -137,6 +137,11 @@ 
post_process_1pass(j_decompress_ptr cinf - my_post_ptr post = (my_post_ptr)cinfo->post; - JDIMENSION num_rows, max_rows; - -+ /* read_and_discard_scanlines may call it with rows "available", but no buffer */ -+ if (output_buf == NULL) { -+ return; -+ } -+ - /* Fill the buffer, but not more than what we can dump out in one go. */ - /* Note we rely on the upsampler to detect bottom of image. */ - max_rows = out_rows_avail - *out_row_ctr; -diff -rupN libjpeg-turbo-2.0.0.old/jerror.h libjpeg-turbo-2.0.0/jerror.h ---- libjpeg-turbo-2.0.0.old/jerror.h 2018-07-27 18:47:48.000000000 +0200 -+++ libjpeg-turbo-2.0.0/jerror.h 2018-08-01 16:11:31.554054434 +0200 -@@ -207,6 +207,7 @@ JMESSAGE(JWRN_ARITH_BAD_CODE, "Corrupt J - #endif - #endif - JMESSAGE(JWRN_BOGUS_ICC, "Corrupt JPEG data: bad ICC marker") -+JMESSAGE(JERR_BAD_PARAM, "Bogus parameter") - - #ifdef JMAKE_ENUM_LIST - -diff -rupN libjpeg-turbo-2.0.0.old/jquant1.c libjpeg-turbo-2.0.0/jquant1.c ---- libjpeg-turbo-2.0.0.old/jquant1.c 2018-07-27 18:47:48.000000000 +0200 -+++ libjpeg-turbo-2.0.0/jquant1.c 2018-08-01 16:12:44.437913930 +0200 -@@ -532,6 +532,10 @@ quantize_ord_dither(j_decompress_ptr cin - JDIMENSION col; - JDIMENSION width = cinfo->output_width; - -+ if (output_buf == NULL && num_rows) { -+ ERREXIT(cinfo, JERR_BAD_PARAM); -+ } -+ - for (row = 0; row < num_rows; row++) { - /* Initialize output values to 0 so can process components separately */ - jzero_far((void *)output_buf[row], (size_t)(width * sizeof(JSAMPLE))); diff --git a/libjpeg-turbo-2.0.0-fix-lib-path.patch b/libjpeg-turbo-2.0.0-fix-lib-path.patch deleted file mode 100644 index d0ba187..0000000 --- a/libjpeg-turbo-2.0.0-fix-lib-path.patch +++ /dev/null @@ -1,12 +0,0 @@ -diff -rupN libjpeg-turbo-2.0.0.old/CMakeLists.txt libjpeg-turbo-2.0.0/CMakeLists.txt ---- libjpeg-turbo-2.0.0.old/CMakeLists.txt 2018-07-27 18:47:48.000000000 +0200 -+++ libjpeg-turbo-2.0.0/CMakeLists.txt 2018-08-01 16:24:47.625312813 +0200 -@@ -102,7 +102,7 @@ if(CMAKE_INSTALL_PREFIX STREQUAL "${CMAK - if(BITS EQUAL 64) - set(CMAKE_INSTALL_DEFAULT_LIBDIR "lib64") - else() -- set(CMAKE_INSTALL_DEFAULT_LIBDIR "lib32") -+ set(CMAKE_INSTALL_DEFAULT_LIBDIR "lib") - endif() - endif() - endif() diff --git a/libjpeg-turbo.spec b/libjpeg-turbo.spec index 9575536..a4cc0fb 100644 --- a/libjpeg-turbo.spec +++ b/libjpeg-turbo.spec @@ -12,7 +12,7 @@ Summary: A MMX/SSE2 accelerated library for manipulating JPEG image files Name: libjpeg-turbo Epoch: 1 -Version: 2.0.6 +Version: 2.1.0 Release: 1 License: wxWindows Library License Group: System/Libraries @@ -25,8 +25,6 @@ Source0: https://sourceforge.net/projects/libjpeg-turbo/files/%{version}/%{name} Source2: http://jpegclub.org/jpegexiforient.c Source3: http://jpegclub.org/exifautotran.txt Patch0: jpeg-6b-c++fixes.patch -Patch1: libjpeg-turbo-2.0.0-CVE-2017-15232.patch -Patch2: libjpeg-turbo-2.0.0-fix-lib-path.patch BuildRequires: cmake BuildRequires: libtool >= 1.4 %ifarch %{ix86} x86_64 @@ -107,6 +105,7 @@ library. %{_includedir}/*.h %{_libdir}/pkgconfig/libjpeg.pc %{_libdir}/pkgconfig/libturbojpeg.pc +%{_libdir}/cmake/%{name}/*.cmake #---------------------------------------------------------------------------- @@ -160,8 +159,6 @@ have orientation markings in the EXIF data. 
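
Annotation: Patch1 (the CVE-2017-15232 backport) and Patch2 (the libdir fix) are dropped with the move to 2.1.0, presumably because upstream had absorbed equivalent fixes by then. For reference, the code path the CVE patch guarded is reachable as in the hedged sketch below; it assumes `cinfo` already has a source attached and `jpeg_read_header()` has been called, and `skip_top_rows` is an illustrative helper name, not distro or upstream code.

#include <stdio.h>
#include <jpeglib.h>

/* Sketch only: skipping rows with color quantization enabled routes
 * through read_and_discard_scanlines(), which hands the post-processor
 * output_buf == NULL, exactly the case the dropped patch checked for. */
void skip_top_rows(j_decompress_ptr cinfo, JDIMENSION rows)
{
  cinfo->quantize_colors = TRUE;     /* selects a quantizing post-processor */
  jpeg_start_decompress(cinfo);
  jpeg_skip_scanlines(cinfo, rows);  /* discards rows via a NULL buffer */
}
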
%prep %setup -q %patch0 -p0 -%patch1 -p1 -%patch2 -p1 cp %{SOURCE2} jpegexiforient.c cp %{SOURCE3} exifautotran From 894f13f78028318e23868e72182c06174f3543f5 Mon Sep 17 00:00:00 2001 From: Andrey Grigorev Date: Wed, 11 Aug 2021 12:07:42 +0000 Subject: [PATCH 21/32] MassBuild#2340: Increase release tag --- libjpeg-turbo.spec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libjpeg-turbo.spec b/libjpeg-turbo.spec index a4cc0fb..8544378 100644 --- a/libjpeg-turbo.spec +++ b/libjpeg-turbo.spec @@ -13,7 +13,7 @@ Summary: A MMX/SSE2 accelerated library for manipulating JPEG image files Name: libjpeg-turbo Epoch: 1 Version: 2.1.0 -Release: 1 +Release: 2 License: wxWindows Library License Group: System/Libraries Url: http://www.libjpeg-turbo.org From 872d3449185580d7c24c1c286572758bd57520d5 Mon Sep 17 00:00:00 2001 From: Alexander Stefanov Date: Wed, 25 May 2022 20:54:55 +0000 Subject: [PATCH 22/32] version autoupdate [2.1.3] --- .abf.yml | 2 +- libjpeg-turbo.spec | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.abf.yml b/.abf.yml index 6ecbf0c..37b6818 100644 --- a/.abf.yml +++ b/.abf.yml @@ -1,2 +1,2 @@ sources: - libjpeg-turbo-2.1.0.tar.gz: 6bf63c869105d341011cd4915816de888338231a + libjpeg-turbo-2.1.3.tar.gz: 6dec48193bb27e1c07abae8230031ce9ecb1cfec diff --git a/libjpeg-turbo.spec b/libjpeg-turbo.spec index 8544378..092dd05 100644 --- a/libjpeg-turbo.spec +++ b/libjpeg-turbo.spec @@ -12,8 +12,8 @@ Summary: A MMX/SSE2 accelerated library for manipulating JPEG image files Name: libjpeg-turbo Epoch: 1 -Version: 2.1.0 -Release: 2 +Version: 2.1.3 +Release: 1 License: wxWindows Library License Group: System/Libraries Url: http://www.libjpeg-turbo.org From 6f9e8675f02fd694751ec1542daf77851a80ac1e Mon Sep 17 00:00:00 2001 From: Aleksandr Proklov Date: Sat, 21 Jan 2023 04:51:14 +0300 Subject: [PATCH 23/32] Update version to 2.1.4 --- .abf.yml | 2 +- libjpeg-turbo.spec | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.abf.yml b/.abf.yml index 37b6818..471d715 100644 --- a/.abf.yml +++ b/.abf.yml @@ -1,2 +1,2 @@ sources: - libjpeg-turbo-2.1.3.tar.gz: 6dec48193bb27e1c07abae8230031ce9ecb1cfec + libjpeg-turbo-2.1.4.tar.gz: 5a355c08caa326cef7c2a61e062edfe8dd02ac07 diff --git a/libjpeg-turbo.spec b/libjpeg-turbo.spec index 092dd05..67966ab 100644 --- a/libjpeg-turbo.spec +++ b/libjpeg-turbo.spec @@ -12,7 +12,7 @@ Summary: A MMX/SSE2 accelerated library for manipulating JPEG image files Name: libjpeg-turbo Epoch: 1 -Version: 2.1.3 +Version: 2.1.4 Release: 1 License: wxWindows Library License Group: System/Libraries From 63ff62a2ea8c92ff83ebab7b6afe9d9f327f2e71 Mon Sep 17 00:00:00 2001 From: Alexander Stefanov Date: Wed, 15 Feb 2023 23:39:48 +0000 Subject: [PATCH 24/32] version autoupdate [2.1.5.1] --- .abf.yml | 2 +- libjpeg-turbo.spec | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.abf.yml b/.abf.yml index 471d715..b93af24 100644 --- a/.abf.yml +++ b/.abf.yml @@ -1,2 +1,2 @@ sources: - libjpeg-turbo-2.1.4.tar.gz: 5a355c08caa326cef7c2a61e062edfe8dd02ac07 + libjpeg-turbo-2.1.5.1.tar.gz: 3ec9f6a19781a583285d93c2c4653f3dbe845fcc diff --git a/libjpeg-turbo.spec b/libjpeg-turbo.spec index 67966ab..fd24628 100644 --- a/libjpeg-turbo.spec +++ b/libjpeg-turbo.spec @@ -12,7 +12,7 @@ Summary: A MMX/SSE2 accelerated library for manipulating JPEG image files Name: libjpeg-turbo Epoch: 1 -Version: 2.1.4 +Version: 2.1.5.1 Release: 1 License: wxWindows Library License Group: System/Libraries From c0ef24082445f368c4af25db46e84105e4bfdc09 Mon Sep 
17 00:00:00 2001 From: Aleksandr Proklov Date: Wed, 6 Mar 2024 11:07:59 +0900 Subject: [PATCH 25/32] Update version to 3.0.1 --- .abf.yml | 2 +- libjpeg-turbo.spec | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.abf.yml b/.abf.yml index b93af24..e40fea7 100644 --- a/.abf.yml +++ b/.abf.yml @@ -1,2 +1,2 @@ sources: - libjpeg-turbo-2.1.5.1.tar.gz: 3ec9f6a19781a583285d93c2c4653f3dbe845fcc + libjpeg-turbo-3.0.1.tar.gz: 965299edf1bfff8a604f302c7a0d249309339236 diff --git a/libjpeg-turbo.spec b/libjpeg-turbo.spec index fd24628..5f38e75 100644 --- a/libjpeg-turbo.spec +++ b/libjpeg-turbo.spec @@ -12,7 +12,7 @@ Summary: A MMX/SSE2 accelerated library for manipulating JPEG image files Name: libjpeg-turbo Epoch: 1 -Version: 2.1.5.1 +Version: 3.0.1 Release: 1 License: wxWindows Library License Group: System/Libraries From 9cfeeb5e4f9ba850b50073ab468708465b210a21 Mon Sep 17 00:00:00 2001 From: Giovanni Mariani Date: Sun, 10 Mar 2024 16:27:46 +0100 Subject: [PATCH 26/32] Updated to release 3.0.2 --- .abf.yml | 2 +- libjpeg-turbo.spec | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.abf.yml b/.abf.yml index e40fea7..a52b704 100644 --- a/.abf.yml +++ b/.abf.yml @@ -1,2 +1,2 @@ sources: - libjpeg-turbo-3.0.1.tar.gz: 965299edf1bfff8a604f302c7a0d249309339236 + libjpeg-turbo-3.0.2.tar.gz: b6c5d5081ced8502eb1e1e72f1f5cc2856ce90ee diff --git a/libjpeg-turbo.spec b/libjpeg-turbo.spec index 5f38e75..6497ff7 100644 --- a/libjpeg-turbo.spec +++ b/libjpeg-turbo.spec @@ -12,10 +12,10 @@ Summary: A MMX/SSE2 accelerated library for manipulating JPEG image files Name: libjpeg-turbo Epoch: 1 -Version: 3.0.1 +Version: 3.0.2 Release: 1 License: wxWindows Library License -Group: System/Libraries +Group: Graphics Url: http://www.libjpeg-turbo.org Source0: https://sourceforge.net/projects/libjpeg-turbo/files/%{version}/%{name}-%{version}.tar.gz # These two allow automatic lossless rotation of JPEG images from a digital @@ -201,5 +201,5 @@ install -m755 exifautotran -D %{buildroot}%{_bindir}/exifautotran #(neoclust) Provide jpegint.h because it is needed by certain software install -m644 jpegint.h -D %{buildroot}%{_includedir}/jpegint.h -# cleanup +# Cleanup rm -rf %{buildroot}%{_docdir}/* From cdc13abac1cec10e31354a9834bf080b05766996 Mon Sep 17 00:00:00 2001 From: Alexander Stefanov Date: Sun, 31 Mar 2024 04:34:43 +0300 Subject: [PATCH 27/32] add e2k --- libjpeg-turbo-3.0.2-e2k.patch | 4897 +++++++++++++++++++++++++++++++++ libjpeg-turbo.spec | 10 +- 2 files changed, 4903 insertions(+), 4 deletions(-) create mode 100644 libjpeg-turbo-3.0.2-e2k.patch diff --git a/libjpeg-turbo-3.0.2-e2k.patch b/libjpeg-turbo-3.0.2-e2k.patch new file mode 100644 index 0000000..a80f1d3 --- /dev/null +++ b/libjpeg-turbo-3.0.2-e2k.patch @@ -0,0 +1,4897 @@ +From 5c6ff06bc9aec237e1ba222a3dde057cbfa81c9d Mon Sep 17 00:00:00 2001 +From: Ilya Kurdyukov +Date: Fri, 9 Feb 2024 09:57:55 +0700 +Subject: [PATCH] libjpeg-turbo-3.0.2 e2k support + +--- + CMakeLists.txt | 5 + + simd/CMakeLists.txt | 23 ++ + simd/e2k/jccolext-e2k.c | 213 +++++++++++ + simd/e2k/jccolor-e2k.c | 163 +++++++++ + simd/e2k/jchuff-e2k.c | 307 ++++++++++++++++ + simd/e2k/jcphuff-e2k.c | 145 ++++++++ + simd/e2k/jcsample-e2k.c | 203 +++++++++++ + simd/e2k/jcsample.h | 28 ++ + simd/e2k/jdcolext-e2k.c | 258 +++++++++++++ + simd/e2k/jdcolor-e2k.c | 289 +++++++++++++++ + simd/e2k/jdcoltab-e2k.c | 80 ++++ + simd/e2k/jdsample-e2k.c | 389 ++++++++++++++++++++ + simd/e2k/jfdctflt-e2k.c | 127 +++++++ + simd/e2k/jfdctfst-e2k.c | 145 ++++++++ + 
simd/e2k/jfdctint-e2k.c | 255 +++++++++++++ + simd/e2k/jidctflt-e2k.c | 215 +++++++++++ + simd/e2k/jidctfst-e2k.c | 187 ++++++++++ + simd/e2k/jidctint-e2k.c | 294 +++++++++++++++ + simd/e2k/jquantf-e2k.c | 121 +++++++ + simd/e2k/jquanti-e2k.c | 178 +++++++++ + simd/e2k/jsimd.c | 761 +++++++++++++++++++++++++++++++++++++++ + simd/e2k/jsimd_api_e2k.h | 94 +++++ + simd/e2k/jsimd_e2k.h | 207 +++++++++++ + 23 files changed, 4687 insertions(+) + create mode 100644 simd/e2k/jccolext-e2k.c + create mode 100644 simd/e2k/jccolor-e2k.c + create mode 100644 simd/e2k/jchuff-e2k.c + create mode 100644 simd/e2k/jcphuff-e2k.c + create mode 100644 simd/e2k/jcsample-e2k.c + create mode 100644 simd/e2k/jcsample.h + create mode 100644 simd/e2k/jdcolext-e2k.c + create mode 100644 simd/e2k/jdcolor-e2k.c + create mode 100644 simd/e2k/jdcoltab-e2k.c + create mode 100644 simd/e2k/jdsample-e2k.c + create mode 100644 simd/e2k/jfdctflt-e2k.c + create mode 100644 simd/e2k/jfdctfst-e2k.c + create mode 100644 simd/e2k/jfdctint-e2k.c + create mode 100644 simd/e2k/jidctflt-e2k.c + create mode 100644 simd/e2k/jidctfst-e2k.c + create mode 100644 simd/e2k/jidctint-e2k.c + create mode 100644 simd/e2k/jquantf-e2k.c + create mode 100644 simd/e2k/jquanti-e2k.c + create mode 100644 simd/e2k/jsimd.c + create mode 100644 simd/e2k/jsimd_api_e2k.h + create mode 100644 simd/e2k/jsimd_e2k.h + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index adb0ca4..3b445a0 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -125,6 +125,9 @@ elseif(CMAKE_SYSTEM_PROCESSOR_LC STREQUAL "aarch64" OR + elseif(CMAKE_SYSTEM_PROCESSOR_LC MATCHES "^ppc" OR + CMAKE_SYSTEM_PROCESSOR_LC MATCHES "^powerpc") + set(CPU_TYPE powerpc) ++elseif(CMAKE_SYSTEM_PROCESSOR_LC STREQUAL "e2k" OR ++ CMAKE_SYSTEM_PROCESSOR_LC STREQUAL "elbrus") ++ set(CPU_TYPE e2k) + else() + set(CPU_TYPE ${CMAKE_SYSTEM_PROCESSOR_LC}) + endif() +@@ -906,6 +909,8 @@ if(CPU_TYPE STREQUAL "x86_64" OR CPU_TYPE STREQUAL "i386") + elseif(CPU_TYPE STREQUAL "x86_64") + set(DEFAULT_FLOATTEST8 no-fp-contract) + endif() ++elseif(WITH_SIMD AND CPU_TYPE STREQUAL "e2k") ++ set(DEFAULT_FLOATTEST8 sse) + elseif(CPU_TYPE STREQUAL "powerpc" OR CPU_TYPE STREQUAL "arm64") + if(CMAKE_C_COMPILER_ID STREQUAL "Clang") + if(CMAKE_C_COMPILER_VERSION VERSION_EQUAL 14.0.0 OR +diff --git a/simd/CMakeLists.txt b/simd/CMakeLists.txt +index 6024900..789fa3f 100644 +--- a/simd/CMakeLists.txt ++++ b/simd/CMakeLists.txt +@@ -531,6 +531,29 @@ if(CMAKE_POSITION_INDEPENDENT_CODE OR ENABLE_SHARED) + set_target_properties(simd PROPERTIES POSITION_INDEPENDENT_CODE 1) + endif() + ++############################################################################### ++# Elbrus (Intrinsics) ++############################################################################### ++ ++elseif(CPU_TYPE STREQUAL "e2k") ++ ++set(SIMD_SOURCES e2k/jquanti-e2k.c e2k/jquantf-e2k.c ++ e2k/jccolor-e2k.c e2k/jcsample-e2k.c ++ e2k/jdcolor-e2k.c e2k/jdsample-e2k.c ++ e2k/jfdctint-e2k.c e2k/jfdctfst-e2k.c e2k/jfdctflt-e2k.c ++ e2k/jidctint-e2k.c e2k/jidctfst-e2k.c e2k/jidctflt-e2k.c ++ e2k/jchuff-e2k.c e2k/jcphuff-e2k.c) ++ ++set_source_files_properties(${SIMD_SOURCES} PROPERTIES ++ COMPILE_FLAGS -msse4.1) ++ ++set(SIMD_SOURCES ${SIMD_SOURCES} e2k/jsimd.c) ++ ++add_library(simd OBJECT ${SIMD_SOURCES}) ++ ++if(CMAKE_POSITION_INDEPENDENT_CODE OR ENABLE_SHARED) ++ set_target_properties(simd PROPERTIES POSITION_INDEPENDENT_CODE 1) ++endif() + + ############################################################################### + # None +diff --git 
a/simd/e2k/jccolext-e2k.c b/simd/e2k/jccolext-e2k.c +new file mode 100644 +index 0000000..49abdb4 +--- /dev/null ++++ b/simd/e2k/jccolext-e2k.c +@@ -0,0 +1,213 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved. ++ * Copyright (C) 2014, Jay Foad. All Rights Reserved. ++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. ++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. ++ */ ++ ++/* This file is included by jccolor-e2k.c */ ++ ++void rgbn_ycc_convert(JDIMENSION img_width, JSAMPARRAY input_buf, ++ JSAMPIMAGE output_buf, JDIMENSION output_row, ++ int num_rows, int shuf_idx) ++{ ++ JSAMPROW inptr, outptr0, outptr1, outptr2; ++ unsigned char __attribute__((aligned(16))) tmpbuf[PIXELSIZE * 16]; ++ ++ __m128i pb_zero = _mm_setzero_si128(); ++ __m128i pb_shuf0 = VEC_LD(rgb_ycc_shuf_const[shuf_idx]); ++#if PIXELSIZE == 4 ++ __m128i rgb3 = pb_zero; ++#else ++ __m128i pb_shuf4 = VEC_LD(rgb_ycc_shuf_const[shuf_idx] + 16); ++#endif ++ __m128i rgb0, rgb1 = pb_zero, rgb2 = pb_zero, ++ rgbg0, rgbg1, rgbg2, rgbg3, rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3; ++ __m128i y, yl, yh, y0, y1, y2, y3; ++ __m128i cb, cr, crl, crh, cbl, cbh; ++ __m128i cr0, cr1, cr2, cr3, cb0, cb1, cb2, cb3; ++ ++ /* Constants */ ++ __m128i pw_f0299_f0337 = _mm_setr_epi16(__4X2(F_0_299, F_0_337)), ++ pw_f0114_f0250 = _mm_setr_epi16(__4X2(F_0_114, F_0_250)), ++ pw_mf016_mf033 = _mm_setr_epi16(__4X2(-F_0_168, -F_0_331)), ++ pw_mf008_mf041 = _mm_setr_epi16(__4X2(-F_0_081, -F_0_418)), ++ pw_mf050_f000 = _mm_setr_epi16(__4X2(-F_0_500, 0)), ++ pd_onehalf = _mm_set1_epi32(ONE_HALF), ++ pd_onehalfm1_cj = _mm_set1_epi32(ONE_HALF - 1 + (CENTERJSAMPLE << SCALEBITS)); ++#ifdef NEED_ALIGN8 ++ ALIGN8_COMMON ++ ALIGN8_VARS(src) ++#endif ++ ++ if (img_width > 0) ++ while (--num_rows >= 0) { ++ int num_cols; ++ inptr = *input_buf++; ++ outptr0 = output_buf[0][output_row]; ++ outptr1 = output_buf[1][output_row]; ++ outptr2 = output_buf[2][output_row]; ++ output_row++; ++ ++ if (img_width >= 16) { ++#ifdef NEED_ALIGN8 ++ ALIGN8_START(inptr, src) ++ inptr += (img_width & -16) * PIXELSIZE; ++#endif ++ ++ PRAGMA_E2K("ivdep") ++ for (num_cols = img_width; num_cols >= 16; num_cols -= 16, ++ outptr0 += 16, outptr1 += 16, outptr2 += 16) { ++#ifdef NEED_ALIGN8 ++ ALIGN8_READ16(rgb0, src, 0) ++ ALIGN8_READ16(rgb1, src, 1) ++ ALIGN8_READ16(rgb2, src, 2) ++#if PIXELSIZE == 4 ++ ALIGN8_READ16(rgb3, src, 3) ++#endif ++ src_ptr += PIXELSIZE * 2; ++#else ++ rgb0 = VEC_LD(inptr); ++ rgb1 = VEC_LD(inptr + 16); ++ rgb2 = VEC_LD(inptr + 32); ++#if PIXELSIZE == 4 ++ rgb3 = VEC_LD(inptr + 48); ++#endif ++ inptr += PIXELSIZE * 16; ++#endif ++ RGB_SHUFFLE ++ CALC_Y(outptr0) ++ 
CALC_CC(outptr1, outptr2) ++ } ++ } ++ ++ num_cols = img_width & 15; ++ if (num_cols) { ++ int i; ++ memcpy(tmpbuf, inptr, num_cols * PIXELSIZE); ++ rgb0 = VEC_LD(tmpbuf); ++ rgb1 = VEC_LD(tmpbuf + 16); ++ rgb2 = VEC_LD(tmpbuf + 32); ++#if PIXELSIZE == 4 ++ rgb3 = VEC_LD(tmpbuf + 48); ++#endif ++ RGB_SHUFFLE ++ CALC_Y(tmpbuf) ++ CALC_CC(tmpbuf + 16, tmpbuf + 32) ++ ++ for (i = 0; i < num_cols; i++) { ++ outptr0[i] = tmpbuf[i]; ++ outptr1[i] = tmpbuf[i + 16]; ++ outptr2[i] = tmpbuf[i + 32]; ++ } ++ } ++ } ++} ++ ++void rgbn_gray_convert(JDIMENSION img_width, JSAMPARRAY input_buf, ++ JSAMPIMAGE output_buf, JDIMENSION output_row, ++ int num_rows, int shuf_idx) ++{ ++ JSAMPROW inptr, outptr; ++ uint8_t __attribute__((aligned(16))) tmpbuf[PIXELSIZE * 16]; ++ ++ __m128i pb_zero = _mm_setzero_si128(); ++ __m128i pb_shuf0 = VEC_LD(rgb_ycc_shuf_const[shuf_idx]); ++#if PIXELSIZE == 4 ++ __m128i rgb3 = pb_zero; ++#else ++ __m128i pb_shuf4 = VEC_LD(rgb_ycc_shuf_const[shuf_idx] + 16); ++#endif ++ __m128i rgb0, rgb1 = pb_zero, rgb2 = pb_zero, ++ rgbg0, rgbg1, rgbg2, rgbg3, rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3; ++ __m128i y, yl, yh, y0, y1, y2, y3; ++ ++ /* Constants */ ++ __m128i pw_f0299_f0337 = _mm_setr_epi16(__4X2(F_0_299, F_0_337)), ++ pw_f0114_f0250 = _mm_setr_epi16(__4X2(F_0_114, F_0_250)), ++ pd_onehalf = _mm_set1_epi32(ONE_HALF); ++#ifdef NEED_ALIGN8 ++ ALIGN8_COMMON ++ ALIGN8_VARS(src) ++#endif ++ ++ if (img_width > 0) ++ while (--num_rows >= 0) { ++ int num_cols; ++ inptr = *input_buf++; ++ outptr = output_buf[0][output_row]; ++ output_row++; ++ ++ if (img_width >= 16) { ++#ifdef NEED_ALIGN8 ++ ALIGN8_START(inptr, src) ++ inptr += (img_width & -16) * PIXELSIZE; ++#endif ++ ++ PRAGMA_E2K("ivdep") ++ for (num_cols = img_width; num_cols >= 16; num_cols -= 16, ++ outptr += 16) { ++#ifdef NEED_ALIGN8 ++ ALIGN8_READ16(rgb0, src, 0) ++ ALIGN8_READ16(rgb1, src, 1) ++ ALIGN8_READ16(rgb2, src, 2) ++#if PIXELSIZE == 4 ++ ALIGN8_READ16(rgb3, src, 3) ++#endif ++ src_ptr += PIXELSIZE * 2; ++#else ++ rgb0 = VEC_LD(inptr); ++ rgb1 = VEC_LD(inptr + 16); ++ rgb2 = VEC_LD(inptr + 32); ++#if PIXELSIZE == 4 ++ rgb3 = VEC_LD(inptr + 48); ++#endif ++ inptr += PIXELSIZE * 16; ++#endif ++ RGB_SHUFFLE ++ CALC_Y(outptr) ++ } ++ } ++ ++ num_cols = img_width & 15; ++ if (num_cols) { ++ int i; ++ memcpy(tmpbuf, inptr, num_cols * PIXELSIZE); ++ rgb0 = VEC_LD(tmpbuf); ++ rgb1 = VEC_LD(tmpbuf + 16); ++ rgb2 = VEC_LD(tmpbuf + 32); ++#if PIXELSIZE == 4 ++ rgb3 = VEC_LD(tmpbuf + 48); ++#endif ++ RGB_SHUFFLE ++ CALC_Y(tmpbuf) ++ ++ for (i = 0; i < num_cols; i++) { ++ outptr[i] = tmpbuf[i]; ++ } ++ } ++ } ++} ++ ++#undef RGB_SHUFFLE ++#undef PIXELSIZE ++#undef rgbn_ycc_convert ++#undef rgbn_gray_convert ++ +diff --git a/simd/e2k/jccolor-e2k.c b/simd/e2k/jccolor-e2k.c +new file mode 100644 +index 0000000..0af2626 +--- /dev/null ++++ b/simd/e2k/jccolor-e2k.c +@@ -0,0 +1,163 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2014, D. R. Commander. All Rights Reserved. ++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. ++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. 
The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. ++ */ ++ ++/* RGB --> YCC CONVERSION */ ++ ++#include "jsimd_e2k.h" ++ ++#define F_0_081 5329 /* FIX(0.08131) */ ++#define F_0_114 7471 /* FIX(0.11400) */ ++#define F_0_168 11059 /* FIX(0.16874) */ ++#define F_0_250 16384 /* FIX(0.25000) */ ++#define F_0_299 19595 /* FIX(0.29900) */ ++#define F_0_331 21709 /* FIX(0.33126) */ ++#define F_0_418 27439 /* FIX(0.41869) */ ++#define F_0_500 32768 /* FIX(0.50000) */ ++#define F_0_587 38470 /* FIX(0.58700) */ ++#define F_0_337 (F_0_587 - F_0_250) /* FIX(0.58700) - FIX(0.25000) */ ++#define F_0_413 (65536 - F_0_587) /* FIX(1.00000) - FIX(0.58700) */ ++ ++#define SCALEBITS 16 ++#define ONE_HALF (1 << (SCALEBITS - 1)) ++ ++#define RGBG_INDEX_(name, color, i, x) \ ++ name##_##color + i * name##_PIXELSIZE + x, \ ++ name##_GREEN + i * name##_PIXELSIZE + x ++#define RGBG_INDEX(name, x) \ ++ RGBG_INDEX_(name, RED, 0, x), RGBG_INDEX_(name, RED, 1, x), \ ++ RGBG_INDEX_(name, RED, 2, x), RGBG_INDEX_(name, RED, 3, x), \ ++ RGBG_INDEX_(name, BLUE, 0, x), RGBG_INDEX_(name, BLUE, 1, x), \ ++ RGBG_INDEX_(name, BLUE, 2, x), RGBG_INDEX_(name, BLUE, 3, x) ++ ++static const uint8_t __attribute__((aligned(16))) ++rgb_ycc_shuf_const[7][32] = { ++ { RGBG_INDEX(RGB, 0), RGBG_INDEX(RGB, 4) }, ++ { RGBG_INDEX(EXT_RGB, 0), RGBG_INDEX(EXT_RGB, 4) }, ++ { RGBG_INDEX(EXT_RGBX, 0), RGBG_INDEX(EXT_RGBX, 4) }, ++ { RGBG_INDEX(EXT_BGR, 0), RGBG_INDEX(EXT_BGR, 4) }, ++ { RGBG_INDEX(EXT_BGRX, 0), RGBG_INDEX(EXT_BGRX, 4) }, ++ { RGBG_INDEX(EXT_XBGR, 0), RGBG_INDEX(EXT_XBGR, 4) }, ++ { RGBG_INDEX(EXT_XRGB, 0), RGBG_INDEX(EXT_XRGB, 4) } ++}; ++ ++ /* rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3 ++ * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7 ++ * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb ++ * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf ++ * ++ * rg0 = R0 G0 R1 G1 R2 G2 R3 G3 ++ * bg0 = B0 G0 B1 G1 B2 G2 B3 G3 ++ * ... 
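++ * (Annotation, not in the upstream patch: the rg/bg pairs above feed
++ * _mm_madd_epi16 against 16-bit coefficient pairs scaled by 2^16.
++ * G's full weight FIX(0.58700) = 38470 does not fit in a signed
++ * 16-bit lane, so it is split between the two dot products: 0.33700
++ * travels with R in pw_f0299_f0337 and 0.25000 with B in
++ * pw_f0114_f0250, which is why F_0_337 is defined above as
++ * F_0_587 - F_0_250.)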
++ */ ++ ++ /* (Original) ++ * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B ++ * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE ++ * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE ++ * ++ * (This implementation) ++ * Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G ++ * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE ++ * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE ++ */ ++ ++#define CALC_Y(outptr0) \ ++ rg0 = _mm_unpacklo_epi8(rgbg0, pb_zero); \ ++ bg0 = _mm_unpackhi_epi8(rgbg0, pb_zero); \ ++ rg1 = _mm_unpacklo_epi8(rgbg1, pb_zero); \ ++ bg1 = _mm_unpackhi_epi8(rgbg1, pb_zero); \ ++ rg2 = _mm_unpacklo_epi8(rgbg2, pb_zero); \ ++ bg2 = _mm_unpackhi_epi8(rgbg2, pb_zero); \ ++ rg3 = _mm_unpacklo_epi8(rgbg3, pb_zero); \ ++ bg3 = _mm_unpackhi_epi8(rgbg3, pb_zero); \ ++ \ ++ /* Calculate Y values */ \ ++ y0 = _mm_add_epi32(_mm_madd_epi16(rg0, pw_f0299_f0337), pd_onehalf); \ ++ y1 = _mm_add_epi32(_mm_madd_epi16(rg1, pw_f0299_f0337), pd_onehalf); \ ++ y2 = _mm_add_epi32(_mm_madd_epi16(rg2, pw_f0299_f0337), pd_onehalf); \ ++ y3 = _mm_add_epi32(_mm_madd_epi16(rg3, pw_f0299_f0337), pd_onehalf); \ ++ y0 = _mm_add_epi32(_mm_madd_epi16(bg0, pw_f0114_f0250), y0); \ ++ y1 = _mm_add_epi32(_mm_madd_epi16(bg1, pw_f0114_f0250), y1); \ ++ y2 = _mm_add_epi32(_mm_madd_epi16(bg2, pw_f0114_f0250), y2); \ ++ y3 = _mm_add_epi32(_mm_madd_epi16(bg3, pw_f0114_f0250), y3); \ ++ \ ++ yl = _mm_packhi_epi32(y0, y1); \ ++ yh = _mm_packhi_epi32(y2, y3); \ ++ y = _mm_packus_epi16(yl, yh); \ ++ VEC_ST(outptr0, y); ++ ++#define CALC_CC(outptr1, outptr2) \ ++ /* Calculate Cb values */ \ ++ cb0 = _mm_add_epi32(_mm_madd_epi16(rg0, pw_mf016_mf033), pd_onehalfm1_cj); \ ++ cb1 = _mm_add_epi32(_mm_madd_epi16(rg1, pw_mf016_mf033), pd_onehalfm1_cj); \ ++ cb2 = _mm_add_epi32(_mm_madd_epi16(rg2, pw_mf016_mf033), pd_onehalfm1_cj); \ ++ cb3 = _mm_add_epi32(_mm_madd_epi16(rg3, pw_mf016_mf033), pd_onehalfm1_cj); \ ++ cb0 = _mm_sub_epi32(cb0, _mm_madd_epi16(bg0, pw_mf050_f000)); \ ++ cb1 = _mm_sub_epi32(cb1, _mm_madd_epi16(bg1, pw_mf050_f000)); \ ++ cb2 = _mm_sub_epi32(cb2, _mm_madd_epi16(bg2, pw_mf050_f000)); \ ++ cb3 = _mm_sub_epi32(cb3, _mm_madd_epi16(bg3, pw_mf050_f000)); \ ++ \ ++ cbl = _mm_packhi_epi32(cb0, cb1); \ ++ cbh = _mm_packhi_epi32(cb2, cb3); \ ++ cb = _mm_packus_epi16(cbl, cbh); \ ++ VEC_ST(outptr1, cb); \ ++ \ ++ /* Calculate Cr values */ \ ++ cr0 = _mm_add_epi32(_mm_madd_epi16(bg0, pw_mf008_mf041), pd_onehalfm1_cj); \ ++ cr1 = _mm_add_epi32(_mm_madd_epi16(bg1, pw_mf008_mf041), pd_onehalfm1_cj); \ ++ cr2 = _mm_add_epi32(_mm_madd_epi16(bg2, pw_mf008_mf041), pd_onehalfm1_cj); \ ++ cr3 = _mm_add_epi32(_mm_madd_epi16(bg3, pw_mf008_mf041), pd_onehalfm1_cj); \ ++ cr0 = _mm_sub_epi32(cr0, _mm_madd_epi16(rg0, pw_mf050_f000)); \ ++ cr1 = _mm_sub_epi32(cr1, _mm_madd_epi16(rg1, pw_mf050_f000)); \ ++ cr2 = _mm_sub_epi32(cr2, _mm_madd_epi16(rg2, pw_mf050_f000)); \ ++ cr3 = _mm_sub_epi32(cr3, _mm_madd_epi16(rg3, pw_mf050_f000)); \ ++ \ ++ crl = _mm_packhi_epi32(cr0, cr1); \ ++ crh = _mm_packhi_epi32(cr2, cr3); \ ++ cr = _mm_packus_epi16(crl, crh); \ ++ VEC_ST(outptr2, cr); ++ ++ ++#define PIXELSIZE 3 ++#define RGB_SHUFFLE \ ++ rgbg0 = _mm_shuffle_epi8(rgb0, pb_shuf0); \ ++ rgbg1 = _mm_shuffle_epi8(VEC_ALIGNR8(rgb1, rgb0), pb_shuf4); \ ++ rgbg2 = _mm_shuffle_epi8(VEC_ALIGNR8(rgb2, rgb1), pb_shuf0); \ ++ rgbg3 = _mm_shuffle_epi8(rgb2, pb_shuf4); ++ ++#define rgbn_ycc_convert jsimd_rgb3_ycc_convert_e2k ++#define rgbn_gray_convert jsimd_rgb3_gray_convert_e2k ++#include 
"jccolext-e2k.c" ++ ++ ++#define PIXELSIZE 4 ++#define RGB_SHUFFLE \ ++ rgbg0 = _mm_shuffle_epi8(rgb0, pb_shuf0); \ ++ rgbg1 = _mm_shuffle_epi8(rgb1, pb_shuf0); \ ++ rgbg2 = _mm_shuffle_epi8(rgb2, pb_shuf0); \ ++ rgbg3 = _mm_shuffle_epi8(rgb3, pb_shuf0); ++ ++#define rgbn_ycc_convert jsimd_rgb4_ycc_convert_e2k ++#define rgbn_gray_convert jsimd_rgb4_gray_convert_e2k ++#include "jccolext-e2k.c" ++ +diff --git a/simd/e2k/jchuff-e2k.c b/simd/e2k/jchuff-e2k.c +new file mode 100644 +index 0000000..ec4329e +--- /dev/null ++++ b/simd/e2k/jchuff-e2k.c +@@ -0,0 +1,307 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2022, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. ++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. ++ * ++ * NOTE: All referenced figures are from ++ * Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994. ++ */ ++ ++/* Encode a single block's worth of coefficients */ ++ ++#include "jsimd_e2k.h" ++ ++#if __SIZEOF_SIZE_T__ != 8 ++#error ++#endif ++ ++typedef unsigned long long bit_buf_type; ++#define BIT_BUF_SIZE 64 ++ ++typedef struct { ++ bit_buf_type put_buffer; /* current bit accumulation buffer */ ++ int free_bits; /* # of bits available in it */ ++ int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */ ++} savable_state; ++ ++typedef struct { ++ JOCTET *next_output_byte; /* => next byte to write in buffer */ ++ size_t free_in_buffer; /* # of byte spaces remaining in buffer */ ++ savable_state cur; /* Current bit buffer & DC state */ ++ j_compress_ptr cinfo; /* dump_buffer needs access to this */ ++ int simd; ++} working_state; ++ ++#define EMIT_BYTE(b) { \ ++ buffer[0] = (JOCTET)(b); \ ++ buffer[1] = 0; \ ++ buffer -= -2 + ((JOCTET)(b) < 0xFF); \ ++} ++ ++#define FLUSH() { \ ++ if (put_buffer & 0x8080808080808080 & ~(put_buffer + 0x0101010101010101)) { \ ++ EMIT_BYTE(put_buffer >> 56) \ ++ EMIT_BYTE(put_buffer >> 48) \ ++ EMIT_BYTE(put_buffer >> 40) \ ++ EMIT_BYTE(put_buffer >> 32) \ ++ EMIT_BYTE(put_buffer >> 24) \ ++ EMIT_BYTE(put_buffer >> 16) \ ++ EMIT_BYTE(put_buffer >> 8) \ ++ EMIT_BYTE(put_buffer ) \ ++ } else { \ ++ *(uint64_t*)buffer = __builtin_bswap64(put_buffer); \ ++ buffer += 8; \ ++ } \ ++} ++ ++#define PUT_AND_FLUSH(code, size) { \ ++ put_buffer = (put_buffer << (size + free_bits)) | (code >> -free_bits); \ ++ FLUSH() \ ++ free_bits += BIT_BUF_SIZE; \ ++ put_buffer = code; \ ++} ++ ++#define PUT_BITS(code, size) { \ ++ free_bits -= size; \ ++ if (free_bits < 0) \ ++ PUT_AND_FLUSH(code, size) \ ++ else \ ++ put_buffer = (put_buffer << size) | code; \ ++} ++ ++#define PUT_CODE(code, size) { \ ++ /* temp &= (((JLONG)1) << nbits) - 1; */ \ ++ /* temp |= code << nbits; */ \ ++ temp = __builtin_e2k_insfd(code, 
__builtin_e2k_insfd(nbits, 6 * 63 + 64, -nbits), temp); \ ++ nbits += size; \ ++ PUT_BITS(temp, nbits) \ ++} ++ ++#define KLOOP_PREPARE(mask, i) \ ++ t0 = _mm_cmpeq_epi8(_mm_packs_epi16(v0, v1), zero); \ ++ t1 = _mm_cmpeq_epi8(_mm_packs_epi16(v2, v3), zero); \ ++ mask = (uint32_t)(_mm_movemask_epi8(t0) | _mm_movemask_epi8(t1) << 16); \ ++ t0 = _mm_add_epi16(_mm_srai_epi16(v0, 15), v0); \ ++ t1 = _mm_add_epi16(_mm_srai_epi16(v1, 15), v1); \ ++ t2 = _mm_add_epi16(_mm_srai_epi16(v2, 15), v2); \ ++ t3 = _mm_add_epi16(_mm_srai_epi16(v3, 15), v3); \ ++ v0 = _mm_abs_epi16(v0); \ ++ v1 = _mm_abs_epi16(v1); \ ++ v2 = _mm_abs_epi16(v2); \ ++ v3 = _mm_abs_epi16(v3); \ ++ VEC_ST(block_nbits + i, v0); \ ++ VEC_ST(block_nbits + i + 8, v1); \ ++ VEC_ST(block_nbits + i + 16, v2); \ ++ VEC_ST(block_nbits + i + 24, v3); \ ++ VEC_ST(block_diff + i, t0); \ ++ VEC_ST(block_diff + i + 8, t1); \ ++ VEC_ST(block_diff + i + 16, t2); \ ++ VEC_ST(block_diff + i + 24, t3); ++ ++#define SHUF16X4(a, b, c, d) _mm_setr_pi8( \ ++ a * 2, a * 2 + 1, b * 2, b * 2 + 1, c * 2, c * 2 + 1, d * 2, d * 2 + 1) ++#define VEC_COMBINE(h0, h1) _mm_unpacklo_epi64( \ ++ _mm_movpi64_epi64(h0), _mm_movpi64_epi64(h1)) ++#define INSFI_M64(a, b, c, d) _mm_cvtsi64_m64(__builtin_e2k_insfd( \ ++ _mm_cvtm64_si64(a), (b & 63) | (d & 63) << 6, _mm_cvtm64_si64(c))) ++ ++GLOBAL(JOCTET *) ++jsimd_huff_encode_one_block_e2k(void *state, JOCTET *buffer, ++ JCOEFPTR block, int last_dc_val, ++ c_derived_tbl *dctbl, c_derived_tbl *actbl) { ++ uint64_t temp, nbits; ++ uint64_t i, r, code, size; ++ uint64_t code_0xf0 = actbl->ehufco[0xf0]; ++ uint64_t size_0xf0 = actbl->ehufsi[0xf0]; ++ ++ working_state *state_ptr = (working_state*)state; ++ bit_buf_type put_buffer = state_ptr->cur.put_buffer; ++ int64_t free_bits = state_ptr->cur.free_bits; ++ ++ __m128i zero = _mm_setzero_si128(); ++ __m128i v0, v1, v2, v3, t0, t1, t2, t3; ++ int64_t mask, mask1; ++ uint16_t __attribute__((aligned(16))) block_nbits[DCTSIZE2]; ++ int16_t __attribute__((aligned(16))) block_diff[DCTSIZE2]; ++ ++#if 1 /* faster this way */ ++ { ++ __m64 d0l, d0h, d1l, d1h, d2l, d2h, d3l, d3h; ++ __m64 d4l, d4h, d5l, d5h, d6l, d6h, d7l, d7h; ++ __m64 h0, h1, h2, h3, r0, r1, c1256 = SHUF16X4(1, 2, 5, 6); ++ ++ d0l = *(__m64*)(block + 8 * 0); d0h = *(__m64*)(block + 8 * 0 + 4); // 0 4 ++ d1l = *(__m64*)(block + 8 * 1); d1h = *(__m64*)(block + 8 * 1 + 4); // 8 12 ++ d2l = *(__m64*)(block + 8 * 2); d2h = *(__m64*)(block + 8 * 2 + 4); // 16 20 ++ d3l = *(__m64*)(block + 8 * 3); d3h = *(__m64*)(block + 8 * 3 + 4); // 24 28 ++ d4l = *(__m64*)(block + 8 * 4); d4h = *(__m64*)(block + 8 * 4 + 4); // 32 36 ++ d5l = *(__m64*)(block + 8 * 5); d5h = *(__m64*)(block + 8 * 5 + 4); // 40 44 ++ d6l = *(__m64*)(block + 8 * 6); d6h = *(__m64*)(block + 8 * 6 + 4); // 48 52 ++ d7l = *(__m64*)(block + 8 * 7); d7h = *(__m64*)(block + 8 * 7 + 4); // 56 60 ++ ++ // d0l[0] d0l[1] d1l[0] d2l[0] ++ // d1l[1] d0l[2] d0l[3] d1l[2] ++ h0 = _mm_unpacklo_pi16(d1l, d2l); ++ r0 = _mm_unpacklo_pi32(d0l, h0); ++ r1 = _mm_shuffle2_pi8(d1l, d0l, SHUF16X4(1, 6, 7, 2)); ++ r0 = _mm_sub_pi16(r0, _mm_cvtsi64_m64((uint16_t)last_dc_val)); ++ v0 = VEC_COMBINE(r0, r1); ++ ++ // d2l[1] d3l[0] d4l[0] d3l[1] ++ // d2l[2] d1l[3] d0h[0] d0h[1] ++ h0 = _mm_srli_si64(_mm_unpacklo_pi32(d2l, d4l), 16); ++ h2 = INSFI_M64(d1l, 0, d2l, 48); ++ r0 = _mm_unpacklo_pi16(h0, d3l); ++ r1 = _mm_alignr_pi8(d0h, h2, 4); ++ v1 = VEC_COMBINE(r0, r1); ++ ++ // d1h[0] d2l[3] d3l[2] d4l[1] ++ // d5l[0] d6l[0] d5l[1] d4l[2] ++ h0 = INSFI_M64(d2l, 32, d1h, 16); ++ h1 = 
INSFI_M64(d4l, -32, d3l, 48); ++ h2 = INSFI_M64(d4l, 16, d6l, 16); ++ r0 = INSFI_M64(h1, 0, h0, 32); ++ r1 = _mm_unpacklo_pi16(d5l, h2); ++ v2 = VEC_COMBINE(r0, r1); ++ ++ // d3l[3] d2h[0] d1h[1] d0h[2] ++ // d0h[3] d1h[2] d2h[1] d3h[0] ++ h0 = _mm_alignr_pi8(d2h, d3l, 6); ++ h1 = INSFI_M64(d0h, 0, d1h, 32); ++ h2 = _mm_unpackhi_pi32(d0h, d1h); ++ h3 = _mm_unpacklo_pi32(d2h, d3h); ++ r0 = INSFI_M64(h1, -16, h0, 32); ++ r1 = _mm_shuffle2_pi8(h2, h3, c1256); ++ v3 = VEC_COMBINE(r0, r1); ++ ++ KLOOP_PREPARE(mask, 0) ++ ++ // d4l[3] d5l[2] d6l[1] d7l[0] ++ // d7l[1] d6l[2] d5l[3] d4h[0] ++ h0 = _mm_unpackhi_pi32(d4l, d5l); ++ h1 = _mm_unpacklo_pi32(d6l, d7l); ++ h2 = INSFI_M64(d6l, 0, d7l, 32); ++ h2 = INSFI_M64(d5l, 0, h2, 48); ++ r0 = _mm_shuffle2_pi8(h0, h1, c1256); ++ r1 = _mm_alignr_pi8(d4h, h2, 2); ++ v0 = VEC_COMBINE(r0, r1); ++ ++ // d3h[1] d2h[2] d1h[3] d2h[3] ++ // d3h[2] d4h[1] d5h[0] d6l[3] ++ h0 = _mm_slli_si64(INSFI_M64(d1h, 16, d3h, 32), 16); ++ h2 = INSFI_M64(d4h, -32, d3h, 48); ++ h3 = INSFI_M64(d6l, 32, d5h, 16); ++ r0 = _mm_unpackhi_pi16(h0, d2h); ++ r1 = _mm_alignr_pi8(h3, h2, 4); ++ v1 = VEC_COMBINE(r0, r1); ++ ++ // d7l[2] d7l[3] d6h[0] d5h[1] ++ // d4h[2] d3h[3] d4h[3] d5h[2] ++ h0 = INSFI_M64(d5h, 0, d6h, 16); ++ h2 = _mm_slli_si64(_mm_unpackhi_pi32(d3h, d5h), 16); ++ r0 = _mm_alignr_pi8(h0, d7l, 4); ++ r1 = _mm_unpackhi_pi16(d4h, h2); ++ v2 = VEC_COMBINE(r0, r1); ++ ++ // d6h[1] d7h[0] d7h[1] d6h[2] ++ // d5h[3] d6h[3] d7h[2] d7h[3] ++ h0 = INSFI_M64(d6h, -16, d7h, 32); ++ h2 = _mm_unpackhi_pi16(d5h, d6h); ++ r0 = _mm_shuffle_pi16(h0, 0xd2); ++ r1 = _mm_unpackhi_pi32(h2, d7h); ++ v3 = VEC_COMBINE(r0, r1); ++ } ++#else ++ v0 = _mm_setr_epi16( ++ block[0] - last_dc_val, block[1], block[8], block[16], ++ block[9], block[2], block[3], block[10]); ++ v1 = _mm_setr_epi16( ++ block[17], block[24], block[32], block[25], ++ block[18], block[11], block[4], block[5]); ++ v2 = _mm_setr_epi16( ++ block[12], block[19], block[26], block[33], ++ block[40], block[48], block[41], block[34]); ++ v3 = _mm_setr_epi16( ++ block[27], block[20], block[13], block[6], ++ block[7], block[14], block[21], block[28]); ++ ++ KLOOP_PREPARE(mask, 0) ++ ++ v0 = _mm_setr_epi16( ++ block[35], block[42], block[49], block[56], ++ block[57], block[50], block[43], block[36]); ++ v1 = _mm_setr_epi16( ++ block[29], block[22], block[15], block[23], ++ block[30], block[37], block[44], block[51]); ++ v2 = _mm_setr_epi16( ++ block[58], block[59], block[52], block[45], ++ block[38], block[31], block[39], block[46]); ++ v3 = _mm_setr_epi16( ++ block[53], block[60], block[61], block[54], ++ block[47], block[55], block[62], block[63]); ++#endif ++ ++ KLOOP_PREPARE(mask1, 32) ++ mask |= mask1 << 32; ++ mask = ~mask; ++ ++ /* Encode the DC coefficient difference per section F.1.2.1 */ ++ ++ nbits = block_nbits[0]; ++ temp = block_diff[0]; ++ nbits = nbits ? 
32 - __builtin_clz(nbits) : 0; ++ ++ /* Emit the Huffman-coded symbol for the number of bits */ ++ code = dctbl->ehufco[nbits]; ++ size = dctbl->ehufsi[nbits]; ++ PUT_CODE(code, size) ++ ++ /* Encode the AC coefficients per section F.1.2.2 */ ++ ++ /* e2k doesn't have a tzcnt instruction */ ++ mask = __builtin_e2k_bitrevd(mask) << 1; ++ ++ for (i = 1; mask; i++, mask <<= 1) { ++ r = __builtin_clzll(mask); ++ mask <<= r; ++ i += r; ++ /* if run length > 15, must emit special run-length-16 codes (0xF0) */ ++ while (r > 15) { ++ PUT_BITS(code_0xf0, size_0xf0) ++ r -= 16; ++ } ++ nbits = block_nbits[i]; ++ temp = block_diff[i]; ++ nbits = 32 - __builtin_clz(nbits); ++ /* Emit Huffman symbol for run length / number of bits */ ++ /* r = r << 4 | nbits; */ ++ r = __builtin_e2k_insfd(r, 4 * 63 + 64, nbits); ++ code = actbl->ehufco[r]; ++ size = actbl->ehufsi[r]; ++ PUT_CODE(code, size) ++ } ++ ++ if (i != 64) { ++ PUT_BITS(actbl->ehufco[0], actbl->ehufsi[0]) ++ } ++ ++ state_ptr->cur.put_buffer = put_buffer; ++ state_ptr->cur.free_bits = free_bits; ++ return buffer; ++} +diff --git a/simd/e2k/jcphuff-e2k.c b/simd/e2k/jcphuff-e2k.c +new file mode 100644 +index 0000000..f69afeb +--- /dev/null ++++ b/simd/e2k/jcphuff-e2k.c +@@ -0,0 +1,145 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2022, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. ++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. ++ */ ++ ++#include "jsimd_e2k.h" ++ ++#define X(i) coefs[i] = block[jpeg_natural_order_start[i]]; ++#define Y(i) coefs[i] = i < rem ? 
block[jpeg_natural_order_start[i]] : 0; ++ ++#define LOOP \ ++ for (i = 0; i < Sl >> 4; i++) { \ ++ X(0) X(1) X(2) X(3) X(4) X(5) X(6) X(7) \ ++ X(8) X(9) X(10) X(11) X(12) X(13) X(14) X(15) \ ++ BLOCK16 \ ++ jpeg_natural_order_start += 16; \ ++ } \ ++ rem = Sl & 15; \ ++ if (Sl & 8) { \ ++ X(0) X(1) X(2) X(3) X(4) X(5) X(6) X(7) \ ++ Y(8) Y(9) Y(10) Y(11) Y(12) Y(13) Y(14) \ ++ coefs[15] = 0; \ ++ BLOCK16 \ ++ } else if (rem > 0) { \ ++ Y(0) Y(1) Y(2) Y(3) Y(4) Y(5) Y(6) Y(7) \ ++ BLOCK8 \ ++ } ++ ++void jsimd_encode_mcu_AC_first_prepare_e2k ++ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al, ++ JCOEF *values, size_t *zerobits) ++{ ++ JCOEF *diff = values + DCTSIZE2; ++ int16_t __attribute__((aligned(16))) coefs[16]; ++ __m128i v0, v1, v2, v3; ++ __m128i c0 = _mm_setzero_si128(), shr = _mm_cvtsi32_si128(Al); ++ int i, rem; ++ ++#define BLOCK16 \ ++ v0 = _mm_load_si128((__m128i*)coefs); \ ++ v1 = _mm_load_si128((__m128i*)coefs + 1); \ ++ v2 = _mm_srai_epi16(v0, 15); \ ++ v3 = _mm_srai_epi16(v1, 15); \ ++ v0 = _mm_sra_epi16(_mm_abs_epi16(v0), shr); \ ++ v1 = _mm_sra_epi16(_mm_abs_epi16(v1), shr); \ ++ v2 = _mm_xor_si128(v0, v2); \ ++ v3 = _mm_xor_si128(v1, v3); \ ++ _mm_store_si128((__m128i*)values, v0); \ ++ _mm_store_si128((__m128i*)values + 1, v1); \ ++ _mm_store_si128((__m128i*)diff, v2); \ ++ _mm_store_si128((__m128i*)diff + 1, v3); \ ++ v2 = _mm_cmpeq_epi8(_mm_packus_epi16(v0, v1), c0); \ ++ ((uint16_t*)zerobits)[i] = ~_mm_movemask_epi8(v2); \ ++ values += 16; diff += 16; ++ ++#define BLOCK8 \ ++ v0 = _mm_load_si128((__m128i*)coefs); \ ++ v2 = _mm_srai_epi16(v0, 15); \ ++ v0 = _mm_sra_epi16(_mm_abs_epi16(v0), shr); \ ++ v2 = _mm_xor_si128(v0, v2); \ ++ _mm_store_si128((__m128i*)values, v0); \ ++ _mm_store_si128((__m128i*)diff, v2); \ ++ v2 = _mm_cmpeq_epi8(_mm_packus_epi16(v0, c0), c0); \ ++ ((uint16_t*)zerobits)[i] = ~_mm_movemask_epi8(v2); \ ++ values += 8; diff += 8; ++ ++ ((uint64_t*)zerobits)[0] = 0; ++ LOOP ++#undef BLOCK16 ++#undef BLOCK8 ++ ++ for (i = (64 - Sl) >> 3; i; i--) { ++ _mm_store_si128((__m128i*)values, c0); ++ _mm_store_si128((__m128i*)diff, c0); ++ values += 8; diff += 8; ++ } ++} ++ ++int jsimd_encode_mcu_AC_refine_prepare_e2k ++ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al, ++ JCOEF *absvalues, size_t *bits) ++{ ++ union { uint64_t q; uint16_t w[4]; } mask1 = { 0 }; ++ int16_t __attribute__((aligned(16))) coefs[16]; ++ __m128i v0, v1, v2, c1 = _mm_set1_epi8(1); ++ __m128i c0 = _mm_setzero_si128(), shr = _mm_cvtsi32_si128(Al); ++ int i, rem; ++ ++#define BLOCK16 \ ++ v0 = _mm_load_si128((__m128i*)coefs); \ ++ v1 = _mm_load_si128((__m128i*)coefs + 1); \ ++ v2 = _mm_packs_epi16(v0, v1); \ ++ ((uint16_t*)bits)[4 + i] = ~_mm_movemask_epi8(v2); \ ++ v0 = _mm_sra_epi16(_mm_abs_epi16(v0), shr); \ ++ v1 = _mm_sra_epi16(_mm_abs_epi16(v1), shr); \ ++ v2 = _mm_cmpeq_epi8(_mm_packus_epi16(v0, v1), c0); \ ++ ((uint16_t*)bits)[i] = ~_mm_movemask_epi8(v2); \ ++ _mm_store_si128((__m128i*)absvalues, v0); \ ++ _mm_store_si128((__m128i*)absvalues + 1, v1); \ ++ v2 = _mm_cmpeq_epi8(_mm_packus_epi16(v0, v1), c1); \ ++ mask1.w[i] = _mm_movemask_epi8(v2); \ ++ absvalues += 16; ++ ++#define BLOCK8 \ ++ v0 = _mm_load_si128((__m128i*)coefs); \ ++ v2 = _mm_packs_epi16(v0, c0); \ ++ ((uint16_t*)bits)[4 + i] = ~_mm_movemask_epi8(v2); \ ++ v0 = _mm_sra_epi16(_mm_abs_epi16(v0), shr); \ ++ v2 = _mm_cmpeq_epi8(_mm_packus_epi16(v0, c0), c0); \ ++ ((uint16_t*)bits)[i] = ~_mm_movemask_epi8(v2); \ ++ _mm_store_si128((__m128i*)absvalues, v0); \ ++ 
v2 = _mm_cmpeq_epi8(_mm_packus_epi16(v0, c0), c1); \ ++ mask1.w[i] = _mm_movemask_epi8(v2); \ ++ absvalues += 8; ++ ++ ((uint64_t*)bits)[0] = 0; /* zero */ ++ ((uint64_t*)bits)[1] = 0; /* sign */ ++ LOOP ++#undef BLOCK16 ++#undef BLOCK8 ++ ++ for (i = (64 - Sl) >> 3; i; i--) { ++ _mm_store_si128((__m128i*)absvalues, c0); ++ absvalues += 8; ++ } ++ ++ return 63 - __builtin_clzll(mask1.q | 1); ++} +diff --git a/simd/e2k/jcsample-e2k.c b/simd/e2k/jcsample-e2k.c +new file mode 100644 +index 0000000..cac8897 +--- /dev/null ++++ b/simd/e2k/jcsample-e2k.c +@@ -0,0 +1,203 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2015, D. R. Commander. All Rights Reserved. ++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. ++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. ++ */ ++ ++/* CHROMA DOWNSAMPLING */ ++ ++#include "jsimd_e2k.h" ++#include "jcsample.h" ++ ++void jsimd_h2v1_downsample_e2k(JDIMENSION image_width, ++ int max_v_samp_factor, ++ JDIMENSION v_samp_factor, ++ JDIMENSION width_in_blocks, ++ JSAMPARRAY input_data, ++ JSAMPARRAY output_data) ++{ ++ int outcol; ++ JDIMENSION output_cols = width_in_blocks * DCTSIZE, outrow; ++ JSAMPROW inptr, outptr; ++ ++ __m128i this0, next0, out; ++ __m128i this0e, this0o, next0e, next0o, outl, outh; ++ ++ /* Constants */ ++ __m128i pw_bias = _mm_set1_epi32(1 << 16), ++ even_mask = _mm_set1_epi16(255); ++#ifdef NEED_ALIGN8 ++ ALIGN8_COMMON ++ ALIGN8_VARS(src) ++#endif ++ ++ expand_right_edge(input_data, max_v_samp_factor, ++ image_width, output_cols * 2); ++ ++ if (output_cols > 0) ++ for (outrow = 0; outrow < v_samp_factor; outrow++) { ++ outptr = output_data[outrow]; ++ inptr = input_data[outrow]; ++ ++#ifdef NEED_ALIGN8 ++ ALIGN8_START(inptr, src) ++#endif ++ PRAGMA_E2K("ivdep") ++ for (outcol = output_cols; outcol > 8; ++ outcol -= 16, outptr += 16) { ++ ++#ifdef NEED_ALIGN8 ++ ALIGN8_READ16(this0, src, 0) ++ ALIGN8_READ16(next0, src, 1) ++ src_ptr += 4; ++#else ++ this0 = VEC_LD(inptr); ++ next0 = VEC_LD(inptr + 16); ++ inptr += 32; ++#endif ++ this0e = _mm_and_si128(this0, even_mask); ++ this0o = _mm_srli_epi16(this0, 8); ++ outl = _mm_add_epi16(this0e, this0o); ++ outl = _mm_srli_epi16(_mm_add_epi16(outl, pw_bias), 1); ++ next0e = _mm_and_si128(next0, even_mask); ++ next0o = _mm_srli_epi16(next0, 8); ++ outh = _mm_add_epi16(next0e, next0o); ++ outh = _mm_srli_epi16(_mm_add_epi16(outh, pw_bias), 1); ++ ++ out = _mm_packus_epi16(outl, outh); ++ VEC_ST(outptr, out); ++ } ++ if (outcol > 0) { ++#ifdef NEED_ALIGN8 ++ ALIGN8_READ16(this0, src, 0) ++#else ++ this0 = VEC_LD(inptr); ++#endif ++ this0e = _mm_and_si128(this0, even_mask); ++ this0o = _mm_srli_epi16(this0, 8); ++ outl = _mm_add_epi16(this0e, 
this0o); ++ outl = _mm_srli_epi16(_mm_add_epi16(outl, pw_bias), 1); ++ ++ out = _mm_packus_epi16(outl, outl); ++ VEC_STL(outptr, out); ++ } ++ } ++} ++ ++ ++void jsimd_h2v2_downsample_e2k(JDIMENSION image_width, int max_v_samp_factor, ++ JDIMENSION v_samp_factor, ++ JDIMENSION width_in_blocks, ++ JSAMPARRAY input_data, JSAMPARRAY output_data) ++{ ++ int outcol; ++ JDIMENSION output_cols = width_in_blocks * DCTSIZE, outrow; ++ JSAMPROW inptr0, inptr1, outptr; ++ ++ __m128i this0, next0, this1, next1, out; ++ __m128i this0e, this0o, next0e, next0o, this1e, this1o, ++ next1e, next1o, out0l, out0h, out1l, out1h, outl, outh; ++ ++ /* Constants */ ++ __m128i pw_bias = _mm_set1_epi32(1 | 2 << 16), ++ even_mask = _mm_set1_epi16(255); ++#ifdef NEED_ALIGN8 ++ ALIGN8_COMMON ++ ALIGN8_VARS(src0) ++ ALIGN8_VARS(src1) ++#endif ++ ++ expand_right_edge(input_data, max_v_samp_factor, ++ image_width, output_cols * 2); ++ ++ if (output_cols > 0) ++ for (outrow = 0; outrow < v_samp_factor; outrow++) { ++ inptr0 = input_data[outrow * 2]; ++ inptr1 = input_data[outrow * 2 + 1]; ++ outptr = output_data[outrow]; ++ ++#ifdef NEED_ALIGN8 ++ ALIGN8_START(inptr0, src0) ++ ALIGN8_START(inptr1, src1) ++#endif ++ PRAGMA_E2K("ivdep") ++ for (outcol = output_cols; outcol > 8; ++ outcol -= 16, outptr += 16) { ++ ++#ifdef NEED_ALIGN8 ++ ALIGN8_READ16(this0, src0, 0) src0_ptr += 2; ++ ALIGN8_READ16(this1, src1, 0) src1_ptr += 2; ++#else ++ this0 = VEC_LD(inptr0); inptr0 += 16; ++ this1 = VEC_LD(inptr1); inptr1 += 16; ++#endif ++ this0e = _mm_and_si128(this0, even_mask); ++ this1e = _mm_and_si128(this1, even_mask); ++ this0o = _mm_srli_epi16(this0, 8); ++ this1o = _mm_srli_epi16(this1, 8); ++ out0l = _mm_add_epi16(this0e, this0o); ++ out1l = _mm_add_epi16(this1e, this1o); ++ ++ outl = _mm_add_epi16(out0l, out1l); ++ outl = _mm_srli_epi16(_mm_add_epi16(outl, pw_bias), 2); ++ ++#ifdef NEED_ALIGN8 ++ ALIGN8_READ16(next0, src0, 0) src0_ptr += 2; ++ ALIGN8_READ16(next1, src1, 0) src1_ptr += 2; ++#else ++ next0 = VEC_LD(inptr0); inptr0 += 16; ++ next1 = VEC_LD(inptr1); inptr1 += 16; ++#endif ++ next0e = _mm_and_si128(next0, even_mask); ++ next1e = _mm_and_si128(next1, even_mask); ++ next0o = _mm_srli_epi16(next0, 8); ++ next1o = _mm_srli_epi16(next1, 8); ++ out0h = _mm_add_epi16(next0e, next0o); ++ out1h = _mm_add_epi16(next1e, next1o); ++ ++ outh = _mm_add_epi16(out0h, out1h); ++ outh = _mm_srli_epi16(_mm_add_epi16(outh, pw_bias), 2); ++ ++ out = _mm_packus_epi16(outl, outh); ++ VEC_ST(outptr, out); ++ } ++ if (outcol > 0) { ++#ifdef NEED_ALIGN8 ++ ALIGN8_READ16(this0, src0, 0) ++ ALIGN8_READ16(this1, src1, 0) ++#else ++ this0 = VEC_LD(inptr0); ++ this1 = VEC_LD(inptr1); ++#endif ++ this0e = _mm_and_si128(this0, even_mask); ++ this1e = _mm_and_si128(this1, even_mask); ++ this0o = _mm_srli_epi16(this0, 8); ++ this1o = _mm_srli_epi16(this1, 8); ++ out0l = _mm_add_epi16(this0e, this0o); ++ out1l = _mm_add_epi16(this1e, this1o); ++ ++ outl = _mm_add_epi16(out0l, out1l); ++ outl = _mm_srli_epi16(_mm_add_epi16(outl, pw_bias), 2); ++ ++ out = _mm_packus_epi16(outl, outl); ++ VEC_STL(outptr, out); ++ } ++ } ++} +diff --git a/simd/e2k/jcsample.h b/simd/e2k/jcsample.h +new file mode 100644 +index 0000000..2ac4816 +--- /dev/null ++++ b/simd/e2k/jcsample.h +@@ -0,0 +1,28 @@ ++/* ++ * jcsample.h ++ * ++ * This file was part of the Independent JPEG Group's software: ++ * Copyright (C) 1991-1996, Thomas G. Lane. ++ * For conditions of distribution and use, see the accompanying README.ijg ++ * file. 
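++ *
++ * expand_right_edge() pads each row by replicating its last real
++ * column, so the SIMD downsampling loops can always consume full
++ * 16-byte groups without special-casing the right edge.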
++ */ ++ ++LOCAL(void) ++expand_right_edge(JSAMPARRAY image_data, int num_rows, JDIMENSION input_cols, ++ JDIMENSION output_cols) ++{ ++ register JSAMPROW ptr; ++ register JSAMPLE pixval; ++ register int count; ++ int row; ++ int numcols = (int)(output_cols - input_cols); ++ ++ if (numcols > 0) { ++ for (row = 0; row < num_rows; row++) { ++ ptr = image_data[row] + input_cols; ++ pixval = ptr[-1]; /* don't need GETJSAMPLE() here */ ++ for (count = numcols; count > 0; count--) ++ *ptr++ = pixval; ++ } ++ } ++} +diff --git a/simd/e2k/jdcolext-e2k.c b/simd/e2k/jdcolext-e2k.c +new file mode 100644 +index 0000000..4f12aef +--- /dev/null ++++ b/simd/e2k/jdcolext-e2k.c +@@ -0,0 +1,258 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2015, D. R. Commander. All Rights Reserved. ++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. ++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. ++ */ ++ ++/* This file is included by jdcolor-e2k.c */ ++ ++void ycc_rgbn_convert(JDIMENSION out_width, JSAMPIMAGE input_buf, ++ JDIMENSION input_row, JSAMPARRAY output_buf, ++ int num_rows, int shuf_idx) ++{ ++ JSAMPROW outptr, inptr0, inptr1, inptr2; ++ uint8_t __attribute__((aligned(16))) tmpbuf[PIXELSIZE * 16]; ++ ++ __m128i rgb0, rgb1, rgb2, rgb3, y, cb, cr; ++ __m128i rg0, rg1, bx0, bx1, yl, yh, cbl, cbh, ++ crl, crh, rl, rh, gl, gh, bl, bh, g0w, g1w, g2w, g3w; ++ __m128i g0, g1, g2, g3; ++ ++ /* Constants ++ * NOTE: The >> 1 is to compensate for the fact that vec_madds() returns 17 ++ * high-order bits, not 16. 
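++ * On e2k the multiply is _mm_mulhrs_epi16(), which computes
++ * round((x * c) / 32768), so the halved FIX() constants yield the
++ * intended fractional product; e.g. _mm_mulhrs_epi16(100, F_0_402 >> 1)
++ * = (100 * 13172 + 16384) >> 15 = 40 ~= 0.402 * 100.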
++ */ ++ __m128i pw_f0402 = _mm_set1_epi16(F_0_402 >> 1), ++ pw_mf0228 = _mm_set1_epi16(-F_0_228 >> 1), ++ pw_mf0344_f0285 = _mm_setr_epi16(__4X2(-F_0_344, F_0_285)), ++ pb_255 = _mm_set1_epi8(-1), ++ pw_cj = _mm_set1_epi16(CENTERJSAMPLE), ++ pd_onehalf = _mm_set1_epi32(ONE_HALF), ++ pb_zero = _mm_setzero_si128(); ++ RGB_SHUFFLE_INIT ++#ifdef NEED_ALIGN8 ++ ALIGN8_COMMON ++ ALIGN8_VARS(src0) ++ ALIGN8_VARS(src1) ++ ALIGN8_VARS(src2) ++#endif ++ ++ if (out_width > 0) ++ while (--num_rows >= 0) { ++ int num_cols; ++ inptr0 = input_buf[0][input_row]; ++ inptr1 = input_buf[1][input_row]; ++ inptr2 = input_buf[2][input_row]; ++ input_row++; ++ outptr = *output_buf++; ++ ++ if (out_width >= 16) { ++#ifdef NEED_ALIGN8 ++ ALIGN8_START(inptr0, src0) ++ ALIGN8_START(inptr1, src1) ++ ALIGN8_START(inptr2, src2) ++ inptr0 += out_width & -16; ++ inptr1 += out_width & -16; ++ inptr2 += out_width & -16; ++#endif ++ PRAGMA_E2K("ivdep") ++ for (num_cols = out_width; num_cols >= 16; ++ num_cols -= 16, outptr += PIXELSIZE * 16) { ++#ifdef NEED_ALIGN8 ++ ALIGN8_READ16(y, src0, 0) src0_ptr += 2; ++ ALIGN8_READ16(cb, src1, 0) src1_ptr += 2; ++ ALIGN8_READ16(cr, src2, 0) src2_ptr += 2; ++#else ++ y = VEC_LD(inptr0); inptr0 += 16; ++ cb = VEC_LD(inptr1); inptr1 += 16; ++ cr = VEC_LD(inptr2); inptr2 += 16; ++#endif ++ CALC_RGB ++ RGB_SHUFFLE ++ VEC_ST(outptr, rgb0); ++ VEC_ST(outptr + 16, rgb1); ++ VEC_ST(outptr + 32, rgb2); ++#if PIXELSIZE == 4 ++ VEC_ST(outptr + 48, rgb3); ++#endif ++ } ++ } ++ ++ num_cols = out_width & 15; ++ if (num_cols) { ++ int i; ++ for (i = 0; i < num_cols; i++) { ++ tmpbuf[i] = inptr0[i]; ++ tmpbuf[i + 16] = inptr1[i]; ++ tmpbuf[i + 32] = inptr2[i]; ++ } ++ y = VEC_LD(tmpbuf); ++ cb = VEC_LD(tmpbuf + 16); ++ cr = VEC_LD(tmpbuf + 32); ++ CALC_RGB ++ RGB_SHUFFLE ++ VEC_ST(tmpbuf, rgb0); ++ VEC_ST(tmpbuf + 16, rgb1); ++ VEC_ST(tmpbuf + 32, rgb2); ++#if PIXELSIZE == 4 ++ VEC_ST(tmpbuf + 48, rgb3); ++#endif ++ memcpy(outptr, tmpbuf, num_cols * PIXELSIZE); ++ } ++ } ++} ++ ++void ycc_rgbn_merged(JDIMENSION out_width, JSAMPIMAGE input_buf, ++ JDIMENSION in_row_group_ctr, ++ JDIMENSION in_row_group_ctr_y, ++ JSAMPARRAY output_buf, int shuf_idx) ++{ ++ JSAMPROW outptr, inptr0, inptr1, inptr2; ++ int num_cols; ++ uint8_t __attribute__((aligned(16))) tmpbuf[4 * 16]; ++ ++ __m128i rgb0, rgb1, rgb2, rgb3, y, cb, cr; ++ __m128i rg0, rg1, bx0, bx1, yl, yh, cbl, cbh, ++ crl, crh, r_yl, r_yh, g_yl, g_yh, b_yl, b_yh, g_y0w, g_y1w, g_y2w, g_y3w, ++ rl, rh, gl, gh, bl, bh; ++ __m128i g_y0, g_y1, g_y2, g_y3; ++ ++ /* Constants ++ * NOTE: The >> 1 is to compensate for the fact that vec_madds() returns 17 ++ * high-order bits, not 16. 
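++ * In this merged-upsample variant, CALC_MERGED1 computes the Cb/Cr
++ * contributions once per chroma sample and CALC_MERGED2 then adds
++ * each contribution to two horizontally adjacent Y samples.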
++ */ ++ __m128i pw_f0402 = _mm_set1_epi16(F_0_402 >> 1), ++ pw_mf0228 = _mm_set1_epi16(-F_0_228 >> 1), ++ pw_mf0344_f0285 = _mm_setr_epi16(__4X2(-F_0_344, F_0_285)), ++ pb_255 = _mm_set1_epi8(-1), ++ pw_cj = _mm_set1_epi16(CENTERJSAMPLE), ++ pd_onehalf = _mm_set1_epi32(ONE_HALF), ++ pb_zero = _mm_setzero_si128(); ++ RGB_SHUFFLE_INIT ++#ifdef NEED_ALIGN8 ++ ALIGN8_COMMON ++ ALIGN8_VARS(src0) ++ ALIGN8_VARS(src1) ++ ALIGN8_VARS(src2) ++#endif ++ ++ inptr0 = input_buf[0][in_row_group_ctr_y]; ++ inptr1 = input_buf[1][in_row_group_ctr]; ++ inptr2 = input_buf[2][in_row_group_ctr]; ++ outptr = output_buf[0]; ++ ++ if (out_width >= 32) { ++#ifdef NEED_ALIGN8 ++ ALIGN8_START(inptr0, src0) ++ ALIGN8_START(inptr1, src1) ++ ALIGN8_START(inptr2, src2) ++ inptr0 += out_width & -32; ++ inptr1 += (out_width & -32) >> 1; ++ inptr2 += (out_width & -32) >> 1; ++#endif ++ PRAGMA_E2K("ivdep") ++ for (num_cols = out_width; num_cols >= 32; num_cols -= 32) { ++ ++#ifdef NEED_ALIGN8 ++ ALIGN8_READ16(cb, src1, 0) src1_ptr += 2; ++ ALIGN8_READ16(cr, src2, 0) src2_ptr += 2; ++#else ++ cb = VEC_LD(inptr1); inptr1 += 16; ++ cr = VEC_LD(inptr2); inptr2 += 16; ++#endif ++ CALC_MERGED1 ++ ++#ifdef NEED_ALIGN8 ++ ALIGN8_READ16(y, src0, 0) src0_ptr += 2; ++#else ++ y = VEC_LD(inptr0); inptr0 += 16; ++#endif ++ CALC_MERGED2(r_yl, g_yl, b_yl) ++ RGB_SHUFFLE ++ VEC_ST(outptr, rgb0); ++ VEC_ST(outptr + 16, rgb1); ++ VEC_ST(outptr + 32, rgb2); ++#if PIXELSIZE == 4 ++ VEC_ST(outptr + 48, rgb3); ++#endif ++ outptr += PIXELSIZE * 16; ++ ++#ifdef NEED_ALIGN8 ++ ALIGN8_READ16(y, src0, 0) src0_ptr += 2; ++#else ++ y = VEC_LD(inptr0); inptr0 += 16; ++#endif ++ CALC_MERGED2(r_yh, g_yh, b_yh) ++ RGB_SHUFFLE ++ VEC_ST(outptr, rgb0); ++ VEC_ST(outptr + 16, rgb1); ++ VEC_ST(outptr + 32, rgb2); ++#if PIXELSIZE == 4 ++ VEC_ST(outptr + 48, rgb3); ++#endif ++ outptr += PIXELSIZE * 16; ++ } ++ } ++ ++ num_cols = out_width & 31; ++ if (num_cols) { ++ int i; ++ for (i = 0; i < (num_cols + 1) >> 1; i++) { ++ tmpbuf[i] = inptr1[i]; ++ tmpbuf[i + 16] = inptr2[i]; ++ tmpbuf[i * 2 + 32] = inptr0[i * 2]; ++ tmpbuf[i * 2 + 32 + 1] = inptr0[i * 2 + 1]; ++ } ++ cb = VEC_LD(tmpbuf); ++ cr = VEC_LD(tmpbuf + 16); ++ CALC_MERGED1 ++ ++ y = VEC_LD(tmpbuf + 32); ++ CALC_MERGED2(r_yl, g_yl, b_yl) ++ RGB_SHUFFLE ++ if (num_cols >= 16) { ++ VEC_ST(outptr, rgb0); ++ VEC_ST(outptr + 16, rgb1); ++ VEC_ST(outptr + 32, rgb2); ++#if PIXELSIZE == 4 ++ VEC_ST(outptr + 48, rgb3); ++#endif ++ outptr += PIXELSIZE * 16; ++ ++ y = VEC_LD(tmpbuf + 48); ++ CALC_MERGED2(r_yh, g_yh, b_yh) ++ RGB_SHUFFLE ++ } ++ VEC_ST(tmpbuf, rgb0); ++ VEC_ST(tmpbuf + 16, rgb1); ++ VEC_ST(tmpbuf + 32, rgb2); ++#if PIXELSIZE == 4 ++ VEC_ST(tmpbuf + 48, rgb3); ++#endif ++ memcpy(outptr, tmpbuf, (out_width & 15) * PIXELSIZE); ++ } ++} ++ ++#undef RGB_SHUFFLE_INIT ++#undef RGB_SHUFFLE ++#undef PIXELSIZE ++#undef ycc_rgbn_convert ++#undef ycc_rgbn_merged ++ +diff --git a/simd/e2k/jdcolor-e2k.c b/simd/e2k/jdcolor-e2k.c +new file mode 100644 +index 0000000..94c80e9 +--- /dev/null ++++ b/simd/e2k/jdcolor-e2k.c +@@ -0,0 +1,289 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2015, D. R. Commander. All Rights Reserved. ++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. 
++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. ++ */ ++ ++/* YCC --> RGB CONVERSION */ ++ ++#include "jsimd_e2k.h" ++ ++#define F_0_344 22554 /* FIX(0.34414) */ ++#define F_0_714 46802 /* FIX(0.71414) */ ++#define F_1_402 91881 /* FIX(1.40200) */ ++#define F_1_772 116130 /* FIX(1.77200) */ ++#define F_0_402 (F_1_402 - 65536) /* FIX(1.40200) - FIX(1) */ ++#define F_0_285 (65536 - F_0_714) /* FIX(1) - FIX(0.71414) */ ++#define F_0_228 (131072 - F_1_772) /* FIX(2) - FIX(1.77200) */ ++ ++#define SCALEBITS 16 ++#define ONE_HALF (1 << (SCALEBITS - 1)) ++ ++static const uint8_t __attribute__((aligned(16))) ++#if defined(__iset__) && __iset__ >= 5 ++ycc_rgb_shuf_const[7][48] = { ++#define SHUF_CONST3 \ ++ C0, C1, C2, \ ++ C0 + 4, C1 + 4, C2 + 4, \ ++ C0 + 8, C1 + 8, C2 + 8, \ ++ C0 + 12, C1 + 12, C2 + 12, \ ++ C0 + 16, C1 + 16, C2 + 16, \ ++ C0 + 20, C1 + 4, C2 + 4, \ ++ C0 + 8, C1 + 8, C2 + 8, \ ++ C0 + 12, C1 + 12, C2 + 12, \ ++ C0 + 16, C1 + 16, C2 + 16, \ ++ C0 + 20, C1 + 20, C2 + 20, \ ++ C0 + 24, C1 + 24, C2 + 8, \ ++ C0 + 12, C1 + 12, C2 + 12, \ ++ C0 + 16, C1 + 16, C2 + 16, \ ++ C0 + 20, C1 + 20, C2 + 20, \ ++ C0 + 24, C1 + 24, C2 + 24, \ ++ C0 + 28, C1 + 28, C2 + 28 ++#else ++ycc_rgb_shuf_const[7][24] = { ++#define SHUF_CONST3 \ ++ C0, C1, C2, \ ++ C0 + 4, C1 + 4, C2 + 4, \ ++ C0 + 8, C1 + 8, C2, \ ++ C0 + 4, C1 + 4, C2 + 4, \ ++ C0 + 8, C1 + 8, C2 + 8, \ ++ C0 + 12, C1 + 4, C2 + 4, \ ++ C0 + 8, C1 + 8, C2 + 8, \ ++ C0 + 12, C1 + 12, C2 + 12 ++#endif ++ ++#define SHUF_CONST4 C0, C1, C2, C3, C0 + 4, C1 + 4, C2 + 4, C3 + 4, \ ++ C0 + 8, C1 + 8, C2 + 8, C3 + 8, C0 + 12, C1 + 12, C2 + 12, C3 + 12 ++ ++#define TMP_RED RGB_RED ++#define TMP_GREEN RGB_GREEN ++#define TMP_BLUE RGB_BLUE ++#define PIXELSIZE RGB_PIXELSIZE ++#include "jdcoltab-e2k.c" ++ , ++#define TMP_RED EXT_RGB_RED ++#define TMP_GREEN EXT_RGB_GREEN ++#define TMP_BLUE EXT_RGB_BLUE ++#define PIXELSIZE EXT_RGB_PIXELSIZE ++#include "jdcoltab-e2k.c" ++ , ++#define TMP_RED EXT_RGBX_RED ++#define TMP_GREEN EXT_RGBX_GREEN ++#define TMP_BLUE EXT_RGBX_BLUE ++#define PIXELSIZE EXT_RGBX_PIXELSIZE ++#include "jdcoltab-e2k.c" ++ , ++#define TMP_RED EXT_BGR_RED ++#define TMP_GREEN EXT_BGR_GREEN ++#define TMP_BLUE EXT_BGR_BLUE ++#define PIXELSIZE EXT_BGR_PIXELSIZE ++#include "jdcoltab-e2k.c" ++ , ++#define TMP_RED EXT_BGRX_RED ++#define TMP_GREEN EXT_BGRX_GREEN ++#define TMP_BLUE EXT_BGRX_BLUE ++#define PIXELSIZE EXT_BGRX_PIXELSIZE ++#include "jdcoltab-e2k.c" ++ , ++#define TMP_RED EXT_XBGR_RED ++#define TMP_GREEN EXT_XBGR_GREEN ++#define TMP_BLUE EXT_XBGR_BLUE ++#define PIXELSIZE EXT_XBGR_PIXELSIZE ++#include "jdcoltab-e2k.c" ++ , ++#define TMP_RED EXT_XRGB_RED ++#define TMP_GREEN EXT_XRGB_GREEN ++#define TMP_BLUE EXT_XRGB_BLUE ++#define PIXELSIZE EXT_XRGB_PIXELSIZE ++#include "jdcoltab-e2k.c" ++}; ++ ++ /* (Original) ++ * R = Y + 1.40200 * Cr ++ * G = Y - 0.34414 * Cb - 0.71414 * Cr ++ * B = Y + 1.77200 * Cb ++ * 
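++ * Multipliers of magnitude >= 1 do not fit the signed 1.15 operands
++ * used with _mm_mulhrs_epi16(), so they are split into whole additions
++ * or subtractions of Cb/Cr plus small fractional multiplies
++ * (F_0_402, F_0_285, F_0_228 above):
++ *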
++ * (This implementation) ++ * R = Y + 0.40200 * Cr + Cr ++ * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr ++ * B = Y - 0.22800 * Cb + Cb + Cb ++ */ ++ ++#define CALC_RGB \ ++ yl = _mm_unpacklo_epi8(y, pb_zero); \ ++ yh = _mm_unpackhi_epi8(y, pb_zero); \ ++ \ ++ cbl = _mm_unpacklo_epi8(cb, pb_zero); \ ++ cbh = _mm_unpackhi_epi8(cb, pb_zero); \ ++ cbl = _mm_sub_epi16(cbl, pw_cj); \ ++ cbh = _mm_sub_epi16(cbh, pw_cj); \ ++ \ ++ crl = _mm_unpacklo_epi8(cr, pb_zero); \ ++ crh = _mm_unpackhi_epi8(cr, pb_zero); \ ++ crl = _mm_sub_epi16(crl, pw_cj); \ ++ crh = _mm_sub_epi16(crh, pw_cj); \ ++ \ ++ bl = _mm_mulhrs_epi16(cbl, pw_mf0228); \ ++ bh = _mm_mulhrs_epi16(cbh, pw_mf0228); \ ++ bl = _mm_add_epi16(bl, _mm_add_epi16(cbl, cbl)); \ ++ bh = _mm_add_epi16(bh, _mm_add_epi16(cbh, cbh)); \ ++ bl = _mm_add_epi16(bl, yl); \ ++ bh = _mm_add_epi16(bh, yh); \ ++ \ ++ rl = _mm_mulhrs_epi16(crl, pw_f0402); \ ++ rh = _mm_mulhrs_epi16(crh, pw_f0402); \ ++ rl = _mm_add_epi16(rl, crl); \ ++ rh = _mm_add_epi16(rh, crh); \ ++ rl = _mm_add_epi16(rl, yl); \ ++ rh = _mm_add_epi16(rh, yh); \ ++ \ ++ g0w = _mm_unpacklo_epi16(cbl, crl); \ ++ g1w = _mm_unpackhi_epi16(cbl, crl); \ ++ g0 = _mm_add_epi32(_mm_madd_epi16(g0w, pw_mf0344_f0285), pd_onehalf); \ ++ g1 = _mm_add_epi32(_mm_madd_epi16(g1w, pw_mf0344_f0285), pd_onehalf); \ ++ g2w = _mm_unpacklo_epi16(cbh, crh); \ ++ g3w = _mm_unpackhi_epi16(cbh, crh); \ ++ g2 = _mm_add_epi32(_mm_madd_epi16(g2w, pw_mf0344_f0285), pd_onehalf); \ ++ g3 = _mm_add_epi32(_mm_madd_epi16(g3w, pw_mf0344_f0285), pd_onehalf); \ ++ \ ++ gl = _mm_packhi_epi32(g0, g1); \ ++ gh = _mm_packhi_epi32(g2, g3); \ ++ gl = _mm_sub_epi16(gl, crl); \ ++ gh = _mm_sub_epi16(gh, crh); \ ++ gl = _mm_add_epi16(gl, yl); \ ++ gh = _mm_add_epi16(gh, yh); \ ++ \ ++ rl = _mm_packus_epi16(rl, rh); \ ++ gl = _mm_packus_epi16(gl, gh); \ ++ bl = _mm_packus_epi16(bl, bh); \ ++ \ ++ rg0 = _mm_unpacklo_epi8(rl, gl); \ ++ rg1 = _mm_unpackhi_epi8(rl, gl); \ ++ bx0 = _mm_unpacklo_epi8(bl, pb_255); \ ++ bx1 = _mm_unpackhi_epi8(bl, pb_255); \ ++ \ ++ rgb0 = _mm_unpacklo_epi16(rg0, bx0); \ ++ rgb1 = _mm_unpackhi_epi16(rg0, bx0); \ ++ rgb2 = _mm_unpacklo_epi16(rg1, bx1); \ ++ rgb3 = _mm_unpackhi_epi16(rg1, bx1); ++ ++#define CALC_MERGED1 \ ++ cbl = _mm_unpacklo_epi8(cb, pb_zero); \ ++ cbh = _mm_unpackhi_epi8(cb, pb_zero); \ ++ cbl = _mm_sub_epi16(cbl, pw_cj); \ ++ cbh = _mm_sub_epi16(cbh, pw_cj); \ ++ \ ++ crl = _mm_unpacklo_epi8(cr, pb_zero); \ ++ crh = _mm_unpackhi_epi8(cr, pb_zero); \ ++ crl = _mm_sub_epi16(crl, pw_cj); \ ++ crh = _mm_sub_epi16(crh, pw_cj); \ ++ \ ++ b_yl = _mm_mulhrs_epi16(cbl, pw_mf0228); \ ++ b_yh = _mm_mulhrs_epi16(cbh, pw_mf0228); \ ++ b_yl = _mm_add_epi16(b_yl, _mm_add_epi16(cbl, cbl)); \ ++ b_yh = _mm_add_epi16(b_yh, _mm_add_epi16(cbh, cbh)); \ ++ \ ++ r_yl = _mm_mulhrs_epi16(crl, pw_f0402); \ ++ r_yh = _mm_mulhrs_epi16(crh, pw_f0402); \ ++ r_yl = _mm_add_epi16(r_yl, crl); \ ++ r_yh = _mm_add_epi16(r_yh, crh); \ ++ \ ++ g_y0w = _mm_unpacklo_epi16(cbl, crl); \ ++ g_y1w = _mm_unpackhi_epi16(cbl, crl); \ ++ g_y0 = _mm_add_epi32(_mm_madd_epi16(g_y0w, pw_mf0344_f0285), pd_onehalf); \ ++ g_y1 = _mm_add_epi32(_mm_madd_epi16(g_y1w, pw_mf0344_f0285), pd_onehalf); \ ++ g_y2w = _mm_unpacklo_epi16(cbh, crh); \ ++ g_y3w = _mm_unpackhi_epi16(cbh, crh); \ ++ g_y2 = _mm_add_epi32(_mm_madd_epi16(g_y2w, pw_mf0344_f0285), pd_onehalf); \ ++ g_y3 = _mm_add_epi32(_mm_madd_epi16(g_y3w, pw_mf0344_f0285), pd_onehalf); \ ++ \ ++ g_yl = _mm_packhi_epi32(g_y0, g_y1); \ ++ g_yh = _mm_packhi_epi32(g_y2, g_y3); \ ++ g_yl = 
_mm_sub_epi16(g_yl, crl); \ ++ g_yh = _mm_sub_epi16(g_yh, crh); ++ ++#define CALC_MERGED2(r_yl, g_yl, b_yl) \ ++ yl = _mm_unpacklo_epi8(y, pb_zero); \ ++ yh = _mm_unpackhi_epi8(y, pb_zero); \ ++ bl = _mm_add_epi16(_mm_unpacklo_epi16(b_yl, b_yl), yl); \ ++ bh = _mm_add_epi16(_mm_unpackhi_epi16(b_yl, b_yl), yh); \ ++ rl = _mm_add_epi16(_mm_unpacklo_epi16(r_yl, r_yl), yl); \ ++ rh = _mm_add_epi16(_mm_unpackhi_epi16(r_yl, r_yl), yh); \ ++ gl = _mm_add_epi16(_mm_unpacklo_epi16(g_yl, g_yl), yl); \ ++ gh = _mm_add_epi16(_mm_unpackhi_epi16(g_yl, g_yl), yh); \ ++ rl = _mm_packus_epi16(rl, rh); \ ++ gl = _mm_packus_epi16(gl, gh); \ ++ bl = _mm_packus_epi16(bl, bh); \ ++ \ ++ rg0 = _mm_unpacklo_epi8(rl, gl); \ ++ rg1 = _mm_unpackhi_epi8(rl, gl); \ ++ bx0 = _mm_unpacklo_epi8(bl, pb_255); \ ++ bx1 = _mm_unpackhi_epi8(bl, pb_255); \ ++ \ ++ rgb0 = _mm_unpacklo_epi16(rg0, bx0); \ ++ rgb1 = _mm_unpackhi_epi16(rg0, bx0); \ ++ rgb2 = _mm_unpacklo_epi16(rg1, bx1); \ ++ rgb3 = _mm_unpackhi_epi16(rg1, bx1); ++ ++#define PIXELSIZE 3 ++#if defined(__iset__) && __iset__ >= 5 ++#define RGB_SHUFFLE_INIT __m128i \ ++ rgb_index0 = VEC_LD(ycc_rgb_shuf_const[shuf_idx]), \ ++ rgb_index1 = VEC_LD(ycc_rgb_shuf_const[shuf_idx] + 16), \ ++ rgb_index2 = VEC_LD(ycc_rgb_shuf_const[shuf_idx] + 32); ++#define RGB_SHUFFLE \ ++ rgb0 = _mm_shuffle2_epi8(rgb0, rgb1, rgb_index0); \ ++ rgb1 = _mm_shuffle2_epi8(rgb1, rgb2, rgb_index1); \ ++ rgb2 = _mm_shuffle2_epi8(rgb2, rgb3, rgb_index2); ++#else ++#define RGB_SHUFFLE_INIT __m64 \ ++ rgb_index0 = *(__m64*)ycc_rgb_shuf_const[shuf_idx], \ ++ rgb_index1 = *(__m64*)(ycc_rgb_shuf_const[shuf_idx] + 8), \ ++ rgb_index2 = *(__m64*)(ycc_rgb_shuf_const[shuf_idx] + 16); ++#define RGB_SHUFFLE { \ ++ union { __m128i v; __m64 d[2]; } a = { rgb0 }, \ ++ b = { rgb1 }, c = { rgb2 }, d = { rgb3 }; \ ++ a.d[0] = _mm_shuffle2_pi8(a.d[0], a.d[1], rgb_index0); \ ++ a.d[1] = _mm_shuffle2_pi8(a.d[1], b.d[0], rgb_index1); \ ++ b.d[0] = _mm_shuffle2_pi8(b.d[0], b.d[1], rgb_index2); \ ++ b.d[1] = _mm_shuffle2_pi8(c.d[0], c.d[1], rgb_index0); \ ++ c.d[0] = _mm_shuffle2_pi8(c.d[1], d.d[0], rgb_index1); \ ++ c.d[1] = _mm_shuffle2_pi8(d.d[0], d.d[1], rgb_index2); \ ++ rgb0 = a.v; rgb1 = b.v; rgb2 = c.v; \ ++} ++#endif ++ ++#define ycc_rgbn_convert jsimd_ycc_rgb3_convert_e2k ++#define ycc_rgbn_merged jsimd_ycc_rgb3_merged_upsample_e2k ++#include "jdcolext-e2k.c" ++ ++#define PIXELSIZE 4 ++#define RGB_SHUFFLE_INIT __m128i \ ++ rgb_index0 = VEC_LD(ycc_rgb_shuf_const[shuf_idx]); ++#define RGB_SHUFFLE \ ++ rgb0 = _mm_shuffle_epi8(rgb0, rgb_index0); \ ++ rgb1 = _mm_shuffle_epi8(rgb1, rgb_index0); \ ++ rgb2 = _mm_shuffle_epi8(rgb2, rgb_index0); \ ++ rgb3 = _mm_shuffle_epi8(rgb3, rgb_index0); ++ ++#define ycc_rgbn_convert jsimd_ycc_rgb4_convert_e2k ++#define ycc_rgbn_merged jsimd_ycc_rgb4_merged_upsample_e2k ++#include "jdcolext-e2k.c" ++ +diff --git a/simd/e2k/jdcoltab-e2k.c b/simd/e2k/jdcoltab-e2k.c +new file mode 100644 +index 0000000..e19666d +--- /dev/null ++++ b/simd/e2k/jdcoltab-e2k.c +@@ -0,0 +1,80 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. 
++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. ++ */ ++ ++/* This file is included by jdcolor-e2k.c */ ++ ++#if TMP_RED == 0 ++#define C0 0 ++#elif TMP_GREEN == 0 ++#define C0 1 ++#elif TMP_BLUE == 0 ++#define C0 2 ++#else ++#define C0 3 ++#endif ++ ++#if TMP_RED == 1 ++#define C1 0 ++#elif TMP_GREEN == 1 ++#define C1 1 ++#elif TMP_BLUE == 1 ++#define C1 2 ++#else ++#define C1 3 ++#endif ++ ++#if TMP_RED == 2 ++#define C2 0 ++#elif TMP_GREEN == 2 ++#define C2 1 ++#elif TMP_BLUE == 2 ++#define C2 2 ++#else ++#define C2 3 ++#endif ++ ++#if TMP_RED == 3 ++#define C3 0 ++#elif TMP_GREEN == 3 ++#define C3 1 ++#elif TMP_BLUE == 3 ++#define C3 2 ++#else ++#define C3 3 ++#endif ++ ++#if PIXELSIZE == 3 ++{ SHUF_CONST3 } ++#else ++{ SHUF_CONST4 } ++#endif ++ ++#undef C0 ++#undef C1 ++#undef C2 ++#undef C3 ++ ++#undef TMP_RED ++#undef TMP_GREEN ++#undef TMP_BLUE ++#undef PIXELSIZE ++ +diff --git a/simd/e2k/jdsample-e2k.c b/simd/e2k/jdsample-e2k.c +new file mode 100644 +index 0000000..572b3af +--- /dev/null ++++ b/simd/e2k/jdsample-e2k.c +@@ -0,0 +1,389 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2015, D. R. Commander. All Rights Reserved. ++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. ++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. 
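++ *
++ * NOTE: The "fancy" upsamplers below implement the usual triangle
++ * filter: the nearer input sample is weighted 3 and the further one 1,
++ * with rounding offsets added before the final shift (>> 2 for h2v1,
++ * >> 4 for h2v2).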
++ */ ++ ++/* CHROMA UPSAMPLING */ ++ ++#include "jsimd_e2k.h" ++ ++ ++void jsimd_h2v1_fancy_upsample_e2k(int max_v_samp_factor, ++ JDIMENSION downsampled_width, ++ JSAMPARRAY input_data, ++ JSAMPARRAY *output_data_ptr) ++{ ++ JSAMPARRAY output_data = *output_data_ptr; ++ JSAMPROW inptr, outptr; ++ int inrow, incol; ++ ++ __m128i pb_zero = _mm_setzero_si128(); ++ __m128i this0, last0, p_last0, next0 = pb_zero, p_next0, out; ++ __m128i this0l, this0h, last0l, last0h, ++ next0l, next0h, outle, outhe, outlo, outho; ++#ifdef NEED_ALIGN8 ++ ALIGN8_COMMON ++ ALIGN8_VARS(src) ++#endif ++ ++ /* Constants */ ++ __m128i pw_three = _mm_set1_epi16(3), ++ next_index_lastcol = _mm_setr_epi8( ++ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15), ++ pw_one = _mm_set1_epi16(1), pw_two = _mm_set1_epi16(2); ++ ++ if (downsampled_width > 0) ++ for (inrow = 0; inrow < max_v_samp_factor; inrow++) { ++ inptr = input_data[inrow]; ++ outptr = output_data[inrow]; ++ ++ if (downsampled_width & 15) ++ inptr[downsampled_width] = inptr[downsampled_width - 1]; ++ ++#ifdef NEED_ALIGN8 ++ ALIGN8_START(inptr, src) ++ ALIGN8_READ16(this0, src, 0) ++#else ++ this0 = VEC_LD(inptr); ++#endif ++ last0 = _mm_bslli_si128(this0, 15); ++ ++ PRAGMA_E2K("ivdep") ++ for (incol = downsampled_width; incol > 0; ++ incol -= 16, outptr += 32) { ++ ++ p_last0 = _mm_alignr_epi8(this0, last0, 15); ++ last0 = this0; ++ ++ if (__builtin_expect(incol <= 16, 0)) ++ p_next0 = _mm_shuffle_epi8(this0, next_index_lastcol); ++ else { ++#ifdef NEED_ALIGN8 ++ ALIGN8_READ16(next0, src, 1) src_ptr += 2; ++#else ++ next0 = VEC_LD(inptr + 16); inptr += 16; ++#endif ++ p_next0 = _mm_alignr_epi8(next0, this0, 1); ++ } ++ ++ this0l = _mm_mullo_epi16(_mm_unpacklo_epi8(this0, pb_zero), pw_three); ++ last0l = _mm_unpacklo_epi8(p_last0, pb_zero); ++ next0l = _mm_unpacklo_epi8(p_next0, pb_zero); ++ last0l = _mm_add_epi16(last0l, pw_one); ++ next0l = _mm_add_epi16(next0l, pw_two); ++ ++ outle = _mm_add_epi16(this0l, last0l); ++ outlo = _mm_add_epi16(this0l, next0l); ++ outle = _mm_srli_epi16(outle, 2); ++ outlo = _mm_srli_epi16(outlo, 2); ++ ++ out = _mm_or_si128(outle, _mm_slli_epi16(outlo, 8)); ++ VEC_ST(outptr, out); ++ ++ if (__builtin_expect(incol <= 8, 0)) break; ++ ++ this0h = _mm_mullo_epi16(_mm_unpackhi_epi8(this0, pb_zero), pw_three); ++ last0h = _mm_unpackhi_epi8(p_last0, pb_zero); ++ next0h = _mm_unpackhi_epi8(p_next0, pb_zero); ++ last0h = _mm_add_epi16(last0h, pw_one); ++ next0h = _mm_add_epi16(next0h, pw_two); ++ ++ outhe = _mm_add_epi16(this0h, last0h); ++ outho = _mm_add_epi16(this0h, next0h); ++ outhe = _mm_srli_epi16(outhe, 2); ++ outho = _mm_srli_epi16(outho, 2); ++ ++ out = _mm_or_si128(outhe, _mm_slli_epi16(outho, 8)); ++ VEC_ST(outptr + 16, out); ++ ++ this0 = next0; ++ } ++ } ++} ++ ++ ++void jsimd_h2v2_fancy_upsample_e2k(int max_v_samp_factor, ++ JDIMENSION downsampled_width, ++ JSAMPARRAY input_data, ++ JSAMPARRAY *output_data_ptr) ++{ ++ JSAMPARRAY output_data = *output_data_ptr; ++ JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1; ++ int inrow, outrow, incol; ++#ifdef NEED_ALIGN8 ++ ALIGN8_COMMON ++ ALIGN8_VARS(src_1) ++ ALIGN8_VARS(src0) ++ ALIGN8_VARS(src1) ++#endif ++ ++ __m128i pb_zero = _mm_setzero_si128(); ++ __m128i this_1, this0, this1, out; ++ __m128i this_1l, this_1h, this0l, this0h, this1l, this1h, ++ lastcolsum_1h, lastcolsum1h, ++ p_lastcolsum_1l, p_lastcolsum_1h, p_lastcolsum1l, p_lastcolsum1h, ++ thiscolsum_1l, thiscolsum_1h, thiscolsum1l, thiscolsum1h, ++ nextcolsum_1l = pb_zero, nextcolsum_1h = pb_zero, ++ 
nextcolsum1l = pb_zero, nextcolsum1h = pb_zero,
++    p_nextcolsum_1l, p_nextcolsum_1h, p_nextcolsum1l, p_nextcolsum1h,
++    tmpl, tmph, outle, outhe, outlo, outho;
++
++  /* Constants */
++  __m128i pw_three = _mm_set1_epi16(3),
++    pw_seven = _mm_set1_epi16(7), pw_eight = _mm_set1_epi16(8),
++    next_index_lastcol = _mm_setr_epi8(
++      2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14, 15);
++
++  if (downsampled_width > 0)
++  for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
++
++    inptr_1 = input_data[inrow - 1];
++    inptr0 = input_data[inrow];
++    inptr1 = input_data[inrow + 1];
++    outptr0 = output_data[outrow++];
++    outptr1 = output_data[outrow++];
++
++    if (downsampled_width & 15) {
++      inptr_1[downsampled_width] = inptr_1[downsampled_width - 1];
++      inptr0[downsampled_width] = inptr0[downsampled_width - 1];
++      inptr1[downsampled_width] = inptr1[downsampled_width - 1];
++    }
++
++#ifdef NEED_ALIGN8
++    ALIGN8_START(inptr0, src0)
++    ALIGN8_START(inptr_1, src_1)
++    ALIGN8_START(inptr1, src1)
++    ALIGN8_READ16(this0, src0, 0)
++    ALIGN8_READ16(this_1, src_1, 0)
++    ALIGN8_READ16(this1, src1, 0)
++#else
++    this0 = VEC_LD(inptr0);
++    this_1 = VEC_LD(inptr_1);
++    this1 = VEC_LD(inptr1);
++#endif
++
++    this0l = _mm_unpacklo_epi8(this0, pb_zero);
++    this0h = _mm_unpackhi_epi8(this0, pb_zero);
++    this0l = _mm_mullo_epi16(this0l, pw_three);
++    this0h = _mm_mullo_epi16(this0h, pw_three);
++
++    this_1l = _mm_unpacklo_epi8(this_1, pb_zero);
++    this_1h = _mm_unpackhi_epi8(this_1, pb_zero);
++    thiscolsum_1l = _mm_add_epi16(this0l, this_1l);
++    thiscolsum_1h = _mm_add_epi16(this0h, this_1h);
++    lastcolsum_1h = _mm_bslli_si128(thiscolsum_1l, 14);
++
++    this1l = _mm_unpacklo_epi8(this1, pb_zero);
++    this1h = _mm_unpackhi_epi8(this1, pb_zero);
++    thiscolsum1l = _mm_add_epi16(this0l, this1l);
++    thiscolsum1h = _mm_add_epi16(this0h, this1h);
++    lastcolsum1h = _mm_bslli_si128(thiscolsum1l, 14);
++
++    PRAGMA_E2K("ivdep")
++    for (incol = downsampled_width; incol > 0;
++        incol -= 16, outptr0 += 32, outptr1 += 32) {
++
++      p_lastcolsum_1l = _mm_alignr_epi8(thiscolsum_1l, lastcolsum_1h, 14);
++      p_lastcolsum_1h = _mm_alignr_epi8(thiscolsum_1h, thiscolsum_1l, 14);
++      p_lastcolsum1l = _mm_alignr_epi8(thiscolsum1l, lastcolsum1h, 14);
++      p_lastcolsum1h = _mm_alignr_epi8(thiscolsum1h, thiscolsum1l, 14);
++      lastcolsum_1h = thiscolsum_1h;
++      lastcolsum1h = thiscolsum1h;
++
++      if (__builtin_expect(incol <= 16, 0)) {
++        p_nextcolsum_1l = _mm_alignr_epi8(thiscolsum_1h, thiscolsum_1l, 2);
++        p_nextcolsum_1h = _mm_shuffle_epi8(thiscolsum_1h, next_index_lastcol);
++        p_nextcolsum1l = _mm_alignr_epi8(thiscolsum1h, thiscolsum1l, 2);
++        p_nextcolsum1h = _mm_shuffle_epi8(thiscolsum1h, next_index_lastcol);
++      } else {
++#ifdef NEED_ALIGN8
++        ALIGN8_READ16(this0, src0, 1) src0_ptr += 2;
++        ALIGN8_READ16(this_1, src_1, 1) src_1_ptr += 2;
++        ALIGN8_READ16(this1, src1, 1) src1_ptr += 2;
++#else
++        this0 = VEC_LD(inptr0 + 16); inptr0 += 16;
++        this_1 = VEC_LD(inptr_1 + 16); inptr_1 += 16;
++        this1 = VEC_LD(inptr1 + 16); inptr1 += 16;
++#endif
++        this0l = _mm_unpacklo_epi8(this0, pb_zero);
++        this0h = _mm_unpackhi_epi8(this0, pb_zero);
++        this0l = _mm_mullo_epi16(this0l, pw_three);
++        this0h = _mm_mullo_epi16(this0h, pw_three);
++
++        this_1l = _mm_unpacklo_epi8(this_1, pb_zero);
++        this_1h = _mm_unpackhi_epi8(this_1, pb_zero);
++        nextcolsum_1l = _mm_add_epi16(this0l, this_1l);
++        nextcolsum_1h = _mm_add_epi16(this0h, this_1h);
++        p_nextcolsum_1l = _mm_alignr_epi8(thiscolsum_1h, thiscolsum_1l, 2);
++        p_nextcolsum_1h =
_mm_alignr_epi8(nextcolsum_1l, thiscolsum_1h, 2); ++ ++ this1l = _mm_unpacklo_epi8(this1, pb_zero); ++ this1h = _mm_unpackhi_epi8(this1, pb_zero); ++ nextcolsum1l = _mm_add_epi16(this0l, this1l); ++ nextcolsum1h = _mm_add_epi16(this0h, this1h); ++ p_nextcolsum1l = _mm_alignr_epi8(thiscolsum1h, thiscolsum1l, 2); ++ p_nextcolsum1h = _mm_alignr_epi8(nextcolsum1l, thiscolsum1h, 2); ++ } ++ ++ /* Process the upper row */ ++ ++ tmpl = _mm_mullo_epi16(thiscolsum_1l, pw_three); ++ outle = _mm_add_epi16(tmpl, p_lastcolsum_1l); ++ outle = _mm_add_epi16(outle, pw_eight); ++ outle = _mm_srli_epi16(outle, 4); ++ ++ outlo = _mm_add_epi16(tmpl, p_nextcolsum_1l); ++ outlo = _mm_add_epi16(outlo, pw_seven); ++ outlo = _mm_srli_epi16(outlo, 4); ++ ++ out = _mm_or_si128(outle, _mm_slli_epi16(outlo, 8)); ++ VEC_ST(outptr0, out); ++ ++ /* Process the lower row */ ++ ++ tmpl = _mm_mullo_epi16(thiscolsum1l, pw_three); ++ outle = _mm_add_epi16(tmpl, p_lastcolsum1l); ++ outle = _mm_add_epi16(outle, pw_eight); ++ outle = _mm_srli_epi16(outle, 4); ++ ++ outlo = _mm_add_epi16(tmpl, p_nextcolsum1l); ++ outlo = _mm_add_epi16(outlo, pw_seven); ++ outlo = _mm_srli_epi16(outlo, 4); ++ ++ out = _mm_or_si128(outle, _mm_slli_epi16(outlo, 8)); ++ VEC_ST(outptr1, out); ++ ++ if (__builtin_expect(incol <= 8, 0)) break; ++ ++ tmph = _mm_mullo_epi16(thiscolsum_1h, pw_three); ++ outhe = _mm_add_epi16(tmph, p_lastcolsum_1h); ++ outhe = _mm_add_epi16(outhe, pw_eight); ++ outhe = _mm_srli_epi16(outhe, 4); ++ ++ outho = _mm_add_epi16(tmph, p_nextcolsum_1h); ++ outho = _mm_add_epi16(outho, pw_seven); ++ outho = _mm_srli_epi16(outho, 4); ++ ++ out = _mm_or_si128(outhe, _mm_slli_epi16(outho, 8)); ++ VEC_ST(outptr0 + 16, out); ++ ++ tmph = _mm_mullo_epi16(thiscolsum1h, pw_three); ++ outhe = _mm_add_epi16(tmph, p_lastcolsum1h); ++ outhe = _mm_add_epi16(outhe, pw_eight); ++ outhe = _mm_srli_epi16(outhe, 4); ++ ++ outho = _mm_add_epi16(tmph, p_nextcolsum1h); ++ outho = _mm_add_epi16(outho, pw_seven); ++ outho = _mm_srli_epi16(outho, 4); ++ ++ out = _mm_or_si128(outhe, _mm_slli_epi16(outho, 8)); ++ VEC_ST(outptr1 + 16, out); ++ ++ thiscolsum_1l = nextcolsum_1l; thiscolsum_1h = nextcolsum_1h; ++ thiscolsum1l = nextcolsum1l; thiscolsum1h = nextcolsum1h; ++ } ++ } ++} ++ ++ ++/* These are rarely used (mainly just for decompressing YCCK images) */ ++ ++void jsimd_h2v1_upsample_e2k(int max_v_samp_factor, ++ JDIMENSION out_width, ++ JSAMPARRAY input_data, ++ JSAMPARRAY *output_data_ptr) ++{ ++ JSAMPARRAY output_data = *output_data_ptr; ++ JSAMPROW inptr, outptr; ++ int inrow, incol; ++ ++ __m128i in, inl, inh; ++#ifdef NEED_ALIGN8 ++ ALIGN8_COMMON ++ ALIGN8_VARS(src) ++#endif ++ ++ if (out_width > 0) ++ for (inrow = 0; inrow < max_v_samp_factor; inrow++) { ++ inptr = input_data[inrow]; ++ outptr = output_data[inrow]; ++#ifdef NEED_ALIGN8 ++ ALIGN8_START(inptr, src) ++#endif ++ PRAGMA_E2K("ivdep") ++ for (incol = out_width; incol > 0; ++ incol -= 32, outptr += 32) { ++#ifdef NEED_ALIGN8 ++ ALIGN8_READ16(in, src, 0) src_ptr += 2; ++#else ++ in = VEC_LD(inptr); inptr += 16; ++#endif ++ inl = _mm_unpacklo_epi8(in, in); ++ inh = _mm_unpackhi_epi8(in, in); ++ ++ VEC_ST(outptr, inl); ++ VEC_ST(outptr + 16, inh); ++ } ++ } ++} ++ ++ ++void jsimd_h2v2_upsample_e2k(int max_v_samp_factor, ++ JDIMENSION out_width, ++ JSAMPARRAY input_data, ++ JSAMPARRAY *output_data_ptr) ++{ ++ JSAMPARRAY output_data = *output_data_ptr; ++ JSAMPROW inptr, outptr0, outptr1; ++ int inrow, outrow, incol; ++ ++ __m128i in, inl, inh; ++#ifdef NEED_ALIGN8 ++ ALIGN8_COMMON ++ 
ALIGN8_VARS(src) ++#endif ++ ++ if (out_width > 0) ++ for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) { ++ inptr = input_data[inrow]; ++ outptr0 = output_data[outrow++]; ++ outptr1 = output_data[outrow++]; ++#ifdef NEED_ALIGN8 ++ ALIGN8_START(inptr, src) ++#endif ++ PRAGMA_E2K("ivdep") ++ for (incol = out_width; incol > 0; ++ incol -= 32, outptr0 += 32, outptr1 += 32) { ++#ifdef NEED_ALIGN8 ++ ALIGN8_READ16(in, src, 0) src_ptr += 2; ++#else ++ in = VEC_LD(inptr); inptr += 16; ++#endif ++ inl = _mm_unpacklo_epi8(in, in); ++ inh = _mm_unpackhi_epi8(in, in); ++ ++ VEC_ST(outptr0, inl); ++ VEC_ST(outptr1, inl); ++ VEC_ST(outptr0 + 16, inh); ++ VEC_ST(outptr1 + 16, inh); ++ } ++ } ++} +diff --git a/simd/e2k/jfdctflt-e2k.c b/simd/e2k/jfdctflt-e2k.c +new file mode 100644 +index 0000000..e3c4d94 +--- /dev/null ++++ b/simd/e2k/jfdctflt-e2k.c +@@ -0,0 +1,127 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2014, D. R. Commander. All Rights Reserved. ++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. ++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. 
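
For reference, jsimd_h2v2_fancy_upsample_e2k above is libjpeg's triangle filter: each 16-bit column sum is 3 * (nearest row) + (other row), and every output sample blends 3 * (its own column sum) with a neighboring column sum, rounded by 8 (even columns) or 7 (odd columns) before the final >> 4. A scalar sketch of the upper output row, assuming edge columns are replicated as the padding code above arranges; the names here are illustrative, not from the patch:

    /* Scalar model of one h2v2 fancy-upsampled row (illustration only). */
    static void h2v2_fancy_upper_row(const unsigned char *prev,
                                     const unsigned char *cur,
                                     unsigned char *out, int w)
    {
      int col;
      for (col = 0; col < w; col++) {
        int c0 = col > 0 ? col - 1 : 0;           /* left edge replicated */
        int c1 = col < w - 1 ? col + 1 : col;     /* right edge replicated */
        int this_sum = 3 * cur[col] + prev[col];  /* column sums, scaled x4 */
        int last_sum = 3 * cur[c0] + prev[c0];
        int next_sum = 3 * cur[c1] + prev[c1];
        out[2 * col]     = (unsigned char)((3 * this_sum + last_sum + 8) >> 4);
        out[2 * col + 1] = (unsigned char)((3 * this_sum + next_sum + 7) >> 4);
      }
    }
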
++ */ ++ ++/* FLOAT FORWARD DCT */ ++ ++#include "jsimd_e2k.h" ++ ++#define DO_FDCT(in, out) { \ ++ tmp0 = _mm_add_ps(in##0, in##7); \ ++ tmp7 = _mm_sub_ps(in##0, in##7); \ ++ tmp1 = _mm_add_ps(in##1, in##6); \ ++ tmp6 = _mm_sub_ps(in##1, in##6); \ ++ tmp2 = _mm_add_ps(in##2, in##5); \ ++ tmp5 = _mm_sub_ps(in##2, in##5); \ ++ tmp3 = _mm_add_ps(in##3, in##4); \ ++ tmp4 = _mm_sub_ps(in##3, in##4); \ ++ \ ++ /* Even part */ \ ++ \ ++ tmp10 = _mm_add_ps(tmp0, tmp3); \ ++ tmp13 = _mm_sub_ps(tmp0, tmp3); \ ++ tmp11 = _mm_add_ps(tmp1, tmp2); \ ++ tmp12 = _mm_sub_ps(tmp1, tmp2); \ ++ \ ++ out##0 = _mm_add_ps(tmp10, tmp11); \ ++ out##4 = _mm_sub_ps(tmp10, tmp11); \ ++ \ ++ z1 = _mm_mul_ps(_mm_add_ps(tmp12, tmp13), pd_f0707); \ ++ out##2 = _mm_add_ps(tmp13, z1); \ ++ out##6 = _mm_sub_ps(tmp13, z1); \ ++ \ ++ /* Odd part */ \ ++ \ ++ tmp10 = _mm_add_ps(tmp4, tmp5); \ ++ tmp11 = _mm_add_ps(tmp5, tmp6); \ ++ tmp12 = _mm_add_ps(tmp6, tmp7); \ ++ \ ++ z5 = _mm_mul_ps(_mm_sub_ps(tmp10, tmp12), pd_f0382); \ ++ z2 = _mm_add_ps(_mm_mul_ps(tmp10, pd_f0541), z5); \ ++ z4 = _mm_add_ps(_mm_mul_ps(tmp12, pd_f1306), z5); \ ++ z3 = _mm_mul_ps(tmp11, pd_f0707); \ ++ \ ++ z11 = _mm_add_ps(tmp7, z3); \ ++ z13 = _mm_sub_ps(tmp7, z3); \ ++ \ ++ out##5 = _mm_add_ps(z13, z2); \ ++ out##3 = _mm_sub_ps(z13, z2); \ ++ out##1 = _mm_add_ps(z11, z4); \ ++ out##7 = _mm_sub_ps(z11, z4); \ ++} ++ ++#define LOAD_DATA(a, b, c, d, l, i) \ ++ l##a = _mm_loadu_ps(data + a * 8 + i); \ ++ l##b = _mm_loadu_ps(data + b * 8 + i); \ ++ l##c = _mm_loadu_ps(data + c * 8 + i); \ ++ l##d = _mm_loadu_ps(data + d * 8 + i); ++ ++#define STORE_DATA(a, b, c, d, l, i) \ ++ _mm_storeu_ps(data + a * 8 + i, l##a); \ ++ _mm_storeu_ps(data + b * 8 + i, l##b); \ ++ _mm_storeu_ps(data + c * 8 + i, l##c); \ ++ _mm_storeu_ps(data + d * 8 + i, l##d); ++ ++ ++void jsimd_fdct_float_e2k(FAST_FLOAT *data) ++{ ++ __m128 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, ++ tmp10, tmp11, tmp12, tmp13, z1, z2, z3, z4, z5, z11, z13; ++ __m128 l0, l1, l2, l3, l4, l5, l6, l7; ++ __m128 h0, h1, h2, h3, h4, h5, h6, h7; ++ __m128 x0, x1, x2, x3, x4, x5, x6, x7; ++ __m128 y0, y1, y2, y3, y4, y5, y6, y7; ++ ++ /* Constants */ ++ __m128 pd_f0382 = _mm_set1_ps(0.382683433f), ++ pd_f0541 = _mm_set1_ps(0.541196100f), ++ pd_f0707 = _mm_set1_ps(0.707106781f), ++ pd_f1306 = _mm_set1_ps(1.306562965f); ++ ++ /* Pass 1: process columns */ ++ ++ LOAD_DATA(0, 1, 2, 3, x, 0) ++ LOAD_DATA(0, 1, 2, 3, y, 4) ++ TRANSPOSE_FLOAT(x0, x1, x2, x3, l0, l1, l2, l3) ++ TRANSPOSE_FLOAT(y0, y1, y2, y3, l4, l5, l6, l7) ++ DO_FDCT(l, l); ++ ++ LOAD_DATA(4, 5, 6, 7, x, 0) ++ LOAD_DATA(4, 5, 6, 7, y, 4) ++ TRANSPOSE_FLOAT(x4, x5, x6, x7, h0, h1, h2, h3) ++ TRANSPOSE_FLOAT(y4, y5, y6, y7, h4, h5, h6, h7) ++ DO_FDCT(h, h); ++ ++ /* Pass 2: process rows */ ++ ++ TRANSPOSE_FLOAT(l0, l1, l2, l3, x0, x1, x2, x3) ++ TRANSPOSE_FLOAT(h0, h1, h2, h3, x4, x5, x6, x7) ++ DO_FDCT(x, x); ++ STORE_DATA(0, 1, 2, 3, x, 0) ++ STORE_DATA(4, 5, 6, 7, x, 0) ++ ++ TRANSPOSE_FLOAT(l4, l5, l6, l7, y0, y1, y2, y3) ++ TRANSPOSE_FLOAT(h4, h5, h6, h7, y4, y5, y6, y7) ++ DO_FDCT(y, y); ++ STORE_DATA(0, 1, 2, 3, y, 4) ++ STORE_DATA(4, 5, 6, 7, y, 4) ++} +diff --git a/simd/e2k/jfdctfst-e2k.c b/simd/e2k/jfdctfst-e2k.c +new file mode 100644 +index 0000000..9e58f05 +--- /dev/null ++++ b/simd/e2k/jfdctfst-e2k.c +@@ -0,0 +1,145 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2014, D. R. Commander. All Rights Reserved. 
++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. ++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. ++ */ ++ ++/* FAST INTEGER FORWARD DCT */ ++ ++#include "jsimd_e2k.h" ++ ++ ++#define F_0_382 98 /* FIX(0.382683433) */ ++#define F_0_541 139 /* FIX(0.541196100) */ ++#define F_0_707 181 /* FIX(0.707106781) */ ++#define F_1_306 334 /* FIX(1.306562965) */ ++ ++#define CONST_BITS 8 ++#define PRE_MULTIPLY_SCALE_BITS 2 ++#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) ++ ++ ++#define DO_FDCT() { \ ++ /* Even part */ \ ++ \ ++ tmp10 = _mm_add_epi16(tmp0, tmp3); \ ++ tmp13 = _mm_sub_epi16(tmp0, tmp3); \ ++ tmp11 = _mm_add_epi16(tmp1, tmp2); \ ++ tmp12 = _mm_sub_epi16(tmp1, tmp2); \ ++ \ ++ out0 = _mm_add_epi16(tmp10, tmp11); \ ++ out4 = _mm_sub_epi16(tmp10, tmp11); \ ++ \ ++ z1 = _mm_add_epi16(tmp12, tmp13); \ ++ z1 = _mm_slli_epi16(z1, PRE_MULTIPLY_SCALE_BITS); \ ++ z1 = _mm_mulhi_epi16(z1, pw_0707); \ ++ \ ++ out2 = _mm_add_epi16(tmp13, z1); \ ++ out6 = _mm_sub_epi16(tmp13, z1); \ ++ \ ++ /* Odd part */ \ ++ \ ++ tmp10 = _mm_add_epi16(tmp4, tmp5); \ ++ tmp11 = _mm_add_epi16(tmp5, tmp6); \ ++ tmp12 = _mm_add_epi16(tmp6, tmp7); \ ++ \ ++ tmp10 = _mm_slli_epi16(tmp10, PRE_MULTIPLY_SCALE_BITS); \ ++ tmp12 = _mm_slli_epi16(tmp12, PRE_MULTIPLY_SCALE_BITS); \ ++ z5 = _mm_sub_epi16(tmp10, tmp12); \ ++ z5 = _mm_mulhi_epi16(z5, pw_0382); \ ++ \ ++ z2 = _mm_add_epi16(_mm_mulhi_epi16(tmp10, pw_0541), z5); \ ++ z4 = _mm_add_epi16(_mm_mulhi_epi16(tmp12, pw_1306), z5); \ ++ \ ++ tmp11 = _mm_slli_epi16(tmp11, PRE_MULTIPLY_SCALE_BITS); \ ++ z3 = _mm_mulhi_epi16(tmp11, pw_0707); \ ++ \ ++ z11 = _mm_add_epi16(tmp7, z3); \ ++ z13 = _mm_sub_epi16(tmp7, z3); \ ++ \ ++ out5 = _mm_add_epi16(z13, z2); \ ++ out3 = _mm_sub_epi16(z13, z2); \ ++ out1 = _mm_add_epi16(z11, z4); \ ++ out7 = _mm_sub_epi16(z11, z4); \ ++} ++ ++ ++void jsimd_fdct_ifast_e2k(DCTELEM *data) ++{ ++ __m128i row0, row1, row2, row3, row4, row5, row6, row7, ++ col0, col1, col2, col3, col4, col5, col6, col7, ++ tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13, ++ z1, z2, z3, z4, z5, z11, z13, ++ out0, out1, out2, out3, out4, out5, out6, out7; ++ ++ /* Constants */ ++ __m128i pw_0382 = _mm_set1_epi16(F_0_382 << CONST_SHIFT), ++ pw_0541 = _mm_set1_epi16(F_0_541 << CONST_SHIFT), ++ pw_0707 = _mm_set1_epi16(F_0_707 << CONST_SHIFT), ++ pw_1306 = _mm_set1_epi16(F_1_306 << CONST_SHIFT); ++ ++ /* Pass 1: process rows */ ++ ++ row0 = VEC_LD(data + 0 * 8); ++ row1 = VEC_LD(data + 1 * 8); ++ row2 = VEC_LD(data + 2 * 8); ++ row3 = VEC_LD(data + 3 * 8); ++ row4 = VEC_LD(data + 4 * 8); ++ row5 = VEC_LD(data + 5 * 8); ++ row6 = VEC_LD(data + 6 * 8); ++ row7 = VEC_LD(data + 7 * 8); ++ ++ TRANSPOSE(row, 
col); ++ ++ tmp0 = _mm_add_epi16(col0, col7); ++ tmp7 = _mm_sub_epi16(col0, col7); ++ tmp1 = _mm_add_epi16(col1, col6); ++ tmp6 = _mm_sub_epi16(col1, col6); ++ tmp2 = _mm_add_epi16(col2, col5); ++ tmp5 = _mm_sub_epi16(col2, col5); ++ tmp3 = _mm_add_epi16(col3, col4); ++ tmp4 = _mm_sub_epi16(col3, col4); ++ ++ DO_FDCT(); ++ ++ /* Pass 2: process columns */ ++ ++ TRANSPOSE(out, row); ++ ++ tmp0 = _mm_add_epi16(row0, row7); ++ tmp7 = _mm_sub_epi16(row0, row7); ++ tmp1 = _mm_add_epi16(row1, row6); ++ tmp6 = _mm_sub_epi16(row1, row6); ++ tmp2 = _mm_add_epi16(row2, row5); ++ tmp5 = _mm_sub_epi16(row2, row5); ++ tmp3 = _mm_add_epi16(row3, row4); ++ tmp4 = _mm_sub_epi16(row3, row4); ++ ++ DO_FDCT(); ++ ++ VEC_ST(data + 0 * 8, out0); ++ VEC_ST(data + 1 * 8, out1); ++ VEC_ST(data + 2 * 8, out2); ++ VEC_ST(data + 3 * 8, out3); ++ VEC_ST(data + 4 * 8, out4); ++ VEC_ST(data + 5 * 8, out5); ++ VEC_ST(data + 6 * 8, out6); ++ VEC_ST(data + 7 * 8, out7); ++} +diff --git a/simd/e2k/jfdctint-e2k.c b/simd/e2k/jfdctint-e2k.c +new file mode 100644 +index 0000000..2200852 +--- /dev/null ++++ b/simd/e2k/jfdctint-e2k.c +@@ -0,0 +1,255 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2014, 2020, D. R. Commander. All Rights Reserved. ++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. ++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. 
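
The fixed-point scheme above works because _mm_mulhi_epi16 keeps only the high 16 bits of each signed 16x16-bit product: operands are pre-shifted left by PRE_MULTIPLY_SCALE_BITS (2) and constants by CONST_SHIFT (6), so discarding the low half performs the >> CONST_BITS (8) descale. A minimal scalar check of the identity, using F_0_541 = 139 = round(0.541196100 * 256):

    #include <stdio.h>

    int main(void)
    {
      int x = 1000, F = 139;                  /* any 16-bit DCT term, F_0_541 */
      int lhs = ((x << 2) * (F << 6)) >> 16;  /* what one mulhi lane computes */
      int rhs = (x * F) >> 8;                 /* plain Q8 fixed-point product */
      printf("%d %d\n", lhs, rhs);            /* both print 542 */
      return 0;
    }
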
++ */ ++ ++/* ACCURATE INTEGER FORWARD DCT */ ++ ++#include "jsimd_e2k.h" ++ ++ ++#define F_0_298 2446 /* FIX(0.298631336) */ ++#define F_0_390 3196 /* FIX(0.390180644) */ ++#define F_0_541 4433 /* FIX(0.541196100) */ ++#define F_0_765 6270 /* FIX(0.765366865) */ ++#define F_0_899 7373 /* FIX(0.899976223) */ ++#define F_1_175 9633 /* FIX(1.175875602) */ ++#define F_1_501 12299 /* FIX(1.501321110) */ ++#define F_1_847 15137 /* FIX(1.847759065) */ ++#define F_1_961 16069 /* FIX(1.961570560) */ ++#define F_2_053 16819 /* FIX(2.053119869) */ ++#define F_2_562 20995 /* FIX(2.562915447) */ ++#define F_3_072 25172 /* FIX(3.072711026) */ ++ ++#define CONST_BITS 13 ++#define PASS1_BITS 2 ++#define DESCALE_P1 (CONST_BITS - PASS1_BITS) ++#define DESCALE_P2 (CONST_BITS + PASS1_BITS) ++ ++ ++#define DO_FDCT_COMMON(PASS) { \ ++ /* (Original) \ ++ * z1 = (tmp12 + tmp13) * 0.541196100; \ ++ * data2 = z1 + tmp13 * 0.765366865; \ ++ * data6 = z1 + tmp12 * -1.847759065; \ ++ * \ ++ * (This implementation) \ ++ * data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; \ ++ * data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); \ ++ */ \ ++ \ ++ tmp1312l = _mm_unpacklo_epi16(tmp13, tmp12); \ ++ tmp1312h = _mm_unpackhi_epi16(tmp13, tmp12); \ ++ \ ++ out2l = _mm_add_epi32(_mm_madd_epi16(tmp1312l, pw_f130_f054), pd_descale_p##PASS); \ ++ out2h = _mm_add_epi32(_mm_madd_epi16(tmp1312h, pw_f130_f054), pd_descale_p##PASS); \ ++ out6l = _mm_add_epi32(_mm_madd_epi16(tmp1312l, pw_f054_mf130), pd_descale_p##PASS); \ ++ out6h = _mm_add_epi32(_mm_madd_epi16(tmp1312h, pw_f054_mf130), pd_descale_p##PASS); \ ++ \ ++ out2l = _mm_srai_epi32(out2l, DESCALE_P##PASS); \ ++ out2h = _mm_srai_epi32(out2h, DESCALE_P##PASS); \ ++ out6l = _mm_srai_epi32(out6l, DESCALE_P##PASS); \ ++ out6h = _mm_srai_epi32(out6h, DESCALE_P##PASS); \ ++ \ ++ out2 = _mm_packs_epi32(out2l, out2h); \ ++ out6 = _mm_packs_epi32(out6l, out6h); \ ++ \ ++ /* Odd part */ \ ++ \ ++ z3 = _mm_add_epi16(tmp4, tmp6); \ ++ z4 = _mm_add_epi16(tmp5, tmp7); \ ++ \ ++ /* (Original) \ ++ * z5 = (z3 + z4) * 1.175875602; \ ++ * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; \ ++ * z3 += z5; z4 += z5; \ ++ * \ ++ * (This implementation) \ ++ * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \ ++ * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \ ++ */ \ ++ \ ++ z34l = _mm_unpacklo_epi16(z3, z4); \ ++ z34h = _mm_unpackhi_epi16(z3, z4); \ ++ \ ++ z3l = _mm_add_epi32(_mm_madd_epi16(z34l, pw_mf078_f117), pd_descale_p##PASS); \ ++ z3h = _mm_add_epi32(_mm_madd_epi16(z34h, pw_mf078_f117), pd_descale_p##PASS); \ ++ z4l = _mm_add_epi32(_mm_madd_epi16(z34l, pw_f117_f078), pd_descale_p##PASS); \ ++ z4h = _mm_add_epi32(_mm_madd_epi16(z34h, pw_f117_f078), pd_descale_p##PASS); \ ++ \ ++ /* (Original) \ ++ * z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; \ ++ * tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; \ ++ * tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; \ ++ * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; \ ++ * data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; \ ++ * data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; \ ++ * \ ++ * (This implementation) \ ++ * tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; \ ++ * tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; \ ++ * tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); \ ++ * tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); \ ++ * data7 = tmp4 + z3; data5 = tmp5 + z4; \ ++ * data3 = tmp6 + z3; data1 = tmp7 + z4; \ ++ */ \ ++ \ 
++ tmp47l = _mm_unpacklo_epi16(tmp4, tmp7); \ ++ tmp47h = _mm_unpackhi_epi16(tmp4, tmp7); \ ++ \ ++ out7l = _mm_add_epi32(_mm_madd_epi16(tmp47l, pw_mf060_mf089), z3l); \ ++ out7h = _mm_add_epi32(_mm_madd_epi16(tmp47h, pw_mf060_mf089), z3h); \ ++ out1l = _mm_add_epi32(_mm_madd_epi16(tmp47l, pw_mf089_f060), z4l); \ ++ out1h = _mm_add_epi32(_mm_madd_epi16(tmp47h, pw_mf089_f060), z4h); \ ++ \ ++ out7l = _mm_srai_epi32(out7l, DESCALE_P##PASS); \ ++ out7h = _mm_srai_epi32(out7h, DESCALE_P##PASS); \ ++ out1l = _mm_srai_epi32(out1l, DESCALE_P##PASS); \ ++ out1h = _mm_srai_epi32(out1h, DESCALE_P##PASS); \ ++ \ ++ out7 = _mm_packs_epi32(out7l, out7h); \ ++ out1 = _mm_packs_epi32(out1l, out1h); \ ++ \ ++ tmp56l = _mm_unpacklo_epi16(tmp5, tmp6); \ ++ tmp56h = _mm_unpackhi_epi16(tmp5, tmp6); \ ++ \ ++ out5l = _mm_add_epi32(_mm_madd_epi16(tmp56l, pw_mf050_mf256), z4l); \ ++ out5h = _mm_add_epi32(_mm_madd_epi16(tmp56h, pw_mf050_mf256), z4h); \ ++ out3l = _mm_add_epi32(_mm_madd_epi16(tmp56l, pw_mf256_f050), z3l); \ ++ out3h = _mm_add_epi32(_mm_madd_epi16(tmp56h, pw_mf256_f050), z3h); \ ++ \ ++ out5l = _mm_srai_epi32(out5l, DESCALE_P##PASS); \ ++ out5h = _mm_srai_epi32(out5h, DESCALE_P##PASS); \ ++ out3l = _mm_srai_epi32(out3l, DESCALE_P##PASS); \ ++ out3h = _mm_srai_epi32(out3h, DESCALE_P##PASS); \ ++ \ ++ out5 = _mm_packs_epi32(out5l, out5h); \ ++ out3 = _mm_packs_epi32(out3l, out3h); \ ++} ++ ++#define DO_FDCT_PASS1() { \ ++ /* Even part */ \ ++ \ ++ tmp10 = _mm_add_epi16(tmp0, tmp3); \ ++ tmp13 = _mm_sub_epi16(tmp0, tmp3); \ ++ tmp11 = _mm_add_epi16(tmp1, tmp2); \ ++ tmp12 = _mm_sub_epi16(tmp1, tmp2); \ ++ \ ++ out0 = _mm_add_epi16(tmp10, tmp11); \ ++ out0 = _mm_slli_epi16(out0, PASS1_BITS); \ ++ out4 = _mm_sub_epi16(tmp10, tmp11); \ ++ out4 = _mm_slli_epi16(out4, PASS1_BITS); \ ++ \ ++ DO_FDCT_COMMON(1); \ ++} ++ ++#define DO_FDCT_PASS2() { \ ++ /* Even part */ \ ++ \ ++ tmp10 = _mm_add_epi16(tmp0, tmp3); \ ++ tmp13 = _mm_sub_epi16(tmp0, tmp3); \ ++ tmp11 = _mm_add_epi16(tmp1, tmp2); \ ++ tmp12 = _mm_sub_epi16(tmp1, tmp2); \ ++ \ ++ out0 = _mm_add_epi16(tmp10, tmp11); \ ++ out0 = _mm_add_epi16(out0, pw_descale_p2x); \ ++ out0 = _mm_srai_epi16(out0, PASS1_BITS); \ ++ out4 = _mm_sub_epi16(tmp10, tmp11); \ ++ out4 = _mm_add_epi16(out4, pw_descale_p2x); \ ++ out4 = _mm_srai_epi16(out4, PASS1_BITS); \ ++ \ ++ DO_FDCT_COMMON(2); \ ++} ++ ++ ++void jsimd_fdct_islow_e2k(DCTELEM *data) ++{ ++ __m128i row0, row1, row2, row3, row4, row5, row6, row7, ++ col0, col1, col2, col3, col4, col5, col6, col7, ++ tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13, ++ tmp47l, tmp47h, tmp56l, tmp56h, tmp1312l, tmp1312h, ++ z3, z4, z34l, z34h, ++ out0, out1, out2, out3, out4, out5, out6, out7; ++ __m128i z3l, z3h, z4l, z4h, ++ out1l, out1h, out2l, out2h, out3l, out3h, out5l, out5h, out6l, out6h, ++ out7l, out7h; ++ ++ /* Constants */ ++ __m128i pw_f130_f054 = _mm_setr_epi16(__4X2(F_0_541 + F_0_765, F_0_541)), ++ pw_f054_mf130 = _mm_setr_epi16(__4X2(F_0_541, F_0_541 - F_1_847)), ++ pw_mf078_f117 = _mm_setr_epi16(__4X2(F_1_175 - F_1_961, F_1_175)), ++ pw_f117_f078 = _mm_setr_epi16(__4X2(F_1_175, F_1_175 - F_0_390)), ++ pw_mf060_mf089 = _mm_setr_epi16(__4X2(F_0_298 - F_0_899, -F_0_899)), ++ pw_mf089_f060 = _mm_setr_epi16(__4X2(-F_0_899, F_1_501 - F_0_899)), ++ pw_mf050_mf256 = _mm_setr_epi16(__4X2(F_2_053 - F_2_562, -F_2_562)), ++ pw_mf256_f050 = _mm_setr_epi16(__4X2(-F_2_562, F_3_072 - F_2_562)), ++ pw_descale_p2x = _mm_set1_epi16(1 << (PASS1_BITS - 1)), ++ pd_descale_p1 = _mm_set1_epi32(1 << (DESCALE_P1 
- 1)), ++ pd_descale_p2 = _mm_set1_epi32(1 << (DESCALE_P2 - 1)); ++ ++ /* Pass 1: process rows */ ++ ++ row0 = VEC_LD(data + 0 * 8); ++ row1 = VEC_LD(data + 1 * 8); ++ row2 = VEC_LD(data + 2 * 8); ++ row3 = VEC_LD(data + 3 * 8); ++ row4 = VEC_LD(data + 4 * 8); ++ row5 = VEC_LD(data + 5 * 8); ++ row6 = VEC_LD(data + 6 * 8); ++ row7 = VEC_LD(data + 7 * 8); ++ ++ TRANSPOSE(row, col); ++ ++ tmp0 = _mm_add_epi16(col0, col7); ++ tmp7 = _mm_sub_epi16(col0, col7); ++ tmp1 = _mm_add_epi16(col1, col6); ++ tmp6 = _mm_sub_epi16(col1, col6); ++ tmp2 = _mm_add_epi16(col2, col5); ++ tmp5 = _mm_sub_epi16(col2, col5); ++ tmp3 = _mm_add_epi16(col3, col4); ++ tmp4 = _mm_sub_epi16(col3, col4); ++ ++ DO_FDCT_PASS1(); ++ ++ /* Pass 2: process columns */ ++ ++ TRANSPOSE(out, row); ++ ++ tmp0 = _mm_add_epi16(row0, row7); ++ tmp7 = _mm_sub_epi16(row0, row7); ++ tmp1 = _mm_add_epi16(row1, row6); ++ tmp6 = _mm_sub_epi16(row1, row6); ++ tmp2 = _mm_add_epi16(row2, row5); ++ tmp5 = _mm_sub_epi16(row2, row5); ++ tmp3 = _mm_add_epi16(row3, row4); ++ tmp4 = _mm_sub_epi16(row3, row4); ++ ++ DO_FDCT_PASS2(); ++ ++ VEC_ST(data + 0 * 8, out0); ++ VEC_ST(data + 1 * 8, out1); ++ VEC_ST(data + 2 * 8, out2); ++ VEC_ST(data + 3 * 8, out3); ++ VEC_ST(data + 4 * 8, out4); ++ VEC_ST(data + 5 * 8, out5); ++ VEC_ST(data + 6 * 8, out6); ++ VEC_ST(data + 7 * 8, out7); ++} +diff --git a/simd/e2k/jidctflt-e2k.c b/simd/e2k/jidctflt-e2k.c +new file mode 100644 +index 0000000..7682965 +--- /dev/null ++++ b/simd/e2k/jidctflt-e2k.c +@@ -0,0 +1,215 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved. ++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. ++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. 
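
In DO_FDCT_COMMON above, _mm_madd_epi16 multiplies interleaved (tmp13, tmp12) pairs by constant pairs and sums each pair of products into 32 bits, which is why the classic z1-based rotation can be refactored into two independent dot products. A scalar check that the factored form matches the original, with the Q13 constants defined above (4433 = F_0_541, 6270 = F_0_765):

    #include <stdio.h>

    int main(void)
    {
      long tmp12 = 300, tmp13 = -200;                          /* sample values */
      long z1 = (tmp12 + tmp13) * 4433;                        /* original form */
      long data2_orig = z1 + tmp13 * 6270;
      long data2_madd = tmp13 * (4433 + 6270) + tmp12 * 4433;  /* one madd lane */
      printf("%ld %ld\n", data2_orig, data2_madd);             /* both -810700 */
      return 0;
    }
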
++ */ ++ ++/* FLOAT INVERSE DCT */ ++ ++#include "jsimd_e2k.h" ++ ++#define DO_IDCT(in, out) { \ ++ /* Even part */ \ ++ \ ++ tmp10 = _mm_add_ps(in##0, in##4); \ ++ tmp11 = _mm_sub_ps(in##0, in##4); \ ++ \ ++ tmp13 = _mm_add_ps(in##2, in##6); \ ++ tmp12 = _mm_sub_ps(in##2, in##6); \ ++ tmp12 = _mm_sub_ps(_mm_mul_ps(tmp12, pd_f1414), tmp13); \ ++ \ ++ tmp0 = _mm_add_ps(tmp10, tmp13); \ ++ tmp3 = _mm_sub_ps(tmp10, tmp13); \ ++ tmp1 = _mm_add_ps(tmp11, tmp12); \ ++ tmp2 = _mm_sub_ps(tmp11, tmp12); \ ++ \ ++ /* Odd part */ \ ++ \ ++ z13 = _mm_add_ps(in##5, in##3); \ ++ z10 = _mm_sub_ps(in##5, in##3); \ ++ z11 = _mm_add_ps(in##1, in##7); \ ++ z12 = _mm_sub_ps(in##1, in##7); \ ++ \ ++ tmp7 = _mm_add_ps(z11, z13); \ ++ tmp11 = _mm_sub_ps(z11, z13); \ ++ tmp11 = _mm_mul_ps(tmp11, pd_f1414); \ ++ \ ++ z5 = _mm_mul_ps(_mm_add_ps(z10, z12), pd_f1847); \ ++ tmp10 = _mm_sub_ps(z5, _mm_mul_ps(z12, pd_f1082)); \ ++ tmp12 = _mm_sub_ps(z5, _mm_mul_ps(z10, pd_f2613)); \ ++ \ ++ tmp6 = _mm_sub_ps(tmp12, tmp7); \ ++ tmp5 = _mm_sub_ps(tmp11, tmp6); \ ++ tmp4 = _mm_sub_ps(tmp10, tmp5); \ ++ \ ++ out##0 = _mm_add_ps(tmp0, tmp7); \ ++ out##7 = _mm_sub_ps(tmp0, tmp7); \ ++ out##1 = _mm_add_ps(tmp1, tmp6); \ ++ out##6 = _mm_sub_ps(tmp1, tmp6); \ ++ out##2 = _mm_add_ps(tmp2, tmp5); \ ++ out##5 = _mm_sub_ps(tmp2, tmp5); \ ++ out##3 = _mm_add_ps(tmp3, tmp4); \ ++ out##4 = _mm_sub_ps(tmp3, tmp4); \ ++} ++ ++#define QUANT_MUL(a, b, c, d, l, lo, i) \ ++ out0 = _mm_srai_epi32(_mm_unpack##lo##_epi16(col##a, col##a), 16); \ ++ out1 = _mm_srai_epi32(_mm_unpack##lo##_epi16(col##b, col##b), 16); \ ++ out2 = _mm_srai_epi32(_mm_unpack##lo##_epi16(col##c, col##c), 16); \ ++ out3 = _mm_srai_epi32(_mm_unpack##lo##_epi16(col##d, col##d), 16); \ ++ l##a = _mm_cvtepi32_ps(out0); \ ++ l##b = _mm_cvtepi32_ps(out1); \ ++ l##c = _mm_cvtepi32_ps(out2); \ ++ l##d = _mm_cvtepi32_ps(out3); \ ++ l##a = _mm_mul_ps(l##a, _mm_load_ps(dct_table + a * 8 + i)); \ ++ l##b = _mm_mul_ps(l##b, _mm_load_ps(dct_table + b * 8 + i)); \ ++ l##c = _mm_mul_ps(l##c, _mm_load_ps(dct_table + c * 8 + i)); \ ++ l##d = _mm_mul_ps(l##d, _mm_load_ps(dct_table + d * 8 + i)); ++ ++ ++void jsimd_idct_float_e2k(void *dct_table_, JCOEFPTR coef_block, ++ JSAMPARRAY output_buf, JDIMENSION output_col) ++{ ++ float *dct_table = (float *)dct_table_; ++ ++ __m128i col0, col1, col2, col3, col4, col5, col6, col7, ++ out0, out1, out2, out3, out4, out5, out6, out7, row0, row1, row2, row3; ++ __m128 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, ++ tmp10, tmp11, tmp12, tmp13, z5, z10, z11, z12, z13; ++ __m128 l0, l1, l2, l3, l4, l5, l6, l7; ++ __m128 h0, h1, h2, h3, h4, h5, h6, h7; ++ __m128 x0, x1, x2, x3, x4, x5, x6, x7; ++ __m128 y0, y1, y2, y3, y4, y5, y6, y7; ++ ++ /* Constants */ ++ __m128 pd_f1414 = _mm_set1_ps(1.414213562f), ++ pd_f1847 = _mm_set1_ps(1.847759065f), ++ pd_f1082 = _mm_set1_ps(1.082392200f), ++ pd_f2613 = _mm_set1_ps(2.613125930f); ++ ++ /* Pass 1: process columns */ ++ ++ col0 = VEC_LD(coef_block + 0 * 8); ++ col1 = VEC_LD(coef_block + 1 * 8); ++ col2 = VEC_LD(coef_block + 2 * 8); ++ col3 = VEC_LD(coef_block + 3 * 8); ++ col4 = VEC_LD(coef_block + 4 * 8); ++ col5 = VEC_LD(coef_block + 5 * 8); ++ col6 = VEC_LD(coef_block + 6 * 8); ++ col7 = VEC_LD(coef_block + 7 * 8); ++ ++ out1 = _mm_or_si128(col1, col2); ++ out2 = _mm_or_si128(col3, col4); ++ out1 = _mm_or_si128(out1, out2); ++ out3 = _mm_or_si128(col5, col6); ++ out3 = _mm_or_si128(out3, col7); ++ out1 = _mm_or_si128(out1, out3); ++ ++ if (VEC_ISZERO(out1)) { ++ /* AC terms all zero */ ++ ++ out0 = 
_mm_srai_epi32(_mm_unpacklo_epi16(col0, col0), 16);
++    out1 = _mm_srai_epi32(_mm_unpackhi_epi16(col0, col0), 16);
++    tmp0 = _mm_cvtepi32_ps(out0);
++    tmp1 = _mm_cvtepi32_ps(out1);
++    tmp0 = _mm_mul_ps(tmp0, _mm_load_ps(dct_table));
++    tmp1 = _mm_mul_ps(tmp1, _mm_load_ps(dct_table + 4));
++
++    l0 = h0 = _mm_shuffle_ps(tmp0, tmp0, 0x00);
++    l1 = h1 = _mm_shuffle_ps(tmp0, tmp0, 0x55);
++    l2 = h2 = _mm_shuffle_ps(tmp0, tmp0, 0xaa);
++    l3 = h3 = _mm_shuffle_ps(tmp0, tmp0, 0xff);
++    l4 = h4 = _mm_shuffle_ps(tmp1, tmp1, 0x00);
++    l5 = h5 = _mm_shuffle_ps(tmp1, tmp1, 0x55);
++    l6 = h6 = _mm_shuffle_ps(tmp1, tmp1, 0xaa);
++    l7 = h7 = _mm_shuffle_ps(tmp1, tmp1, 0xff);
++
++  } else {
++
++    QUANT_MUL(0, 2, 4, 6, l, lo, 0)
++    QUANT_MUL(1, 3, 5, 7, l, lo, 0)
++    DO_IDCT(l, x);
++
++    QUANT_MUL(0, 2, 4, 6, h, hi, 4)
++    QUANT_MUL(1, 3, 5, 7, h, hi, 4)
++    DO_IDCT(h, y);
++
++    TRANSPOSE_FLOAT(x0, x1, x2, x3, l0, l1, l2, l3)
++    TRANSPOSE_FLOAT(x4, x5, x6, x7, h0, h1, h2, h3)
++    TRANSPOSE_FLOAT(y0, y1, y2, y3, l4, l5, l6, l7)
++    TRANSPOSE_FLOAT(y4, y5, y6, y7, h4, h5, h6, h7)
++  }
++
++  /* Pass 2: process rows */
++
++  DO_IDCT(l, x);
++  DO_IDCT(h, y);
++
++#ifdef JSIMD_SAME_ROUNDING
++#define OUT_ROUND(i) \
++  tmp0 = _mm_add_ps(_mm_mul_ps(x##i, pd_f0125), pd_cj_rnd); \
++  tmp1 = _mm_add_ps(_mm_mul_ps(y##i, pd_f0125), pd_cj_rnd); \
++  out##i = _mm_packs_epi32(_mm_cvttps_epi32(tmp0), _mm_cvttps_epi32(tmp1));
++
++  {
++    __m128 pd_cj_rnd = _mm_set1_ps(0.5f + CENTERJSAMPLE),
++      pd_f0125 = _mm_set1_ps(0.125f);
++
++    OUT_ROUND(0) OUT_ROUND(1)
++    OUT_ROUND(2) OUT_ROUND(3)
++    OUT_ROUND(4) OUT_ROUND(5)
++    OUT_ROUND(6) OUT_ROUND(7)
++  }
++  row0 = _mm_packus_epi16(out0, out1);
++  row1 = _mm_packus_epi16(out2, out3);
++  row2 = _mm_packus_epi16(out4, out5);
++  row3 = _mm_packus_epi16(out6, out7);
++
++  TRANSPOSE8(row, col) TRANSPOSE8(col, row) TRANSPOSE8(row, col)
++#else /* faster, slightly different rounding */
++#define OUT_ROUND(i, a, b) out##i = _mm_blendv_epi8( \
++  _mm_slli_epi32(_mm_castps_si128(_mm_add_ps(b, pd_round)), 16), \
++  _mm_castps_si128(_mm_add_ps(a, pd_round)), pd_mask);
++
++  {
++    __m128i pd_mask = _mm_set1_epi32(0xffff);
++    __m128 pd_round = _mm_set1_ps((3 << 22 | CENTERJSAMPLE) * 8);
++
++    OUT_ROUND(0, x0, x4) OUT_ROUND(1, y0, y4)
++    OUT_ROUND(2, x1, x5) OUT_ROUND(3, y1, y5)
++    OUT_ROUND(4, x2, x6) OUT_ROUND(5, y2, y6)
++    OUT_ROUND(6, x3, x7) OUT_ROUND(7, y3, y7)
++  }
++  row0 = _mm_packus_epi16(out0, out1);
++  row1 = _mm_packus_epi16(out2, out3);
++  row2 = _mm_packus_epi16(out4, out5);
++  row3 = _mm_packus_epi16(out6, out7);
++
++  TRANSPOSE8(row, out) TRANSPOSE8(out, col)
++#endif
++  VEC_STL(output_buf[0] + output_col, col0);
++  VEC_STH(output_buf[1] + output_col, col0);
++  VEC_STL(output_buf[2] + output_col, col1);
++  VEC_STH(output_buf[3] + output_col, col1);
++  VEC_STL(output_buf[4] + output_col, col2);
++  VEC_STH(output_buf[5] + output_col, col2);
++  VEC_STL(output_buf[6] + output_col, col3);
++  VEC_STH(output_buf[7] + output_col, col3);
++}
+diff --git a/simd/e2k/jidctfst-e2k.c b/simd/e2k/jidctfst-e2k.c
+new file mode 100644
+index 0000000..18bc425
+--- /dev/null
++++ b/simd/e2k/jidctfst-e2k.c
+@@ -0,0 +1,187 @@
++/*
++ * Elbrus optimizations for libjpeg-turbo
++ *
++ * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved.
++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd
++ *
++ * This software is provided 'as-is', without any express or implied
++ * warranty. In no event will the authors be held liable for any damages
++ * arising from the use of this software.
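
The second OUT_ROUND variant above is the magic-number float-to-int trick: pd_round = (3 << 22 | CENTERJSAMPLE) * 8 equals 1.5 * 2^26 plus the centering bias times 8, so after the addition the float's unit in the last place is 8, the pending divide-by-8 descale happens during rounding, and the low 16 bits of the float's bit pattern hold the centered sample that the pd_mask/blendv step extracts. A standalone model, assuming an 8-bit build (CENTERJSAMPLE = 128) and an in-range input:

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
      float magic = (float)((3 << 22 | 128) * 8);  /* pd_round, CENTERJSAMPLE = 128 */
      float x = 333.0f;                            /* IDCT output, still scaled by 8 */
      float sum = x + magic;                       /* rounds x to a multiple of 8 */
      unsigned bits;
      memcpy(&bits, &sum, sizeof bits);
      printf("%u\n", bits & 0xffff);               /* 333/8 -> 42, +128 -> 170 */
      return 0;
    }
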
++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. ++ */ ++ ++/* FAST INTEGER INVERSE DCT */ ++ ++#include "jsimd_e2k.h" ++ ++ ++#define F_1_082 277 /* FIX(1.082392200) */ ++#define F_1_414 362 /* FIX(1.414213562) */ ++#define F_1_847 473 /* FIX(1.847759065) */ ++#define F_2_613 669 /* FIX(2.613125930) */ ++#define F_1_613 (F_2_613 - 256) /* FIX(2.613125930) - FIX(1) */ ++ ++#define CONST_BITS 8 ++#define PASS1_BITS 2 ++#define PRE_MULTIPLY_SCALE_BITS 2 ++#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) ++ ++ ++#define DO_IDCT(in) { \ ++ /* Even part */ \ ++ \ ++ tmp10 = _mm_add_epi16(in##0, in##4); \ ++ tmp11 = _mm_sub_epi16(in##0, in##4); \ ++ tmp13 = _mm_add_epi16(in##2, in##6); \ ++ \ ++ tmp12 = _mm_sub_epi16(in##2, in##6); \ ++ tmp12 = _mm_slli_epi16(tmp12, PRE_MULTIPLY_SCALE_BITS); \ ++ tmp12 = _mm_mulhi_epi16(tmp12, pw_F1414); \ ++ tmp12 = _mm_sub_epi16(tmp12, tmp13); \ ++ \ ++ tmp0 = _mm_add_epi16(tmp10, tmp13); \ ++ tmp3 = _mm_sub_epi16(tmp10, tmp13); \ ++ tmp1 = _mm_add_epi16(tmp11, tmp12); \ ++ tmp2 = _mm_sub_epi16(tmp11, tmp12); \ ++ \ ++ /* Odd part */ \ ++ \ ++ z13 = _mm_add_epi16(in##5, in##3); \ ++ z10 = _mm_sub_epi16(in##5, in##3); \ ++ z10s = _mm_slli_epi16(z10, PRE_MULTIPLY_SCALE_BITS); \ ++ z11 = _mm_add_epi16(in##1, in##7); \ ++ z12s = _mm_sub_epi16(in##1, in##7); \ ++ z12s = _mm_slli_epi16(z12s, PRE_MULTIPLY_SCALE_BITS); \ ++ \ ++ tmp11 = _mm_sub_epi16(z11, z13); \ ++ tmp11 = _mm_slli_epi16(tmp11, PRE_MULTIPLY_SCALE_BITS); \ ++ tmp11 = _mm_mulhi_epi16(tmp11, pw_F1414); \ ++ \ ++ tmp7 = _mm_add_epi16(z11, z13); \ ++ \ ++ /* To avoid overflow... 
\ ++ * \ ++ * (Original) \ ++ * tmp12 = -2.613125930 * z10 + z5; \ ++ * \ ++ * (This implementation) \ ++ * tmp12 = (-1.613125930 - 1) * z10 + z5; \ ++ * = -1.613125930 * z10 - z10 + z5; \ ++ */ \ ++ \ ++ z5 = _mm_add_epi16(z10s, z12s); \ ++ z5 = _mm_mulhi_epi16(z5, pw_F1847); \ ++ \ ++ tmp10 = _mm_mulhi_epi16(z12s, pw_F1082); \ ++ tmp10 = _mm_sub_epi16(tmp10, z5); \ ++ tmp12 = _mm_add_epi16(_mm_mulhi_epi16(z10s, pw_MF1613), z5); \ ++ tmp12 = _mm_sub_epi16(tmp12, z10); \ ++ \ ++ tmp6 = _mm_sub_epi16(tmp12, tmp7); \ ++ tmp5 = _mm_sub_epi16(tmp11, tmp6); \ ++ tmp4 = _mm_add_epi16(tmp10, tmp5); \ ++ \ ++ out0 = _mm_add_epi16(tmp0, tmp7); \ ++ out1 = _mm_add_epi16(tmp1, tmp6); \ ++ out2 = _mm_add_epi16(tmp2, tmp5); \ ++ out3 = _mm_sub_epi16(tmp3, tmp4); \ ++ out4 = _mm_add_epi16(tmp3, tmp4); \ ++ out5 = _mm_sub_epi16(tmp2, tmp5); \ ++ out6 = _mm_sub_epi16(tmp1, tmp6); \ ++ out7 = _mm_sub_epi16(tmp0, tmp7); \ ++} ++ ++ ++void jsimd_idct_ifast_e2k(void *dct_table_, JCOEFPTR coef_block, ++ JSAMPARRAY output_buf, JDIMENSION output_col) ++{ ++ short *dct_table = (short *)dct_table_; ++ ++ __m128i row0, row1, row2, row3, row4, row5, row6, row7, ++ col0, col1, col2, col3, col4, col5, col6, col7, ++ quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7, ++ tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13, ++ z5, z10, z10s, z11, z12s, z13, ++ out0, out1, out2, out3, out4, out5, out6, out7; ++ ++ /* Constants */ ++ __m128i pw_F1414 = _mm_set1_epi16(F_1_414 << CONST_SHIFT), ++ pw_F1847 = _mm_set1_epi16(F_1_847 << CONST_SHIFT), ++ pw_MF1613 = _mm_set1_epi16(-F_1_613 << CONST_SHIFT), ++ pw_F1082 = _mm_set1_epi16(F_1_082 << CONST_SHIFT); ++ ++ /* Pass 1: process columns */ ++ ++ col0 = VEC_LD(coef_block + 0 * 8); ++ col1 = VEC_LD(coef_block + 1 * 8); ++ col2 = VEC_LD(coef_block + 2 * 8); ++ col3 = VEC_LD(coef_block + 3 * 8); ++ col4 = VEC_LD(coef_block + 4 * 8); ++ col5 = VEC_LD(coef_block + 5 * 8); ++ col6 = VEC_LD(coef_block + 6 * 8); ++ col7 = VEC_LD(coef_block + 7 * 8); ++ ++ tmp1 = _mm_or_si128(col1, col2); ++ tmp2 = _mm_or_si128(col3, col4); ++ tmp1 = _mm_or_si128(tmp1, tmp2); ++ tmp3 = _mm_or_si128(col5, col6); ++ tmp3 = _mm_or_si128(tmp3, col7); ++ tmp1 = _mm_or_si128(tmp1, tmp3); ++ ++ quant0 = VEC_LD(dct_table); ++ col0 = _mm_mullo_epi16(col0, quant0); ++ ++ if (VEC_ISZERO(tmp1)) { ++ /* AC terms all zero */ ++ ++ IDCT_SPLAT8(col0); ++ ++ } else { ++ ++ quant1 = VEC_LD(dct_table + 1 * 8); ++ quant2 = VEC_LD(dct_table + 2 * 8); ++ quant3 = VEC_LD(dct_table + 3 * 8); ++ quant4 = VEC_LD(dct_table + 4 * 8); ++ quant5 = VEC_LD(dct_table + 5 * 8); ++ quant6 = VEC_LD(dct_table + 6 * 8); ++ quant7 = VEC_LD(dct_table + 7 * 8); ++ ++ col1 = _mm_mullo_epi16(col1, quant1); ++ col2 = _mm_mullo_epi16(col2, quant2); ++ col3 = _mm_mullo_epi16(col3, quant3); ++ col4 = _mm_mullo_epi16(col4, quant4); ++ col5 = _mm_mullo_epi16(col5, quant5); ++ col6 = _mm_mullo_epi16(col6, quant6); ++ col7 = _mm_mullo_epi16(col7, quant7); ++ ++ DO_IDCT(col); ++ ++ TRANSPOSE(out, row); ++ } ++ ++ /* Pass 2: process rows */ ++ ++ DO_IDCT(row); ++ ++ out0 = _mm_srai_epi16(out0, PASS1_BITS + 3); ++ out1 = _mm_srai_epi16(out1, PASS1_BITS + 3); ++ out2 = _mm_srai_epi16(out2, PASS1_BITS + 3); ++ out3 = _mm_srai_epi16(out3, PASS1_BITS + 3); ++ out4 = _mm_srai_epi16(out4, PASS1_BITS + 3); ++ out5 = _mm_srai_epi16(out5, PASS1_BITS + 3); ++ out6 = _mm_srai_epi16(out6, PASS1_BITS + 3); ++ out7 = _mm_srai_epi16(out7, PASS1_BITS + 3); ++ ++ IDCT_SAVE(); ++} +diff --git a/simd/e2k/jidctint-e2k.c 
b/simd/e2k/jidctint-e2k.c +new file mode 100644 +index 0000000..7bb79c0 +--- /dev/null ++++ b/simd/e2k/jidctint-e2k.c +@@ -0,0 +1,294 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2014-2015, 2020, D. R. Commander. All Rights Reserved. ++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. ++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. ++ */ ++ ++/* ACCURATE INTEGER INVERSE DCT */ ++ ++#include "jsimd_e2k.h" ++ ++ ++#define F_0_298 2446 /* FIX(0.298631336) */ ++#define F_0_390 3196 /* FIX(0.390180644) */ ++#define F_0_541 4433 /* FIX(0.541196100) */ ++#define F_0_765 6270 /* FIX(0.765366865) */ ++#define F_0_899 7373 /* FIX(0.899976223) */ ++#define F_1_175 9633 /* FIX(1.175875602) */ ++#define F_1_501 12299 /* FIX(1.501321110) */ ++#define F_1_847 15137 /* FIX(1.847759065) */ ++#define F_1_961 16069 /* FIX(1.961570560) */ ++#define F_2_053 16819 /* FIX(2.053119869) */ ++#define F_2_562 20995 /* FIX(2.562915447) */ ++#define F_3_072 25172 /* FIX(3.072711026) */ ++ ++#define CONST_BITS 13 ++#define PASS1_BITS 2 ++#define DESCALE_P1 (CONST_BITS - PASS1_BITS) ++#define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3) ++ ++ ++#define DO_IDCT(in, PASS) { \ ++ /* Even part \ ++ * \ ++ * (Original) \ ++ * z1 = (z2 + z3) * 0.541196100; \ ++ * tmp2 = z1 + z3 * -1.847759065; \ ++ * tmp3 = z1 + z2 * 0.765366865; \ ++ * \ ++ * (This implementation) \ ++ * tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); \ ++ * tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; \ ++ */ \ ++ \ ++ in##26l = _mm_unpacklo_epi16(in##2, in##6); \ ++ in##26h = _mm_unpackhi_epi16(in##2, in##6); \ ++ \ ++ tmp3l = _mm_madd_epi16(in##26l, pw_f130_f054); \ ++ tmp3h = _mm_madd_epi16(in##26h, pw_f130_f054); \ ++ tmp2l = _mm_madd_epi16(in##26l, pw_f054_mf130); \ ++ tmp2h = _mm_madd_epi16(in##26h, pw_f054_mf130); \ ++ \ ++ tmp0 = _mm_add_epi16(in##0, in##4); \ ++ tmp1 = _mm_sub_epi16(in##0, in##4); \ ++ \ ++ tmp0l = _mm_unpacklo_epi16(pw_zero, tmp0); \ ++ tmp0h = _mm_unpackhi_epi16(pw_zero, tmp0); \ ++ tmp0l = _mm_srai_epi32(tmp0l, 16 - CONST_BITS); \ ++ tmp0h = _mm_srai_epi32(tmp0h, 16 - CONST_BITS); \ ++ tmp0l = _mm_add_epi32(tmp0l, pd_descale_p##PASS); \ ++ tmp0h = _mm_add_epi32(tmp0h, pd_descale_p##PASS); \ ++ \ ++ tmp10l = _mm_add_epi32(tmp0l, tmp3l); \ ++ tmp10h = _mm_add_epi32(tmp0h, tmp3h); \ ++ tmp13l = _mm_sub_epi32(tmp0l, tmp3l); \ ++ tmp13h = _mm_sub_epi32(tmp0h, tmp3h); \ ++ \ ++ tmp1l = _mm_unpacklo_epi16(pw_zero, tmp1); \ ++ tmp1h = _mm_unpackhi_epi16(pw_zero, tmp1); \ ++ tmp1l = _mm_srai_epi32(tmp1l, 16 - CONST_BITS); \ ++ tmp1h = _mm_srai_epi32(tmp1h, 16 - CONST_BITS); \ ++ tmp1l = _mm_add_epi32(tmp1l, pd_descale_p##PASS); \ ++ tmp1h = 
_mm_add_epi32(tmp1h, pd_descale_p##PASS); \ ++ \ ++ tmp11l = _mm_add_epi32(tmp1l, tmp2l); \ ++ tmp11h = _mm_add_epi32(tmp1h, tmp2h); \ ++ tmp12l = _mm_sub_epi32(tmp1l, tmp2l); \ ++ tmp12h = _mm_sub_epi32(tmp1h, tmp2h); \ ++ \ ++ /* Odd part */ \ ++ \ ++ z3 = _mm_add_epi16(in##3, in##7); \ ++ z4 = _mm_add_epi16(in##1, in##5); \ ++ \ ++ /* (Original) \ ++ * z5 = (z3 + z4) * 1.175875602; \ ++ * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; \ ++ * z3 += z5; z4 += z5; \ ++ * \ ++ * (This implementation) \ ++ * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \ ++ * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \ ++ */ \ ++ \ ++ z34l = _mm_unpacklo_epi16(z3, z4); \ ++ z34h = _mm_unpackhi_epi16(z3, z4); \ ++ \ ++ z3l = _mm_madd_epi16(z34l, pw_mf078_f117); \ ++ z3h = _mm_madd_epi16(z34h, pw_mf078_f117); \ ++ z4l = _mm_madd_epi16(z34l, pw_f117_f078); \ ++ z4h = _mm_madd_epi16(z34h, pw_f117_f078); \ ++ \ ++ /* (Original) \ ++ * z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; \ ++ * tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; \ ++ * tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; \ ++ * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; \ ++ * tmp0 += z1 + z3; tmp1 += z2 + z4; \ ++ * tmp2 += z2 + z3; tmp3 += z1 + z4; \ ++ * \ ++ * (This implementation) \ ++ * tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; \ ++ * tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; \ ++ * tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); \ ++ * tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); \ ++ * tmp0 += z3; tmp1 += z4; \ ++ * tmp2 += z3; tmp3 += z4; \ ++ */ \ ++ \ ++ in##71l = _mm_unpacklo_epi16(in##7, in##1); \ ++ in##71h = _mm_unpackhi_epi16(in##7, in##1); \ ++ \ ++ tmp0l = _mm_add_epi32(_mm_madd_epi16(in##71l, pw_mf060_mf089), z3l); \ ++ tmp0h = _mm_add_epi32(_mm_madd_epi16(in##71h, pw_mf060_mf089), z3h); \ ++ tmp3l = _mm_add_epi32(_mm_madd_epi16(in##71l, pw_mf089_f060), z4l); \ ++ tmp3h = _mm_add_epi32(_mm_madd_epi16(in##71h, pw_mf089_f060), z4h); \ ++ \ ++ in##53l = _mm_unpacklo_epi16(in##5, in##3); \ ++ in##53h = _mm_unpackhi_epi16(in##5, in##3); \ ++ \ ++ tmp1l = _mm_add_epi32(_mm_madd_epi16(in##53l, pw_mf050_mf256), z4l); \ ++ tmp1h = _mm_add_epi32(_mm_madd_epi16(in##53h, pw_mf050_mf256), z4h); \ ++ tmp2l = _mm_add_epi32(_mm_madd_epi16(in##53l, pw_mf256_f050), z3l); \ ++ tmp2h = _mm_add_epi32(_mm_madd_epi16(in##53h, pw_mf256_f050), z3h); \ ++ \ ++ /* Final output stage */ \ ++ \ ++ out0l = _mm_add_epi32(tmp10l, tmp3l); \ ++ out0h = _mm_add_epi32(tmp10h, tmp3h); \ ++ out7l = _mm_sub_epi32(tmp10l, tmp3l); \ ++ out7h = _mm_sub_epi32(tmp10h, tmp3h); \ ++ \ ++ out0l = _mm_srai_epi32(out0l, DESCALE_P##PASS); \ ++ out0h = _mm_srai_epi32(out0h, DESCALE_P##PASS); \ ++ out7l = _mm_srai_epi32(out7l, DESCALE_P##PASS); \ ++ out7h = _mm_srai_epi32(out7h, DESCALE_P##PASS); \ ++ \ ++ out0 = _mm_packs_epi32(out0l, out0h); \ ++ out7 = _mm_packs_epi32(out7l, out7h); \ ++ \ ++ out1l = _mm_add_epi32(tmp11l, tmp2l); \ ++ out1h = _mm_add_epi32(tmp11h, tmp2h); \ ++ out6l = _mm_sub_epi32(tmp11l, tmp2l); \ ++ out6h = _mm_sub_epi32(tmp11h, tmp2h); \ ++ \ ++ out1l = _mm_srai_epi32(out1l, DESCALE_P##PASS); \ ++ out1h = _mm_srai_epi32(out1h, DESCALE_P##PASS); \ ++ out6l = _mm_srai_epi32(out6l, DESCALE_P##PASS); \ ++ out6h = _mm_srai_epi32(out6h, DESCALE_P##PASS); \ ++ \ ++ out1 = _mm_packs_epi32(out1l, out1h); \ ++ out6 = _mm_packs_epi32(out6l, out6h); \ ++ \ ++ out2l = _mm_add_epi32(tmp12l, tmp1l); \ ++ out2h = _mm_add_epi32(tmp12h, tmp1h); \ ++ out5l 
= _mm_sub_epi32(tmp12l, tmp1l); \ ++ out5h = _mm_sub_epi32(tmp12h, tmp1h); \ ++ \ ++ out2l = _mm_srai_epi32(out2l, DESCALE_P##PASS); \ ++ out2h = _mm_srai_epi32(out2h, DESCALE_P##PASS); \ ++ out5l = _mm_srai_epi32(out5l, DESCALE_P##PASS); \ ++ out5h = _mm_srai_epi32(out5h, DESCALE_P##PASS); \ ++ \ ++ out2 = _mm_packs_epi32(out2l, out2h); \ ++ out5 = _mm_packs_epi32(out5l, out5h); \ ++ \ ++ out3l = _mm_add_epi32(tmp13l, tmp0l); \ ++ out3h = _mm_add_epi32(tmp13h, tmp0h); \ ++ out4l = _mm_sub_epi32(tmp13l, tmp0l); \ ++ out4h = _mm_sub_epi32(tmp13h, tmp0h); \ ++ \ ++ out3l = _mm_srai_epi32(out3l, DESCALE_P##PASS); \ ++ out3h = _mm_srai_epi32(out3h, DESCALE_P##PASS); \ ++ out4l = _mm_srai_epi32(out4l, DESCALE_P##PASS); \ ++ out4h = _mm_srai_epi32(out4h, DESCALE_P##PASS); \ ++ \ ++ out3 = _mm_packs_epi32(out3l, out3h); \ ++ out4 = _mm_packs_epi32(out4l, out4h); \ ++} ++ ++ ++void jsimd_idct_islow_e2k(void *dct_table_, JCOEFPTR coef_block, ++ JSAMPARRAY output_buf, JDIMENSION output_col) ++{ ++ short *dct_table = (short *)dct_table_; ++ ++ __m128i row0, row1, row2, row3, row4, row5, row6, row7, ++ col0, col1, col2, col3, col4, col5, col6, col7, ++ quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7, ++ tmp0, tmp1, tmp2, tmp3, z3, z4, ++ z34l, z34h, col71l, col71h, col26l, col26h, col53l, col53h, ++ row71l, row71h, row26l, row26h, row53l, row53h, ++ out0, out1, out2, out3, out4, out5, out6, out7; ++ __m128i tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h, ++ tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h, ++ z3l, z3h, z4l, z4h, ++ out0l, out0h, out1l, out1h, out2l, out2h, out3l, out3h, out4l, out4h, ++ out5l, out5h, out6l, out6h, out7l, out7h; ++ ++ /* Constants */ ++ __m128i pw_zero = _mm_setzero_si128(), ++ pw_f130_f054 = _mm_setr_epi16(__4X2(F_0_541 + F_0_765, F_0_541)), ++ pw_f054_mf130 = _mm_setr_epi16(__4X2(F_0_541, F_0_541 - F_1_847)), ++ pw_mf078_f117 = _mm_setr_epi16(__4X2(F_1_175 - F_1_961, F_1_175)), ++ pw_f117_f078 = _mm_setr_epi16(__4X2(F_1_175, F_1_175 - F_0_390)), ++ pw_mf060_mf089 = _mm_setr_epi16(__4X2(F_0_298 - F_0_899, -F_0_899)), ++ pw_mf089_f060 = _mm_setr_epi16(__4X2(-F_0_899, F_1_501 - F_0_899)), ++ pw_mf050_mf256 = _mm_setr_epi16(__4X2(F_2_053 - F_2_562, -F_2_562)), ++ pw_mf256_f050 = _mm_setr_epi16(__4X2(-F_2_562, F_3_072 - F_2_562)), ++ pd_descale_p1 = _mm_set1_epi32(1 << (DESCALE_P1 - 1)), ++ pd_descale_p2 = _mm_set1_epi32(1 << (DESCALE_P2 - 1)); ++ ++ /* Pass 1: process columns */ ++ ++ col0 = VEC_LD(coef_block + 0 * 8); ++ col1 = VEC_LD(coef_block + 1 * 8); ++ col2 = VEC_LD(coef_block + 2 * 8); ++ col3 = VEC_LD(coef_block + 3 * 8); ++ col4 = VEC_LD(coef_block + 4 * 8); ++ col5 = VEC_LD(coef_block + 5 * 8); ++ col6 = VEC_LD(coef_block + 6 * 8); ++ col7 = VEC_LD(coef_block + 7 * 8); ++ ++ tmp1 = _mm_or_si128(col1, col2); ++ tmp2 = _mm_or_si128(col3, col4); ++ tmp1 = _mm_or_si128(tmp1, tmp2); ++ tmp3 = _mm_or_si128(col5, col6); ++ tmp3 = _mm_or_si128(tmp3, col7); ++ tmp1 = _mm_or_si128(tmp1, tmp3); ++ ++ quant0 = VEC_LD(dct_table); ++ col0 = _mm_mullo_epi16(col0, quant0); ++ ++ if (VEC_ISZERO(tmp1)) { ++ /* AC terms all zero */ ++ ++ col0 = _mm_slli_epi16(col0, PASS1_BITS); ++ IDCT_SPLAT8(col0); ++ ++ } else { ++ ++ quant1 = VEC_LD(dct_table + 1 * 8); ++ quant2 = VEC_LD(dct_table + 2 * 8); ++ quant3 = VEC_LD(dct_table + 3 * 8); ++ quant4 = VEC_LD(dct_table + 4 * 8); ++ quant5 = VEC_LD(dct_table + 5 * 8); ++ quant6 = VEC_LD(dct_table + 6 * 8); ++ quant7 = VEC_LD(dct_table + 7 * 8); ++ ++ col1 = _mm_mullo_epi16(col1, quant1); ++ col2 = 
_mm_mullo_epi16(col2, quant2); ++ col3 = _mm_mullo_epi16(col3, quant3); ++ col4 = _mm_mullo_epi16(col4, quant4); ++ col5 = _mm_mullo_epi16(col5, quant5); ++ col6 = _mm_mullo_epi16(col6, quant6); ++ col7 = _mm_mullo_epi16(col7, quant7); ++ ++ DO_IDCT(col, 1); ++ ++ TRANSPOSE(out, row); ++ } ++ ++ /* Pass 2: process rows */ ++ ++ DO_IDCT(row, 2); ++ ++ IDCT_SAVE(); ++} +diff --git a/simd/e2k/jquantf-e2k.c b/simd/e2k/jquantf-e2k.c +new file mode 100644 +index 0000000..106e99a +--- /dev/null ++++ b/simd/e2k/jquantf-e2k.c +@@ -0,0 +1,121 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved. ++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. ++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. ++ */ ++ ++/* FLOAT QUANTIZATION AND SAMPLE CONVERSION */ ++ ++#include "jsimd_e2k.h" ++ ++#define LOAD_ROW(row) in##row = VEC_LD8(sample_data[row] + start_col) ++#define STORE_ROW(i) \ ++ in0 = _mm_unpacklo_epi16(out##i, pb_zero); \ ++ in1 = _mm_unpackhi_epi16(out##i, pb_zero); \ ++ in0 = _mm_sub_epi32(in0, pd_cj); \ ++ in1 = _mm_sub_epi32(in1, pd_cj); \ ++ _mm_storeu_ps(workspace + i * 8, _mm_cvtepi32_ps(in0)); \ ++ _mm_storeu_ps(workspace + i * 8 + 4, _mm_cvtepi32_ps(in1)); ++ ++void jsimd_convsamp_float_e2k(JSAMPARRAY sample_data, JDIMENSION start_col, ++ FAST_FLOAT *workspace) ++{ ++ __m128i in0, in1, in2, in3, in4, in5, in6, in7; ++ __m128i out0, out1, out2, out3, out4, out5, out6, out7; ++ ++ /* Constants */ ++ __m128i pd_cj = _mm_set1_epi32(CENTERJSAMPLE), ++ pb_zero = _mm_setzero_si128(); ++ ++ LOAD_ROW(0); ++ LOAD_ROW(1); ++ LOAD_ROW(2); ++ LOAD_ROW(3); ++ LOAD_ROW(4); ++ LOAD_ROW(5); ++ LOAD_ROW(6); ++ LOAD_ROW(7); ++ ++ out0 = _mm_unpacklo_epi8(in0, pb_zero); ++ out1 = _mm_unpacklo_epi8(in1, pb_zero); ++ out2 = _mm_unpacklo_epi8(in2, pb_zero); ++ out3 = _mm_unpacklo_epi8(in3, pb_zero); ++ out4 = _mm_unpacklo_epi8(in4, pb_zero); ++ out5 = _mm_unpacklo_epi8(in5, pb_zero); ++ out6 = _mm_unpacklo_epi8(in6, pb_zero); ++ out7 = _mm_unpacklo_epi8(in7, pb_zero); ++ ++ STORE_ROW(0) ++ STORE_ROW(1) ++ STORE_ROW(2) ++ STORE_ROW(3) ++ STORE_ROW(4) ++ STORE_ROW(5) ++ STORE_ROW(6) ++ STORE_ROW(7) ++} ++ ++void jsimd_quantize_float_e2k(JCOEFPTR coef_block, FAST_FLOAT *divisors, ++ FAST_FLOAT *workspace) ++{ ++ int i = 0; ++ __m128 row0, row1, row2, row3, recip0, recip1, recip2, recip3; ++ __m128i out0, out1; ++#ifdef JSIMD_SAME_ROUNDING ++ __m128 pd_f16k5 = _mm_set1_ps(16384.5f); ++ __m128i pw_m16k = _mm_set1_epi16(-16384); ++#endif ++ ++ PRAGMA_E2K("ivdep") ++ for (; i < 4; i++, workspace += 16, divisors += 16, coef_block += 16) { ++ row0 = _mm_loadu_ps(workspace + 0 * 4); ++ row1 = _mm_loadu_ps(workspace + 1 * 4); ++ 
row2 = _mm_loadu_ps(workspace + 2 * 4); ++ row3 = _mm_loadu_ps(workspace + 3 * 4); ++ ++ recip0 = _mm_loadu_ps(divisors + 0 * 4); ++ recip1 = _mm_loadu_ps(divisors + 1 * 4); ++ recip2 = _mm_loadu_ps(divisors + 2 * 4); ++ recip3 = _mm_loadu_ps(divisors + 3 * 4); ++ ++ row0 = _mm_mul_ps(row0, recip0); ++ row1 = _mm_mul_ps(row1, recip1); ++ row2 = _mm_mul_ps(row2, recip2); ++ row3 = _mm_mul_ps(row3, recip3); ++ ++#ifdef JSIMD_SAME_ROUNDING ++ row0 = _mm_add_ps(row0, pd_f16k5); ++ row1 = _mm_add_ps(row1, pd_f16k5); ++ row2 = _mm_add_ps(row2, pd_f16k5); ++ row3 = _mm_add_ps(row3, pd_f16k5); ++ ++ out0 = _mm_packs_epi32(_mm_cvttps_epi32(row0), _mm_cvttps_epi32(row1)); ++ out1 = _mm_packs_epi32(_mm_cvttps_epi32(row2), _mm_cvttps_epi32(row3)); ++ ++ out0 = _mm_add_epi16(out0, pw_m16k); ++ out1 = _mm_add_epi16(out1, pw_m16k); ++#else ++ out0 = _mm_packs_epi32(_mm_cvtps_epi32(row0), _mm_cvtps_epi32(row1)); ++ out1 = _mm_packs_epi32(_mm_cvtps_epi32(row2), _mm_cvtps_epi32(row3)); ++#endif ++ VEC_ST(coef_block, out0); ++ VEC_ST(coef_block + 8, out1); ++ } ++} +diff --git a/simd/e2k/jquanti-e2k.c b/simd/e2k/jquanti-e2k.c +new file mode 100644 +index 0000000..a3e1ff1 +--- /dev/null ++++ b/simd/e2k/jquanti-e2k.c +@@ -0,0 +1,178 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved. ++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. ++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. 
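
Under JSIMD_SAME_ROUNDING above, the float quantizer biases each product by 16384.5 before the truncating _mm_cvttps_epi32 and subtracts 16384 afterwards (pw_m16k), so truncation of the biased value behaves as round-half-up while the intermediate stays positive. A scalar model of that rounding, for illustration:

    #include <stdio.h>

    static short quant_round(float v)
    {
      return (short)((int)(v + 16384.5f) - 16384);
    }

    int main(void)
    {
      printf("%d %d %d\n", quant_round(2.3f),      /*  2 */
             quant_round(2.5f),                    /*  3 */
             quant_round(-2.3f));                  /* -2 */
      return 0;
    }
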
++ */ ++ ++/* INTEGER QUANTIZATION AND SAMPLE CONVERSION */ ++ ++#include "jsimd_e2k.h" ++ ++#define LOAD_ROW(row) in##row = VEC_LD8(sample_data[row] + start_col) ++ ++void jsimd_convsamp_e2k(JSAMPARRAY sample_data, JDIMENSION start_col, ++ DCTELEM *workspace) ++{ ++ __m128i in0, in1, in2, in3, in4, in5, in6, in7; ++ __m128i out0, out1, out2, out3, out4, out5, out6, out7; ++ ++ /* Constants */ ++ __m128i pw_cj = _mm_set1_epi16(CENTERJSAMPLE), ++ pb_zero = _mm_setzero_si128(); ++ ++ LOAD_ROW(0); ++ LOAD_ROW(1); ++ LOAD_ROW(2); ++ LOAD_ROW(3); ++ LOAD_ROW(4); ++ LOAD_ROW(5); ++ LOAD_ROW(6); ++ LOAD_ROW(7); ++ ++ out0 = _mm_unpacklo_epi8(in0, pb_zero); ++ out1 = _mm_unpacklo_epi8(in1, pb_zero); ++ out2 = _mm_unpacklo_epi8(in2, pb_zero); ++ out3 = _mm_unpacklo_epi8(in3, pb_zero); ++ out4 = _mm_unpacklo_epi8(in4, pb_zero); ++ out5 = _mm_unpacklo_epi8(in5, pb_zero); ++ out6 = _mm_unpacklo_epi8(in6, pb_zero); ++ out7 = _mm_unpacklo_epi8(in7, pb_zero); ++ ++ out0 = _mm_sub_epi16(out0, pw_cj); ++ out1 = _mm_sub_epi16(out1, pw_cj); ++ out2 = _mm_sub_epi16(out2, pw_cj); ++ out3 = _mm_sub_epi16(out3, pw_cj); ++ out4 = _mm_sub_epi16(out4, pw_cj); ++ out5 = _mm_sub_epi16(out5, pw_cj); ++ out6 = _mm_sub_epi16(out6, pw_cj); ++ out7 = _mm_sub_epi16(out7, pw_cj); ++ ++ VEC_ST(workspace + 0 * 8, out0); ++ VEC_ST(workspace + 1 * 8, out1); ++ VEC_ST(workspace + 2 * 8, out2); ++ VEC_ST(workspace + 3 * 8, out3); ++ VEC_ST(workspace + 4 * 8, out4); ++ VEC_ST(workspace + 5 * 8, out5); ++ VEC_ST(workspace + 6 * 8, out6); ++ VEC_ST(workspace + 7 * 8, out7); ++} ++ ++ ++#define WORD_BIT 16 ++#define MULTIPLY(vs0, vs1, out) out = _mm_mulhi_epu16(vs0, vs1) ++ ++void jsimd_quantize_e2k(JCOEFPTR coef_block, DCTELEM *divisors, ++ DCTELEM *workspace) ++{ ++ __m128i row0, row1, row2, row3, row4, row5, row6, row7, ++ row0s, row1s, row2s, row3s, row4s, row5s, row6s, row7s, ++ corr0, corr1, corr2, corr3, corr4, corr5, corr6, corr7, ++ recip0, recip1, recip2, recip3, recip4, recip5, recip6, recip7, ++ scale0, scale1, scale2, scale3, scale4, scale5, scale6, scale7; ++ ++ row0s = VEC_LD(workspace + 0 * 8); ++ row1s = VEC_LD(workspace + 1 * 8); ++ row2s = VEC_LD(workspace + 2 * 8); ++ row3s = VEC_LD(workspace + 3 * 8); ++ row4s = VEC_LD(workspace + 4 * 8); ++ row5s = VEC_LD(workspace + 5 * 8); ++ row6s = VEC_LD(workspace + 6 * 8); ++ row7s = VEC_LD(workspace + 7 * 8); ++ row0 = _mm_abs_epi16(row0s); ++ row1 = _mm_abs_epi16(row1s); ++ row2 = _mm_abs_epi16(row2s); ++ row3 = _mm_abs_epi16(row3s); ++ row4 = _mm_abs_epi16(row4s); ++ row5 = _mm_abs_epi16(row5s); ++ row6 = _mm_abs_epi16(row6s); ++ row7 = _mm_abs_epi16(row7s); ++ ++ corr0 = VEC_LD(divisors + DCTSIZE2 + 0 * 8); ++ corr1 = VEC_LD(divisors + DCTSIZE2 + 1 * 8); ++ corr2 = VEC_LD(divisors + DCTSIZE2 + 2 * 8); ++ corr3 = VEC_LD(divisors + DCTSIZE2 + 3 * 8); ++ corr4 = VEC_LD(divisors + DCTSIZE2 + 4 * 8); ++ corr5 = VEC_LD(divisors + DCTSIZE2 + 5 * 8); ++ corr6 = VEC_LD(divisors + DCTSIZE2 + 6 * 8); ++ corr7 = VEC_LD(divisors + DCTSIZE2 + 7 * 8); ++ ++ row0 = _mm_add_epi16(row0, corr0); ++ row1 = _mm_add_epi16(row1, corr1); ++ row2 = _mm_add_epi16(row2, corr2); ++ row3 = _mm_add_epi16(row3, corr3); ++ row4 = _mm_add_epi16(row4, corr4); ++ row5 = _mm_add_epi16(row5, corr5); ++ row6 = _mm_add_epi16(row6, corr6); ++ row7 = _mm_add_epi16(row7, corr7); ++ ++ recip0 = VEC_LD(divisors + 0 * 8); ++ recip1 = VEC_LD(divisors + 1 * 8); ++ recip2 = VEC_LD(divisors + 2 * 8); ++ recip3 = VEC_LD(divisors + 3 * 8); ++ recip4 = VEC_LD(divisors + 4 * 8); ++ recip5 = VEC_LD(divisors + 5 * 8); ++ 
recip6 = VEC_LD(divisors + 6 * 8);
++  recip7 = VEC_LD(divisors + 7 * 8);
++
++  MULTIPLY(row0, recip0, row0);
++  MULTIPLY(row1, recip1, row1);
++  MULTIPLY(row2, recip2, row2);
++  MULTIPLY(row3, recip3, row3);
++  MULTIPLY(row4, recip4, row4);
++  MULTIPLY(row5, recip5, row5);
++  MULTIPLY(row6, recip6, row6);
++  MULTIPLY(row7, recip7, row7);
++
++  scale0 = VEC_LD(divisors + DCTSIZE2 * 2 + 0 * 8);
++  scale1 = VEC_LD(divisors + DCTSIZE2 * 2 + 1 * 8);
++  scale2 = VEC_LD(divisors + DCTSIZE2 * 2 + 2 * 8);
++  scale3 = VEC_LD(divisors + DCTSIZE2 * 2 + 3 * 8);
++  scale4 = VEC_LD(divisors + DCTSIZE2 * 2 + 4 * 8);
++  scale5 = VEC_LD(divisors + DCTSIZE2 * 2 + 5 * 8);
++  scale6 = VEC_LD(divisors + DCTSIZE2 * 2 + 6 * 8);
++  scale7 = VEC_LD(divisors + DCTSIZE2 * 2 + 7 * 8);
++
++  MULTIPLY(row0, scale0, row0);
++  MULTIPLY(row1, scale1, row1);
++  MULTIPLY(row2, scale2, row2);
++  MULTIPLY(row3, scale3, row3);
++  MULTIPLY(row4, scale4, row4);
++  MULTIPLY(row5, scale5, row5);
++  MULTIPLY(row6, scale6, row6);
++  MULTIPLY(row7, scale7, row7);
++
++  row0 = _mm_sign_epi16(row0, row0s);
++  row1 = _mm_sign_epi16(row1, row1s);
++  row2 = _mm_sign_epi16(row2, row2s);
++  row3 = _mm_sign_epi16(row3, row3s);
++  row4 = _mm_sign_epi16(row4, row4s);
++  row5 = _mm_sign_epi16(row5, row5s);
++  row6 = _mm_sign_epi16(row6, row6s);
++  row7 = _mm_sign_epi16(row7, row7s);
++
++  VEC_ST(coef_block + 0 * 8, row0);
++  VEC_ST(coef_block + 1 * 8, row1);
++  VEC_ST(coef_block + 2 * 8, row2);
++  VEC_ST(coef_block + 3 * 8, row3);
++  VEC_ST(coef_block + 4 * 8, row4);
++  VEC_ST(coef_block + 5 * 8, row5);
++  VEC_ST(coef_block + 6 * 8, row6);
++  VEC_ST(coef_block + 7 * 8, row7);
++}
+diff --git a/simd/e2k/jsimd.c b/simd/e2k/jsimd.c
+new file mode 100644
+index 0000000..f8c0465
+--- /dev/null
++++ b/simd/e2k/jsimd.c
+@@ -0,0 +1,761 @@
++/*
++ * jsimd_e2k.c
++ *
++ * Copyright 2009 Pierre Ossman for Cendio AB
++ * Copyright (C) 2009-2011, 2014-2016, 2018, D. R. Commander.
++ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd
++ *
++ * Based on the x86 SIMD extension for IJG JPEG library,
++ * Copyright (C) 1999-2006, MIYASAKA Masaru.
++ * For conditions of distribution and use, see copyright notice in jsimdext.inc
++ *
++ * This file contains the interface between the "normal" portions
++ * of the library and the SIMD implementations when running on the
++ * e2k (Elbrus) architecture.
++ */
++
++#define JPEG_INTERNALS
++#include "../../jinclude.h"
++#include "../../jpeglib.h"
++#include "../../jsimd.h"
++#include "../../jdct.h"
++#include "../../jsimddct.h"
++#include "../jsimd.h"
++#include "jsimd_api_e2k.h"
++
++static unsigned int simd_support = ~0;
++static unsigned int simd_huffman = 1;
++
++/*
++ * Check what SIMD accelerations are supported.
++ *
++ * FIXME: This code is racy under a multi-threaded environment.
++ */ ++LOCAL(void) ++init_simd(void) ++{ ++#ifndef NO_GETENV ++ char *env = NULL; ++#endif ++ ++ if (simd_support != ~0U) ++ return; ++ ++ simd_support = JSIMD_SSE2; ++ ++#ifndef NO_GETENV ++ /* Force different settings through environment variables */ ++ env = getenv("JSIMD_FORCENONE"); ++ if ((env != NULL) && (strcmp(env, "1") == 0)) ++ simd_support = 0; ++ env = getenv("JSIMD_NOHUFFENC"); ++ if ((env != NULL) && (strcmp(env, "1") == 0)) ++ simd_huffman = 0; ++#endif ++} ++ ++static inline int color_space_idx(J_COLOR_SPACE color_space) { ++ switch (color_space) { ++ case JCS_EXT_RGB: ++ return 1 + (EXT_RGB_PIXELSIZE != 3) * 16; ++ case JCS_EXT_RGBX: ++ case JCS_EXT_RGBA: ++ return 2 + (EXT_RGBX_PIXELSIZE != 3) * 16; ++ case JCS_EXT_BGR: ++ return 3 + (EXT_BGR_PIXELSIZE != 3) * 16; ++ case JCS_EXT_BGRX: ++ case JCS_EXT_BGRA: ++ return 4 + (EXT_BGRX_PIXELSIZE != 3) * 16; ++ case JCS_EXT_XBGR: ++ case JCS_EXT_ABGR: ++ return 5 + (EXT_XBGR_PIXELSIZE != 3) * 16; ++ case JCS_EXT_XRGB: ++ case JCS_EXT_ARGB: ++ return 6 + (EXT_XRGB_PIXELSIZE != 3) * 16; ++ default: ++ break; ++ } ++ return 0 + (RGB_PIXELSIZE != 3) * 16; ++} ++ ++GLOBAL(int) ++jsimd_can_rgb_ycc(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (BITS_IN_JSAMPLE != 8) ++ return 0; ++ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(int) ++jsimd_can_rgb_gray(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (BITS_IN_JSAMPLE != 8) ++ return 0; ++ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(int) ++jsimd_can_ycc_rgb(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (BITS_IN_JSAMPLE != 8) ++ return 0; ++ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(int) ++jsimd_can_ycc_rgb565(void) ++{ ++ return 0; ++} ++ ++GLOBAL(void) ++jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, ++ JSAMPIMAGE output_buf, JDIMENSION output_row, ++ int num_rows) ++{ ++ void (*e2kfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int, int); ++ int idx = color_space_idx(cinfo->in_color_space); ++ ++ e2kfct = idx < 16 ? jsimd_rgb3_ycc_convert_e2k : ++ jsimd_rgb4_ycc_convert_e2k; ++ idx &= 15; ++ ++ e2kfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows, idx); ++} ++ ++GLOBAL(void) ++jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, ++ JSAMPIMAGE output_buf, JDIMENSION output_row, ++ int num_rows) ++{ ++ void (*e2kfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int, int); ++ int idx = color_space_idx(cinfo->in_color_space); ++ ++ e2kfct = idx < 16 ? jsimd_rgb3_gray_convert_e2k : ++ jsimd_rgb4_gray_convert_e2k; ++ idx &= 15; ++ ++ e2kfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows, idx); ++} ++ ++GLOBAL(void) ++jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, ++ JDIMENSION input_row, JSAMPARRAY output_buf, ++ int num_rows) ++{ ++ void (*e2kfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int, int); ++ int idx = color_space_idx(cinfo->out_color_space); ++ ++ e2kfct = idx < 16 ? 
jsimd_ycc_rgb3_convert_e2k : ++ jsimd_ycc_rgb4_convert_e2k; ++ idx &= 15; ++ ++ e2kfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows, idx); ++} ++ ++GLOBAL(void) ++jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, ++ JDIMENSION input_row, JSAMPARRAY output_buf, ++ int num_rows) ++{ ++} ++ ++GLOBAL(int) ++jsimd_can_h2v2_downsample(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (BITS_IN_JSAMPLE != 8) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(int) ++jsimd_can_h2v1_downsample(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (BITS_IN_JSAMPLE != 8) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(void) ++jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr, ++ JSAMPARRAY input_data, JSAMPARRAY output_data) ++{ ++ jsimd_h2v2_downsample_e2k(cinfo->image_width, cinfo->max_v_samp_factor, ++ compptr->v_samp_factor, ++ compptr->width_in_blocks, input_data, ++ output_data); ++} ++ ++GLOBAL(void) ++jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr, ++ JSAMPARRAY input_data, JSAMPARRAY output_data) ++{ ++ jsimd_h2v1_downsample_e2k(cinfo->image_width, cinfo->max_v_samp_factor, ++ compptr->v_samp_factor, ++ compptr->width_in_blocks, input_data, ++ output_data); ++} ++ ++GLOBAL(int) ++jsimd_can_h2v2_upsample(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (BITS_IN_JSAMPLE != 8) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(int) ++jsimd_can_h2v1_upsample(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (BITS_IN_JSAMPLE != 8) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(void) ++jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, ++ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) ++{ ++ jsimd_h2v2_upsample_e2k(cinfo->max_v_samp_factor, cinfo->output_width, ++ input_data, output_data_ptr); ++} ++ ++GLOBAL(void) ++jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, ++ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) ++{ ++ jsimd_h2v1_upsample_e2k(cinfo->max_v_samp_factor, cinfo->output_width, ++ input_data, output_data_ptr); ++} ++ ++GLOBAL(int) ++jsimd_can_h2v2_fancy_upsample(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (BITS_IN_JSAMPLE != 8) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(int) ++jsimd_can_h2v1_fancy_upsample(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (BITS_IN_JSAMPLE != 8) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(void) ++jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, ++ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) ++{ ++ jsimd_h2v2_fancy_upsample_e2k(cinfo->max_v_samp_factor, ++ compptr->downsampled_width, input_data, ++ output_data_ptr); ++} ++ ++GLOBAL(void) ++jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, ++ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) ++{ ++ jsimd_h2v1_fancy_upsample_e2k(cinfo->max_v_samp_factor, ++ compptr->downsampled_width, input_data, ++ output_data_ptr); ++} ++ ++GLOBAL(int) ++jsimd_can_h2v2_merged_upsample(void) ++{ ++ 
init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (BITS_IN_JSAMPLE != 8) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(int) ++jsimd_can_h2v1_merged_upsample(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (BITS_IN_JSAMPLE != 8) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(void) ++jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, ++ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf) ++{ ++ void (*e2kfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JDIMENSION, JSAMPARRAY, int); ++ int idx = color_space_idx(cinfo->out_color_space); ++ ++ e2kfct = idx < 16 ? jsimd_ycc_rgb3_merged_upsample_e2k : ++ jsimd_ycc_rgb4_merged_upsample_e2k; ++ idx &= 15; ++ ++ e2kfct(cinfo->output_width, input_buf, in_row_group_ctr, ++ in_row_group_ctr * 2, output_buf, idx); ++ e2kfct(cinfo->output_width, input_buf, in_row_group_ctr, ++ in_row_group_ctr * 2 + 1, output_buf + 1, idx); ++} ++ ++GLOBAL(void) ++jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, ++ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf) ++{ ++ void (*e2kfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JDIMENSION, JSAMPARRAY, int); ++ int idx = color_space_idx(cinfo->out_color_space); ++ ++ e2kfct = idx < 16 ? jsimd_ycc_rgb3_merged_upsample_e2k : ++ jsimd_ycc_rgb4_merged_upsample_e2k; ++ idx &= 15; ++ ++ e2kfct(cinfo->output_width, input_buf, in_row_group_ctr, ++ in_row_group_ctr, output_buf, idx); ++} ++ ++GLOBAL(int) ++jsimd_can_convsamp(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (DCTSIZE != 8) ++ return 0; ++ if (BITS_IN_JSAMPLE != 8) ++ return 0; ++ if (sizeof(DCTELEM) != 2) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(int) ++jsimd_can_convsamp_float(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (DCTSIZE != 8) ++ return 0; ++ if (BITS_IN_JSAMPLE != 8) ++ return 0; ++ if (sizeof(FAST_FLOAT) != 4) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(void) ++jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col, ++ DCTELEM *workspace) ++{ ++ jsimd_convsamp_e2k(sample_data, start_col, workspace); ++} ++ ++GLOBAL(void) ++jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col, ++ FAST_FLOAT *workspace) ++{ ++ jsimd_convsamp_float_e2k(sample_data, start_col, workspace); ++} ++ ++GLOBAL(int) ++jsimd_can_fdct_islow(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (DCTSIZE != 8) ++ return 0; ++ if (sizeof(DCTELEM) != 2) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(int) ++jsimd_can_fdct_ifast(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (DCTSIZE != 8) ++ return 0; ++ if (sizeof(DCTELEM) != 2) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(int) ++jsimd_can_fdct_float(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (DCTSIZE != 8) ++ return 0; ++ if (sizeof(FAST_FLOAT) != 4) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(void) ++jsimd_fdct_islow(DCTELEM *data) ++{ ++ jsimd_fdct_islow_e2k(data); ++} ++ ++GLOBAL(void) ++jsimd_fdct_ifast(DCTELEM *data) ++{ ++ jsimd_fdct_ifast_e2k(data); ++} ++ ++GLOBAL(void) 
++jsimd_fdct_float(FAST_FLOAT *data) ++{ ++ jsimd_fdct_float_e2k(data); ++} ++ ++GLOBAL(int) ++jsimd_can_quantize(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (DCTSIZE != 8) ++ return 0; ++ if (sizeof(JCOEF) != 2) ++ return 0; ++ if (sizeof(DCTELEM) != 2) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(int) ++jsimd_can_quantize_float(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (DCTSIZE != 8) ++ return 0; ++ if (sizeof(JCOEF) != 2) ++ return 0; ++ if (sizeof(FAST_FLOAT) != 4) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(void) ++jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace) ++{ ++ jsimd_quantize_e2k(coef_block, divisors, workspace); ++} ++ ++GLOBAL(void) ++jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors, ++ FAST_FLOAT *workspace) ++{ ++ jsimd_quantize_float_e2k(coef_block, divisors, workspace); ++} ++ ++GLOBAL(int) ++jsimd_can_idct_2x2(void) ++{ ++ return 0; ++} ++ ++GLOBAL(int) ++jsimd_can_idct_4x4(void) ++{ ++ return 0; ++} ++ ++GLOBAL(void) ++jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr, ++ JCOEFPTR coef_block, JSAMPARRAY output_buf, ++ JDIMENSION output_col) ++{ ++} ++ ++GLOBAL(void) ++jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr, ++ JCOEFPTR coef_block, JSAMPARRAY output_buf, ++ JDIMENSION output_col) ++{ ++} ++ ++GLOBAL(int) ++jsimd_can_idct_islow(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (DCTSIZE != 8) ++ return 0; ++ if (sizeof(JCOEF) != 2) ++ return 0; ++ if (BITS_IN_JSAMPLE != 8) ++ return 0; ++ if (sizeof(ISLOW_MULT_TYPE) != 2) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(int) ++jsimd_can_idct_ifast(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (DCTSIZE != 8) ++ return 0; ++ if (sizeof(JCOEF) != 2) ++ return 0; ++ if (BITS_IN_JSAMPLE != 8) ++ return 0; ++ if (sizeof(IFAST_MULT_TYPE) != 2) ++ return 0; ++ if (IFAST_SCALE_BITS != 2) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(int) ++jsimd_can_idct_float(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (DCTSIZE != 8) ++ return 0; ++ if (sizeof(JCOEF) != 2) ++ return 0; ++ if (BITS_IN_JSAMPLE != 8) ++ return 0; ++ if (sizeof(FAST_FLOAT) != 4) ++ return 0; ++ if (sizeof(FLOAT_MULT_TYPE) != 4) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(void) ++jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr, ++ JCOEFPTR coef_block, JSAMPARRAY output_buf, ++ JDIMENSION output_col) ++{ ++ jsimd_idct_islow_e2k(compptr->dct_table, coef_block, output_buf, ++ output_col); ++} ++ ++GLOBAL(void) ++jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr, ++ JCOEFPTR coef_block, JSAMPARRAY output_buf, ++ JDIMENSION output_col) ++{ ++ jsimd_idct_ifast_e2k(compptr->dct_table, coef_block, output_buf, ++ output_col); ++} ++ ++GLOBAL(void) ++jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr, ++ JCOEFPTR coef_block, JSAMPARRAY output_buf, ++ JDIMENSION output_col) ++{ ++ jsimd_idct_float_e2k(compptr->dct_table, coef_block, output_buf, ++ output_col); ++} ++ ++GLOBAL(int) ++jsimd_can_huff_encode_one_block(void) ++{ ++ init_simd(); ++ ++ if (DCTSIZE != 8) ++ return 0; ++ 
if (sizeof(JCOEF) != 2) ++ return 0; ++ ++ if ((simd_support & JSIMD_SSE2) && simd_huffman) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(JOCTET *) ++jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block, ++ int last_dc_val, c_derived_tbl *dctbl, ++ c_derived_tbl *actbl) ++{ ++ return jsimd_huff_encode_one_block_e2k(state, buffer, block, last_dc_val, ++ dctbl, actbl); ++} ++ ++GLOBAL(int) ++jsimd_can_encode_mcu_AC_first_prepare(void) ++{ ++ init_simd(); ++ ++ if (DCTSIZE != 8) ++ return 0; ++ if (sizeof(JCOEF) != 2) ++ return 0; ++ if ((simd_support & JSIMD_SSE2) && simd_huffman) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(void) ++jsimd_encode_mcu_AC_first_prepare(const JCOEF *block, ++ const int *jpeg_natural_order_start, int Sl, ++ int Al, UJCOEF *values, size_t *zerobits) ++{ ++ jsimd_encode_mcu_AC_first_prepare_e2k(block, jpeg_natural_order_start, ++ Sl, Al, (JCOEF*)values, zerobits); ++} ++ ++GLOBAL(int) ++jsimd_can_encode_mcu_AC_refine_prepare(void) ++{ ++ init_simd(); ++ ++ if (DCTSIZE != 8) ++ return 0; ++ if (sizeof(JCOEF) != 2) ++ return 0; ++ if ((simd_support & JSIMD_SSE2) && simd_huffman) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(int) ++jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block, ++ const int *jpeg_natural_order_start, int Sl, ++ int Al, UJCOEF *absvalues, size_t *bits) ++{ ++ return jsimd_encode_mcu_AC_refine_prepare_e2k(block, ++ jpeg_natural_order_start, ++ Sl, Al, (JCOEF*)absvalues, bits); ++} +diff --git a/simd/e2k/jsimd_api_e2k.h b/simd/e2k/jsimd_api_e2k.h +new file mode 100644 +index 0000000..d857203 +--- /dev/null ++++ b/simd/e2k/jsimd_api_e2k.h +@@ -0,0 +1,94 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. ++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. 
++ */ ++ ++/* Function declarations */ ++ ++#define CONVERT_DECL(n) \ ++EXTERN(void) jsimd_rgb##n##_ycc_convert_e2k(JDIMENSION img_width, \ ++ JSAMPARRAY input_buf, JSAMPIMAGE output_buf, \ ++ JDIMENSION output_row, int num_rows, int shuf_idx); \ ++EXTERN(void) jsimd_rgb##n##_gray_convert_e2k(JDIMENSION img_width, \ ++ JSAMPARRAY input_buf, JSAMPIMAGE output_buf, \ ++ JDIMENSION output_row, int num_rows, int shuf_idx); \ ++EXTERN(void) jsimd_ycc_rgb##n##_convert_e2k(JDIMENSION out_width, \ ++ JSAMPIMAGE input_buf, JDIMENSION input_row, \ ++ JSAMPARRAY output_buf, int num_rows, int shuf_idx); \ ++EXTERN(void) jsimd_ycc_rgb##n##_convert_e2k(JDIMENSION out_width, \ ++ JSAMPIMAGE input_buf, JDIMENSION input_row, JSAMPARRAY output_buf, \ ++ int num_rows, int shuf_idx); \ ++EXTERN(void) jsimd_ycc_rgb##n##_merged_upsample_e2k(JDIMENSION out_width, \ ++ JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, \ ++ JDIMENSION in_row_group_ctr_y, JSAMPARRAY output_buf, int shuf_idx); \ ++ ++CONVERT_DECL(3) ++CONVERT_DECL(4) ++ ++EXTERN(void) jsimd_h2v1_downsample_e2k ++ (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor, ++ JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data); ++EXTERN(void) jsimd_h2v2_downsample_e2k ++ (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor, ++ JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data); ++ ++#define UPSAMPLE_DECL(name) \ ++EXTERN(void) jsimd_##name##_upsample_e2k \ ++ (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data, \ ++ JSAMPARRAY *output_data_ptr); ++ ++UPSAMPLE_DECL(h2v1) ++UPSAMPLE_DECL(h2v2) ++UPSAMPLE_DECL(h2v1_fancy) ++UPSAMPLE_DECL(h2v2_fancy) ++ ++EXTERN(void) jsimd_convsamp_e2k ++ (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace); ++EXTERN(void) jsimd_convsamp_float_e2k ++ (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace); ++ ++EXTERN(void) jsimd_fdct_islow_e2k(DCTELEM *data); ++EXTERN(void) jsimd_fdct_ifast_e2k(DCTELEM *data); ++EXTERN(void) jsimd_fdct_float_e2k(FAST_FLOAT *data); ++EXTERN(void) jsimd_quantize_e2k ++ (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace); ++EXTERN(void) jsimd_quantize_float_e2k ++ (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace); ++EXTERN(void) jsimd_idct_islow_e2k ++ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, ++ JDIMENSION output_col); ++EXTERN(void) jsimd_idct_ifast_e2k ++ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, ++ JDIMENSION output_col); ++EXTERN(void) jsimd_idct_float_e2k ++ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, ++ JDIMENSION output_col); ++ ++EXTERN(JOCTET *) jsimd_huff_encode_one_block_e2k ++ (void *state, JOCTET *buffer, JCOEFPTR block, ++ int last_dc_val, c_derived_tbl *dctbl, c_derived_tbl *actbl); ++ ++EXTERN(void) jsimd_encode_mcu_AC_first_prepare_e2k ++ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al, ++ JCOEF *values, size_t *zerobits); ++ ++EXTERN(int) jsimd_encode_mcu_AC_refine_prepare_e2k ++ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al, ++ JCOEF *absvalues, size_t *bits); +diff --git a/simd/e2k/jsimd_e2k.h b/simd/e2k/jsimd_e2k.h +new file mode 100644 +index 0000000..15d6262 +--- /dev/null ++++ b/simd/e2k/jsimd_e2k.h +@@ -0,0 +1,207 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved. 
++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd
++ *
++ * This software is provided 'as-is', without any express or implied
++ * warranty. In no event will the authors be held liable for any damages
++ * arising from the use of this software.
++ *
++ * Permission is granted to anyone to use this software for any purpose,
++ * including commercial applications, and to alter it and redistribute it
++ * freely, subject to the following restrictions:
++ *
++ * 1. The origin of this software must not be misrepresented; you must not
++ *    claim that you wrote the original software. If you use this software
++ *    in a product, an acknowledgment in the product documentation would be
++ *    appreciated but is not required.
++ * 2. Altered source versions must be plainly marked as such, and must not be
++ *    misrepresented as being the original software.
++ * 3. This notice may not be removed or altered from any source distribution.
++ */
++
++#define JPEG_INTERNALS
++#include "../../jinclude.h"
++#include "../../jpeglib.h"
++#include "../../jsimd.h"
++#include "../../jdct.h"
++#include "../../jsimddct.h"
++#include "../jsimd.h"
++#include "jsimd_api_e2k.h"
++#include <stdint.h>
++#include <smmintrin.h> /* SSE4.1 */
++
++
++/* Common code */
++
++#define __4X2(a, b) a, b, a, b, a, b, a, b
++#define ALWAYS_INLINE __attribute__((__always_inline__)) inline
++
++#ifdef __e2k__
++#define PRAGMA_E2K _Pragma
++#define _mm_shuffle2_pi8(a, b, c) \
++  ((__m64)__builtin_e2k_pshufb((uint64_t)(b), (uint64_t)(a), (uint64_t)(c)))
++#define _mm_shuffle2_epi8(a, b, c) \
++  ((__m128i)__builtin_e2k_qppermb((__v2di)(b), (__v2di)(a), (__v2di)(c)))
++#define _mm_blendv_pi8(a, b, c) \
++  ((__m64)__builtin_e2k_pmerge((uint64_t)(a), (uint64_t)(b), (uint64_t)(c)))
++#else
++#define PRAGMA_E2K(x)
++#define _mm_shuffle2_pi8(a, b, c) \
++  _mm_movepi64_pi64(_mm_shuffle_epi8(_mm_unpacklo_epi64( \
++    _mm_movpi64_epi64(a), _mm_movpi64_epi64(b)), _mm_movpi64_epi64(c)))
++#define _mm_shuffle2_epi8(a, b, c) \
++  _mm_blendv_epi8(_mm_shuffle_epi8(a, c), _mm_shuffle_epi8(b, c), \
++    _mm_slli_epi16(c, 3))
++#define _mm_blendv_pi8(a, b, c) \
++  _mm_movepi64_pi64(_mm_blendv_epi8(_mm_movpi64_epi64(a), \
++    _mm_movpi64_epi64(b), _mm_movpi64_epi64(c)))
++
++#define BITREV_ROUND(c, i) a = (a & c) << i | (a >> i & c);
++static ALWAYS_INLINE uint64_t __builtin_e2k_bitrevd(uint64_t a) {
++  BITREV_ROUND(0x5555555555555555ll, 1)
++  BITREV_ROUND(0x3333333333333333ll, 2)
++  BITREV_ROUND(0x0F0F0F0F0F0F0F0Fll, 4)
++  BITREV_ROUND(0x00FF00FF00FF00FFll, 8)
++  BITREV_ROUND(0x0000FFFF0000FFFFll, 16)
++  return a >> 32 | a << 32;
++}
++
++static ALWAYS_INLINE uint64_t __builtin_e2k_insfd(uint64_t a, uint64_t b, uint64_t c) {
++  int n = b & 63;
++  a = a >> n | a << (64 - n);
++  return c ^ ((a ^ c) & (~0ll << (b >> 6 & 63)));
++}
++#endif
++
++#if defined(__iset__) && __iset__ >= 5
++static ALWAYS_INLINE __m128i _mm_packhi_epi32(__m128i a, __m128i b) {
++  __m128i index = _mm_setr_epi8(
++    2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31);
++  return _mm_shuffle2_epi8(a, b, index);
++}
++
++#define VEC_ISZERO(a) !_mm_cvtsi128_si64(_mm_packs_epi16(a, a))
++#else
++static ALWAYS_INLINE __m128i _mm_packhi_epi32(__m128i a, __m128i b) {
++  union { __m128i v; __m64 d[2]; } l = { a }, h = { b }, x;
++  __m64 index = _mm_setr_pi8(2, 3, 6, 7, 10, 11, 14, 15);
++  x.d[0] = _mm_shuffle2_pi8(l.d[0], l.d[1], index);
++  x.d[1] = _mm_shuffle2_pi8(h.d[0], h.d[1], index);
++  return x.v;
++}
++
++static ALWAYS_INLINE uint64_t vec_isnonzero(__m128i a) {
++  __v2di x = (__v2di)a;
++  return x[0] |
x[1]; ++} ++ ++#define VEC_ISZERO(a) !vec_isnonzero(a) ++#endif ++ ++#define VEC_ALIGNR8(a, b) _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(b), _mm_castsi128_pd(a), 1)) ++ ++#define TRANSPOSE_FLOAT(a, b, c, d, e, f, g, h) \ ++ tmp0 = _mm_unpacklo_ps(a, b); \ ++ tmp1 = _mm_unpackhi_ps(a, b); \ ++ tmp2 = _mm_unpacklo_ps(c, d); \ ++ tmp3 = _mm_unpackhi_ps(c, d); \ ++ e = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); \ ++ f = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); \ ++ g = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3))); \ ++ h = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3))); ++ ++#define TRANSPOSE8(a, b) \ ++ b##0 = _mm_unpacklo_epi8(a##0, a##2); \ ++ b##1 = _mm_unpackhi_epi8(a##0, a##2); \ ++ b##2 = _mm_unpacklo_epi8(a##1, a##3); \ ++ b##3 = _mm_unpackhi_epi8(a##1, a##3); ++ ++#define TRANSPOSE16(a, b) \ ++ b##0 = _mm_unpacklo_epi16(a##0, a##2); \ ++ b##1 = _mm_unpackhi_epi16(a##0, a##2); \ ++ b##2 = _mm_unpacklo_epi16(a##1, a##3); \ ++ b##3 = _mm_unpackhi_epi16(a##1, a##3); \ ++ b##4 = _mm_unpacklo_epi16(a##4, a##6); \ ++ b##5 = _mm_unpackhi_epi16(a##4, a##6); \ ++ b##6 = _mm_unpacklo_epi16(a##5, a##7); \ ++ b##7 = _mm_unpackhi_epi16(a##5, a##7); ++ ++#define TRANSPOSE(a, b) \ ++ TRANSPOSE16(a, b) TRANSPOSE16(b, a) \ ++ b##0 = _mm_unpacklo_epi64(a##0, a##4); \ ++ b##1 = _mm_unpackhi_epi64(a##0, a##4); \ ++ b##2 = _mm_unpacklo_epi64(a##1, a##5); \ ++ b##3 = _mm_unpackhi_epi64(a##1, a##5); \ ++ b##4 = _mm_unpacklo_epi64(a##2, a##6); \ ++ b##5 = _mm_unpackhi_epi64(a##2, a##6); \ ++ b##6 = _mm_unpacklo_epi64(a##3, a##7); \ ++ b##7 = _mm_unpackhi_epi64(a##3, a##7); ++ ++#define IDCT_SAVE() { \ ++ __m128i pb_cj = _mm_set1_epi8((int8_t)CENTERJSAMPLE); \ ++ \ ++ row0 = _mm_xor_si128(_mm_packs_epi16(out0, out1), pb_cj); \ ++ row1 = _mm_xor_si128(_mm_packs_epi16(out2, out3), pb_cj); \ ++ row2 = _mm_xor_si128(_mm_packs_epi16(out4, out5), pb_cj); \ ++ row3 = _mm_xor_si128(_mm_packs_epi16(out6, out7), pb_cj); \ ++ \ ++ TRANSPOSE8(row, col) TRANSPOSE8(col, row) TRANSPOSE8(row, col) \ ++ \ ++ VEC_STL(output_buf[0] + output_col, col0); \ ++ VEC_STH(output_buf[1] + output_col, col0); \ ++ VEC_STL(output_buf[2] + output_col, col1); \ ++ VEC_STH(output_buf[3] + output_col, col1); \ ++ VEC_STL(output_buf[4] + output_col, col2); \ ++ VEC_STH(output_buf[5] + output_col, col2); \ ++ VEC_STL(output_buf[6] + output_col, col3); \ ++ VEC_STH(output_buf[7] + output_col, col3); \ ++} ++ ++#define IDCT_SPLAT8(col0) { \ ++ row3 = _mm_unpacklo_epi16(col0, col0); \ ++ row7 = _mm_unpackhi_epi16(col0, col0); \ ++ row1 = _mm_unpacklo_epi16(row3, row3); \ ++ row3 = _mm_unpackhi_epi16(row3, row3); \ ++ row5 = _mm_unpacklo_epi16(row7, row7); \ ++ row7 = _mm_unpackhi_epi16(row7, row7); \ ++ row0 = _mm_unpacklo_epi64(row1, row1); \ ++ row1 = _mm_unpackhi_epi64(row1, row1); \ ++ row2 = _mm_unpacklo_epi64(row3, row3); \ ++ row3 = _mm_unpackhi_epi64(row3, row3); \ ++ row4 = _mm_unpacklo_epi64(row5, row5); \ ++ row5 = _mm_unpackhi_epi64(row5, row5); \ ++ row6 = _mm_unpacklo_epi64(row7, row7); \ ++ row7 = _mm_unpackhi_epi64(row7, row7); \ ++} ++ ++#ifndef min ++#define min(a, b) ((a) < (b) ? 
(a) : (b)) ++#endif ++ ++#define VEC_LD(a) _mm_loadu_si128((const __m128i*)(a)) ++#define VEC_ST(a, b) _mm_storeu_si128((__m128i*)(a), b) ++#define VEC_LD8(a) _mm_loadl_epi64((const __m128i*)(a)) ++#define VEC_STL(a, b) _mm_storel_epi64((__m128i*)(a), b) ++#define VEC_STH(a, b) _mm_storeh_pd((double*)(a), _mm_castsi128_pd(b)); ++#define VEC_SPLAT(v, i) _mm_shuffle_epi8(v, _mm_set1_epi16((i) * 2 | ((i) * 2 + 1) << 8)) ++ ++#if !defined(__iset__) || __iset__ < 5 ++#define NEED_ALIGN8 ++#define ALIGN8_COMMON uint64_t src_shr; __m64 src_tmp0, src_tmp1; ++#define ALIGN8_VARS(src) __m64 *src##_ptr, src##_next, src##_index; ++#define ALIGN8_START(ptr, src) \ ++ src_shr = (intptr_t)(ptr - 1) & 7; \ ++ src##_ptr = (__m64*)((intptr_t)(ptr - 1) & -8); \ ++ src##_next = src##_ptr[src_shr == 7]; \ ++ src##_index = _mm_add_pi8(_mm_set1_pi8(src_shr), \ ++ _mm_setr_pi8(1, 2, 3, 4, 5, 6, 7, 8)); ++#define ALIGN8_READ16(v0, src, i) \ ++ src_tmp1 = src##_ptr[i * 2 + 1]; \ ++ src_tmp0 = _mm_shuffle2_pi8(src##_next, src_tmp1, src##_index); \ ++ src##_next = src##_ptr[i * 2 + 2]; \ ++ src_tmp1 = _mm_shuffle2_pi8(src_tmp1, src##_next, src##_index); \ ++ v0 = _mm_setr_epi64(src_tmp0, src_tmp1); ++#endif ++ +-- +2.34.1 + diff --git a/libjpeg-turbo.spec b/libjpeg-turbo.spec index 6497ff7..751c31c 100644 --- a/libjpeg-turbo.spec +++ b/libjpeg-turbo.spec @@ -25,6 +25,7 @@ Source0: https://sourceforge.net/projects/libjpeg-turbo/files/%{version}/%{name} Source2: http://jpegclub.org/jpegexiforient.c Source3: http://jpegclub.org/exifautotran.txt Patch0: jpeg-6b-c++fixes.patch +Patch1: libjpeg-turbo-3.0.2-e2k.patch BuildRequires: cmake BuildRequires: libtool >= 1.4 %ifarch %{ix86} x86_64 @@ -159,6 +160,7 @@ have orientation markings in the EXIF data. %prep %setup -q %patch0 -p0 +%patch1 -p1 cp %{SOURCE2} jpegexiforient.c cp %{SOURCE3} exifautotran @@ -174,7 +176,7 @@ pushd jpeg8 CFLAGS="%{optflags} -Ofast -funroll-loops" \ %cmake ../.. 
-DWITH_JPEG8="True" -%make -s +%make_build -s popd # Build jpeg v6.2 API @@ -184,7 +186,7 @@ CFLAGS="%{optflags} -Ofast -funroll-loops" \ -DWITH_ARITH_DEC="True" \ -DWITH_ARITH_ENC="True" -%make -s +%make_build -s popd # Build jpegexiforient binary @@ -192,8 +194,8 @@ popd %install -%makeinstall_std -C jpeg8/build -%makeinstall_std -C jpeg62/build +%make_install -C jpeg8/build +%make_install -C jpeg62/build install -m755 jpegexiforient -D %{buildroot}%{_bindir}/jpegexiforient install -m755 exifautotran -D %{buildroot}%{_bindir}/exifautotran From 2c029484f08f9ab4e05d9bac1435e98abf06da31 Mon Sep 17 00:00:00 2001 From: Alexander Stefanov Date: Sat, 18 May 2024 12:13:00 +0000 Subject: [PATCH 28/32] 3.0.3 --- .abf.yml | 2 +- libjpeg-turbo-3.0.2-e2k.patch | 4852 +-------------------------------- libjpeg-turbo.spec | 4 +- 3 files changed, 9 insertions(+), 4849 deletions(-) diff --git a/.abf.yml b/.abf.yml index a52b704..3b105b8 100644 --- a/.abf.yml +++ b/.abf.yml @@ -1,2 +1,2 @@ sources: - libjpeg-turbo-3.0.2.tar.gz: b6c5d5081ced8502eb1e1e72f1f5cc2856ce90ee + 3.0.3.tar.gz: 397a31222105129c9e798efce2459c445048546e diff --git a/libjpeg-turbo-3.0.2-e2k.patch b/libjpeg-turbo-3.0.2-e2k.patch index a80f1d3..888624d 100644 --- a/libjpeg-turbo-3.0.2-e2k.patch +++ b/libjpeg-turbo-3.0.2-e2k.patch @@ -1,60 +1,8 @@ -From 5c6ff06bc9aec237e1ba222a3dde057cbfa81c9d Mon Sep 17 00:00:00 2001 -From: Ilya Kurdyukov -Date: Fri, 9 Feb 2024 09:57:55 +0700 -Subject: [PATCH] libjpeg-turbo-3.0.2 e2k support - ---- - CMakeLists.txt | 5 + - simd/CMakeLists.txt | 23 ++ - simd/e2k/jccolext-e2k.c | 213 +++++++++++ - simd/e2k/jccolor-e2k.c | 163 +++++++++ - simd/e2k/jchuff-e2k.c | 307 ++++++++++++++++ - simd/e2k/jcphuff-e2k.c | 145 ++++++++ - simd/e2k/jcsample-e2k.c | 203 +++++++++++ - simd/e2k/jcsample.h | 28 ++ - simd/e2k/jdcolext-e2k.c | 258 +++++++++++++ - simd/e2k/jdcolor-e2k.c | 289 +++++++++++++++ - simd/e2k/jdcoltab-e2k.c | 80 ++++ - simd/e2k/jdsample-e2k.c | 389 ++++++++++++++++++++ - simd/e2k/jfdctflt-e2k.c | 127 +++++++ - simd/e2k/jfdctfst-e2k.c | 145 ++++++++ - simd/e2k/jfdctint-e2k.c | 255 +++++++++++++ - simd/e2k/jidctflt-e2k.c | 215 +++++++++++ - simd/e2k/jidctfst-e2k.c | 187 ++++++++++ - simd/e2k/jidctint-e2k.c | 294 +++++++++++++++ - simd/e2k/jquantf-e2k.c | 121 +++++++ - simd/e2k/jquanti-e2k.c | 178 +++++++++ - simd/e2k/jsimd.c | 761 +++++++++++++++++++++++++++++++++++++++ - simd/e2k/jsimd_api_e2k.h | 94 +++++ - simd/e2k/jsimd_e2k.h | 207 +++++++++++ - 23 files changed, 4687 insertions(+) - create mode 100644 simd/e2k/jccolext-e2k.c - create mode 100644 simd/e2k/jccolor-e2k.c - create mode 100644 simd/e2k/jchuff-e2k.c - create mode 100644 simd/e2k/jcphuff-e2k.c - create mode 100644 simd/e2k/jcsample-e2k.c - create mode 100644 simd/e2k/jcsample.h - create mode 100644 simd/e2k/jdcolext-e2k.c - create mode 100644 simd/e2k/jdcolor-e2k.c - create mode 100644 simd/e2k/jdcoltab-e2k.c - create mode 100644 simd/e2k/jdsample-e2k.c - create mode 100644 simd/e2k/jfdctflt-e2k.c - create mode 100644 simd/e2k/jfdctfst-e2k.c - create mode 100644 simd/e2k/jfdctint-e2k.c - create mode 100644 simd/e2k/jidctflt-e2k.c - create mode 100644 simd/e2k/jidctfst-e2k.c - create mode 100644 simd/e2k/jidctint-e2k.c - create mode 100644 simd/e2k/jquantf-e2k.c - create mode 100644 simd/e2k/jquanti-e2k.c - create mode 100644 simd/e2k/jsimd.c - create mode 100644 simd/e2k/jsimd_api_e2k.h - create mode 100644 simd/e2k/jsimd_e2k.h - diff --git a/CMakeLists.txt b/CMakeLists.txt -index adb0ca4..3b445a0 100644 +index ff9c9c2..fa0364c 100644 --- 
a/CMakeLists.txt +++ b/CMakeLists.txt -@@ -125,6 +125,9 @@ elseif(CMAKE_SYSTEM_PROCESSOR_LC STREQUAL "aarch64" OR +@@ -120,6 +120,9 @@ elseif(CMAKE_SYSTEM_PROCESSOR_LC STREQUAL "aarch64" OR elseif(CMAKE_SYSTEM_PROCESSOR_LC MATCHES "^ppc" OR CMAKE_SYSTEM_PROCESSOR_LC MATCHES "^powerpc") set(CPU_TYPE powerpc) @@ -64,20 +12,20 @@ index adb0ca4..3b445a0 100644 else() set(CPU_TYPE ${CMAKE_SYSTEM_PROCESSOR_LC}) endif() -@@ -906,6 +909,8 @@ if(CPU_TYPE STREQUAL "x86_64" OR CPU_TYPE STREQUAL "i386") +@@ -926,6 +929,8 @@ if(CPU_TYPE STREQUAL "x86_64" OR CPU_TYPE STREQUAL "i386") elseif(CPU_TYPE STREQUAL "x86_64") set(DEFAULT_FLOATTEST8 no-fp-contract) endif() +elseif(WITH_SIMD AND CPU_TYPE STREQUAL "e2k") + set(DEFAULT_FLOATTEST8 sse) elseif(CPU_TYPE STREQUAL "powerpc" OR CPU_TYPE STREQUAL "arm64") - if(CMAKE_C_COMPILER_ID STREQUAL "Clang") + if(CMAKE_C_COMPILER_ID MATCHES "Clang") if(CMAKE_C_COMPILER_VERSION VERSION_EQUAL 14.0.0 OR diff --git a/simd/CMakeLists.txt b/simd/CMakeLists.txt -index 6024900..789fa3f 100644 +index 0237955..e61baea 100644 --- a/simd/CMakeLists.txt +++ b/simd/CMakeLists.txt -@@ -531,6 +531,29 @@ if(CMAKE_POSITION_INDEPENDENT_CODE OR ENABLE_SHARED) +@@ -543,6 +543,29 @@ if(CMAKE_POSITION_INDEPENDENT_CODE OR ENABLE_SHARED) set_target_properties(simd PROPERTIES POSITION_INDEPENDENT_CODE 1) endif() @@ -107,4791 +55,3 @@ index 6024900..789fa3f 100644 ############################################################################### # None -diff --git a/simd/e2k/jccolext-e2k.c b/simd/e2k/jccolext-e2k.c -new file mode 100644 -index 0000000..49abdb4 ---- /dev/null -+++ b/simd/e2k/jccolext-e2k.c -@@ -0,0 +1,213 @@ -+/* -+ * Elbrus optimizations for libjpeg-turbo -+ * -+ * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved. -+ * Copyright (C) 2014, Jay Foad. All Rights Reserved. -+ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd -+ * -+ * This software is provided 'as-is', without any express or implied -+ * warranty. In no event will the authors be held liable for any damages -+ * arising from the use of this software. -+ * -+ * Permission is granted to anyone to use this software for any purpose, -+ * including commercial applications, and to alter it and redistribute it -+ * freely, subject to the following restrictions: -+ * -+ * 1. The origin of this software must not be misrepresented; you must not -+ * claim that you wrote the original software. If you use this software -+ * in a product, an acknowledgment in the product documentation would be -+ * appreciated but is not required. -+ * 2. Altered source versions must be plainly marked as such, and must not be -+ * misrepresented as being the original software. -+ * 3. This notice may not be removed or altered from any source distribution. 
-+ */ -+ -+/* This file is included by jccolor-e2k.c */ -+ -+void rgbn_ycc_convert(JDIMENSION img_width, JSAMPARRAY input_buf, -+ JSAMPIMAGE output_buf, JDIMENSION output_row, -+ int num_rows, int shuf_idx) -+{ -+ JSAMPROW inptr, outptr0, outptr1, outptr2; -+ unsigned char __attribute__((aligned(16))) tmpbuf[PIXELSIZE * 16]; -+ -+ __m128i pb_zero = _mm_setzero_si128(); -+ __m128i pb_shuf0 = VEC_LD(rgb_ycc_shuf_const[shuf_idx]); -+#if PIXELSIZE == 4 -+ __m128i rgb3 = pb_zero; -+#else -+ __m128i pb_shuf4 = VEC_LD(rgb_ycc_shuf_const[shuf_idx] + 16); -+#endif -+ __m128i rgb0, rgb1 = pb_zero, rgb2 = pb_zero, -+ rgbg0, rgbg1, rgbg2, rgbg3, rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3; -+ __m128i y, yl, yh, y0, y1, y2, y3; -+ __m128i cb, cr, crl, crh, cbl, cbh; -+ __m128i cr0, cr1, cr2, cr3, cb0, cb1, cb2, cb3; -+ -+ /* Constants */ -+ __m128i pw_f0299_f0337 = _mm_setr_epi16(__4X2(F_0_299, F_0_337)), -+ pw_f0114_f0250 = _mm_setr_epi16(__4X2(F_0_114, F_0_250)), -+ pw_mf016_mf033 = _mm_setr_epi16(__4X2(-F_0_168, -F_0_331)), -+ pw_mf008_mf041 = _mm_setr_epi16(__4X2(-F_0_081, -F_0_418)), -+ pw_mf050_f000 = _mm_setr_epi16(__4X2(-F_0_500, 0)), -+ pd_onehalf = _mm_set1_epi32(ONE_HALF), -+ pd_onehalfm1_cj = _mm_set1_epi32(ONE_HALF - 1 + (CENTERJSAMPLE << SCALEBITS)); -+#ifdef NEED_ALIGN8 -+ ALIGN8_COMMON -+ ALIGN8_VARS(src) -+#endif -+ -+ if (img_width > 0) -+ while (--num_rows >= 0) { -+ int num_cols; -+ inptr = *input_buf++; -+ outptr0 = output_buf[0][output_row]; -+ outptr1 = output_buf[1][output_row]; -+ outptr2 = output_buf[2][output_row]; -+ output_row++; -+ -+ if (img_width >= 16) { -+#ifdef NEED_ALIGN8 -+ ALIGN8_START(inptr, src) -+ inptr += (img_width & -16) * PIXELSIZE; -+#endif -+ -+ PRAGMA_E2K("ivdep") -+ for (num_cols = img_width; num_cols >= 16; num_cols -= 16, -+ outptr0 += 16, outptr1 += 16, outptr2 += 16) { -+#ifdef NEED_ALIGN8 -+ ALIGN8_READ16(rgb0, src, 0) -+ ALIGN8_READ16(rgb1, src, 1) -+ ALIGN8_READ16(rgb2, src, 2) -+#if PIXELSIZE == 4 -+ ALIGN8_READ16(rgb3, src, 3) -+#endif -+ src_ptr += PIXELSIZE * 2; -+#else -+ rgb0 = VEC_LD(inptr); -+ rgb1 = VEC_LD(inptr + 16); -+ rgb2 = VEC_LD(inptr + 32); -+#if PIXELSIZE == 4 -+ rgb3 = VEC_LD(inptr + 48); -+#endif -+ inptr += PIXELSIZE * 16; -+#endif -+ RGB_SHUFFLE -+ CALC_Y(outptr0) -+ CALC_CC(outptr1, outptr2) -+ } -+ } -+ -+ num_cols = img_width & 15; -+ if (num_cols) { -+ int i; -+ memcpy(tmpbuf, inptr, num_cols * PIXELSIZE); -+ rgb0 = VEC_LD(tmpbuf); -+ rgb1 = VEC_LD(tmpbuf + 16); -+ rgb2 = VEC_LD(tmpbuf + 32); -+#if PIXELSIZE == 4 -+ rgb3 = VEC_LD(tmpbuf + 48); -+#endif -+ RGB_SHUFFLE -+ CALC_Y(tmpbuf) -+ CALC_CC(tmpbuf + 16, tmpbuf + 32) -+ -+ for (i = 0; i < num_cols; i++) { -+ outptr0[i] = tmpbuf[i]; -+ outptr1[i] = tmpbuf[i + 16]; -+ outptr2[i] = tmpbuf[i + 32]; -+ } -+ } -+ } -+} -+ -+void rgbn_gray_convert(JDIMENSION img_width, JSAMPARRAY input_buf, -+ JSAMPIMAGE output_buf, JDIMENSION output_row, -+ int num_rows, int shuf_idx) -+{ -+ JSAMPROW inptr, outptr; -+ uint8_t __attribute__((aligned(16))) tmpbuf[PIXELSIZE * 16]; -+ -+ __m128i pb_zero = _mm_setzero_si128(); -+ __m128i pb_shuf0 = VEC_LD(rgb_ycc_shuf_const[shuf_idx]); -+#if PIXELSIZE == 4 -+ __m128i rgb3 = pb_zero; -+#else -+ __m128i pb_shuf4 = VEC_LD(rgb_ycc_shuf_const[shuf_idx] + 16); -+#endif -+ __m128i rgb0, rgb1 = pb_zero, rgb2 = pb_zero, -+ rgbg0, rgbg1, rgbg2, rgbg3, rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3; -+ __m128i y, yl, yh, y0, y1, y2, y3; -+ -+ /* Constants */ -+ __m128i pw_f0299_f0337 = _mm_setr_epi16(__4X2(F_0_299, F_0_337)), -+ pw_f0114_f0250 = 
_mm_setr_epi16(__4X2(F_0_114, F_0_250)), -+ pd_onehalf = _mm_set1_epi32(ONE_HALF); -+#ifdef NEED_ALIGN8 -+ ALIGN8_COMMON -+ ALIGN8_VARS(src) -+#endif -+ -+ if (img_width > 0) -+ while (--num_rows >= 0) { -+ int num_cols; -+ inptr = *input_buf++; -+ outptr = output_buf[0][output_row]; -+ output_row++; -+ -+ if (img_width >= 16) { -+#ifdef NEED_ALIGN8 -+ ALIGN8_START(inptr, src) -+ inptr += (img_width & -16) * PIXELSIZE; -+#endif -+ -+ PRAGMA_E2K("ivdep") -+ for (num_cols = img_width; num_cols >= 16; num_cols -= 16, -+ outptr += 16) { -+#ifdef NEED_ALIGN8 -+ ALIGN8_READ16(rgb0, src, 0) -+ ALIGN8_READ16(rgb1, src, 1) -+ ALIGN8_READ16(rgb2, src, 2) -+#if PIXELSIZE == 4 -+ ALIGN8_READ16(rgb3, src, 3) -+#endif -+ src_ptr += PIXELSIZE * 2; -+#else -+ rgb0 = VEC_LD(inptr); -+ rgb1 = VEC_LD(inptr + 16); -+ rgb2 = VEC_LD(inptr + 32); -+#if PIXELSIZE == 4 -+ rgb3 = VEC_LD(inptr + 48); -+#endif -+ inptr += PIXELSIZE * 16; -+#endif -+ RGB_SHUFFLE -+ CALC_Y(outptr) -+ } -+ } -+ -+ num_cols = img_width & 15; -+ if (num_cols) { -+ int i; -+ memcpy(tmpbuf, inptr, num_cols * PIXELSIZE); -+ rgb0 = VEC_LD(tmpbuf); -+ rgb1 = VEC_LD(tmpbuf + 16); -+ rgb2 = VEC_LD(tmpbuf + 32); -+#if PIXELSIZE == 4 -+ rgb3 = VEC_LD(tmpbuf + 48); -+#endif -+ RGB_SHUFFLE -+ CALC_Y(tmpbuf) -+ -+ for (i = 0; i < num_cols; i++) { -+ outptr[i] = tmpbuf[i]; -+ } -+ } -+ } -+} -+ -+#undef RGB_SHUFFLE -+#undef PIXELSIZE -+#undef rgbn_ycc_convert -+#undef rgbn_gray_convert -+ -diff --git a/simd/e2k/jccolor-e2k.c b/simd/e2k/jccolor-e2k.c -new file mode 100644 -index 0000000..0af2626 ---- /dev/null -+++ b/simd/e2k/jccolor-e2k.c -@@ -0,0 +1,163 @@ -+/* -+ * Elbrus optimizations for libjpeg-turbo -+ * -+ * Copyright (C) 2014, D. R. Commander. All Rights Reserved. -+ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd -+ * -+ * This software is provided 'as-is', without any express or implied -+ * warranty. In no event will the authors be held liable for any damages -+ * arising from the use of this software. -+ * -+ * Permission is granted to anyone to use this software for any purpose, -+ * including commercial applications, and to alter it and redistribute it -+ * freely, subject to the following restrictions: -+ * -+ * 1. The origin of this software must not be misrepresented; you must not -+ * claim that you wrote the original software. If you use this software -+ * in a product, an acknowledgment in the product documentation would be -+ * appreciated but is not required. -+ * 2. Altered source versions must be plainly marked as such, and must not be -+ * misrepresented as being the original software. -+ * 3. This notice may not be removed or altered from any source distribution. 
-+ */ -+ -+/* RGB --> YCC CONVERSION */ -+ -+#include "jsimd_e2k.h" -+ -+#define F_0_081 5329 /* FIX(0.08131) */ -+#define F_0_114 7471 /* FIX(0.11400) */ -+#define F_0_168 11059 /* FIX(0.16874) */ -+#define F_0_250 16384 /* FIX(0.25000) */ -+#define F_0_299 19595 /* FIX(0.29900) */ -+#define F_0_331 21709 /* FIX(0.33126) */ -+#define F_0_418 27439 /* FIX(0.41869) */ -+#define F_0_500 32768 /* FIX(0.50000) */ -+#define F_0_587 38470 /* FIX(0.58700) */ -+#define F_0_337 (F_0_587 - F_0_250) /* FIX(0.58700) - FIX(0.25000) */ -+#define F_0_413 (65536 - F_0_587) /* FIX(1.00000) - FIX(0.58700) */ -+ -+#define SCALEBITS 16 -+#define ONE_HALF (1 << (SCALEBITS - 1)) -+ -+#define RGBG_INDEX_(name, color, i, x) \ -+ name##_##color + i * name##_PIXELSIZE + x, \ -+ name##_GREEN + i * name##_PIXELSIZE + x -+#define RGBG_INDEX(name, x) \ -+ RGBG_INDEX_(name, RED, 0, x), RGBG_INDEX_(name, RED, 1, x), \ -+ RGBG_INDEX_(name, RED, 2, x), RGBG_INDEX_(name, RED, 3, x), \ -+ RGBG_INDEX_(name, BLUE, 0, x), RGBG_INDEX_(name, BLUE, 1, x), \ -+ RGBG_INDEX_(name, BLUE, 2, x), RGBG_INDEX_(name, BLUE, 3, x) -+ -+static const uint8_t __attribute__((aligned(16))) -+rgb_ycc_shuf_const[7][32] = { -+ { RGBG_INDEX(RGB, 0), RGBG_INDEX(RGB, 4) }, -+ { RGBG_INDEX(EXT_RGB, 0), RGBG_INDEX(EXT_RGB, 4) }, -+ { RGBG_INDEX(EXT_RGBX, 0), RGBG_INDEX(EXT_RGBX, 4) }, -+ { RGBG_INDEX(EXT_BGR, 0), RGBG_INDEX(EXT_BGR, 4) }, -+ { RGBG_INDEX(EXT_BGRX, 0), RGBG_INDEX(EXT_BGRX, 4) }, -+ { RGBG_INDEX(EXT_XBGR, 0), RGBG_INDEX(EXT_XBGR, 4) }, -+ { RGBG_INDEX(EXT_XRGB, 0), RGBG_INDEX(EXT_XRGB, 4) } -+}; -+ -+ /* rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3 -+ * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7 -+ * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb -+ * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf -+ * -+ * rg0 = R0 G0 R1 G1 R2 G2 R3 G3 -+ * bg0 = B0 G0 B1 G1 B2 G2 B3 G3 -+ * ... 
-+ */ -+ -+ /* (Original) -+ * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B -+ * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE -+ * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE -+ * -+ * (This implementation) -+ * Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G -+ * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE -+ * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE -+ */ -+ -+#define CALC_Y(outptr0) \ -+ rg0 = _mm_unpacklo_epi8(rgbg0, pb_zero); \ -+ bg0 = _mm_unpackhi_epi8(rgbg0, pb_zero); \ -+ rg1 = _mm_unpacklo_epi8(rgbg1, pb_zero); \ -+ bg1 = _mm_unpackhi_epi8(rgbg1, pb_zero); \ -+ rg2 = _mm_unpacklo_epi8(rgbg2, pb_zero); \ -+ bg2 = _mm_unpackhi_epi8(rgbg2, pb_zero); \ -+ rg3 = _mm_unpacklo_epi8(rgbg3, pb_zero); \ -+ bg3 = _mm_unpackhi_epi8(rgbg3, pb_zero); \ -+ \ -+ /* Calculate Y values */ \ -+ y0 = _mm_add_epi32(_mm_madd_epi16(rg0, pw_f0299_f0337), pd_onehalf); \ -+ y1 = _mm_add_epi32(_mm_madd_epi16(rg1, pw_f0299_f0337), pd_onehalf); \ -+ y2 = _mm_add_epi32(_mm_madd_epi16(rg2, pw_f0299_f0337), pd_onehalf); \ -+ y3 = _mm_add_epi32(_mm_madd_epi16(rg3, pw_f0299_f0337), pd_onehalf); \ -+ y0 = _mm_add_epi32(_mm_madd_epi16(bg0, pw_f0114_f0250), y0); \ -+ y1 = _mm_add_epi32(_mm_madd_epi16(bg1, pw_f0114_f0250), y1); \ -+ y2 = _mm_add_epi32(_mm_madd_epi16(bg2, pw_f0114_f0250), y2); \ -+ y3 = _mm_add_epi32(_mm_madd_epi16(bg3, pw_f0114_f0250), y3); \ -+ \ -+ yl = _mm_packhi_epi32(y0, y1); \ -+ yh = _mm_packhi_epi32(y2, y3); \ -+ y = _mm_packus_epi16(yl, yh); \ -+ VEC_ST(outptr0, y); -+ -+#define CALC_CC(outptr1, outptr2) \ -+ /* Calculate Cb values */ \ -+ cb0 = _mm_add_epi32(_mm_madd_epi16(rg0, pw_mf016_mf033), pd_onehalfm1_cj); \ -+ cb1 = _mm_add_epi32(_mm_madd_epi16(rg1, pw_mf016_mf033), pd_onehalfm1_cj); \ -+ cb2 = _mm_add_epi32(_mm_madd_epi16(rg2, pw_mf016_mf033), pd_onehalfm1_cj); \ -+ cb3 = _mm_add_epi32(_mm_madd_epi16(rg3, pw_mf016_mf033), pd_onehalfm1_cj); \ -+ cb0 = _mm_sub_epi32(cb0, _mm_madd_epi16(bg0, pw_mf050_f000)); \ -+ cb1 = _mm_sub_epi32(cb1, _mm_madd_epi16(bg1, pw_mf050_f000)); \ -+ cb2 = _mm_sub_epi32(cb2, _mm_madd_epi16(bg2, pw_mf050_f000)); \ -+ cb3 = _mm_sub_epi32(cb3, _mm_madd_epi16(bg3, pw_mf050_f000)); \ -+ \ -+ cbl = _mm_packhi_epi32(cb0, cb1); \ -+ cbh = _mm_packhi_epi32(cb2, cb3); \ -+ cb = _mm_packus_epi16(cbl, cbh); \ -+ VEC_ST(outptr1, cb); \ -+ \ -+ /* Calculate Cr values */ \ -+ cr0 = _mm_add_epi32(_mm_madd_epi16(bg0, pw_mf008_mf041), pd_onehalfm1_cj); \ -+ cr1 = _mm_add_epi32(_mm_madd_epi16(bg1, pw_mf008_mf041), pd_onehalfm1_cj); \ -+ cr2 = _mm_add_epi32(_mm_madd_epi16(bg2, pw_mf008_mf041), pd_onehalfm1_cj); \ -+ cr3 = _mm_add_epi32(_mm_madd_epi16(bg3, pw_mf008_mf041), pd_onehalfm1_cj); \ -+ cr0 = _mm_sub_epi32(cr0, _mm_madd_epi16(rg0, pw_mf050_f000)); \ -+ cr1 = _mm_sub_epi32(cr1, _mm_madd_epi16(rg1, pw_mf050_f000)); \ -+ cr2 = _mm_sub_epi32(cr2, _mm_madd_epi16(rg2, pw_mf050_f000)); \ -+ cr3 = _mm_sub_epi32(cr3, _mm_madd_epi16(rg3, pw_mf050_f000)); \ -+ \ -+ crl = _mm_packhi_epi32(cr0, cr1); \ -+ crh = _mm_packhi_epi32(cr2, cr3); \ -+ cr = _mm_packus_epi16(crl, crh); \ -+ VEC_ST(outptr2, cr); -+ -+ -+#define PIXELSIZE 3 -+#define RGB_SHUFFLE \ -+ rgbg0 = _mm_shuffle_epi8(rgb0, pb_shuf0); \ -+ rgbg1 = _mm_shuffle_epi8(VEC_ALIGNR8(rgb1, rgb0), pb_shuf4); \ -+ rgbg2 = _mm_shuffle_epi8(VEC_ALIGNR8(rgb2, rgb1), pb_shuf0); \ -+ rgbg3 = _mm_shuffle_epi8(rgb2, pb_shuf4); -+ -+#define rgbn_ycc_convert jsimd_rgb3_ycc_convert_e2k -+#define rgbn_gray_convert jsimd_rgb3_gray_convert_e2k -+#include 
"jccolext-e2k.c" -+ -+ -+#define PIXELSIZE 4 -+#define RGB_SHUFFLE \ -+ rgbg0 = _mm_shuffle_epi8(rgb0, pb_shuf0); \ -+ rgbg1 = _mm_shuffle_epi8(rgb1, pb_shuf0); \ -+ rgbg2 = _mm_shuffle_epi8(rgb2, pb_shuf0); \ -+ rgbg3 = _mm_shuffle_epi8(rgb3, pb_shuf0); -+ -+#define rgbn_ycc_convert jsimd_rgb4_ycc_convert_e2k -+#define rgbn_gray_convert jsimd_rgb4_gray_convert_e2k -+#include "jccolext-e2k.c" -+ -diff --git a/simd/e2k/jchuff-e2k.c b/simd/e2k/jchuff-e2k.c -new file mode 100644 -index 0000000..ec4329e ---- /dev/null -+++ b/simd/e2k/jchuff-e2k.c -@@ -0,0 +1,307 @@ -+/* -+ * Elbrus optimizations for libjpeg-turbo -+ * -+ * Copyright (C) 2022, Ilya Kurdyukov for BaseALT, Ltd -+ * -+ * This software is provided 'as-is', without any express or implied -+ * warranty. In no event will the authors be held liable for any damages -+ * arising from the use of this software. -+ * -+ * Permission is granted to anyone to use this software for any purpose, -+ * including commercial applications, and to alter it and redistribute it -+ * freely, subject to the following restrictions: -+ * -+ * 1. The origin of this software must not be misrepresented; you must not -+ * claim that you wrote the original software. If you use this software -+ * in a product, an acknowledgment in the product documentation would be -+ * appreciated but is not required. -+ * 2. Altered source versions must be plainly marked as such, and must not be -+ * misrepresented as being the original software. -+ * 3. This notice may not be removed or altered from any source distribution. -+ * -+ * NOTE: All referenced figures are from -+ * Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994. -+ */ -+ -+/* Encode a single block's worth of coefficients */ -+ -+#include "jsimd_e2k.h" -+ -+#if __SIZEOF_SIZE_T__ != 8 -+#error -+#endif -+ -+typedef unsigned long long bit_buf_type; -+#define BIT_BUF_SIZE 64 -+ -+typedef struct { -+ bit_buf_type put_buffer; /* current bit accumulation buffer */ -+ int free_bits; /* # of bits available in it */ -+ int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */ -+} savable_state; -+ -+typedef struct { -+ JOCTET *next_output_byte; /* => next byte to write in buffer */ -+ size_t free_in_buffer; /* # of byte spaces remaining in buffer */ -+ savable_state cur; /* Current bit buffer & DC state */ -+ j_compress_ptr cinfo; /* dump_buffer needs access to this */ -+ int simd; -+} working_state; -+ -+#define EMIT_BYTE(b) { \ -+ buffer[0] = (JOCTET)(b); \ -+ buffer[1] = 0; \ -+ buffer -= -2 + ((JOCTET)(b) < 0xFF); \ -+} -+ -+#define FLUSH() { \ -+ if (put_buffer & 0x8080808080808080 & ~(put_buffer + 0x0101010101010101)) { \ -+ EMIT_BYTE(put_buffer >> 56) \ -+ EMIT_BYTE(put_buffer >> 48) \ -+ EMIT_BYTE(put_buffer >> 40) \ -+ EMIT_BYTE(put_buffer >> 32) \ -+ EMIT_BYTE(put_buffer >> 24) \ -+ EMIT_BYTE(put_buffer >> 16) \ -+ EMIT_BYTE(put_buffer >> 8) \ -+ EMIT_BYTE(put_buffer ) \ -+ } else { \ -+ *(uint64_t*)buffer = __builtin_bswap64(put_buffer); \ -+ buffer += 8; \ -+ } \ -+} -+ -+#define PUT_AND_FLUSH(code, size) { \ -+ put_buffer = (put_buffer << (size + free_bits)) | (code >> -free_bits); \ -+ FLUSH() \ -+ free_bits += BIT_BUF_SIZE; \ -+ put_buffer = code; \ -+} -+ -+#define PUT_BITS(code, size) { \ -+ free_bits -= size; \ -+ if (free_bits < 0) \ -+ PUT_AND_FLUSH(code, size) \ -+ else \ -+ put_buffer = (put_buffer << size) | code; \ -+} -+ -+#define PUT_CODE(code, size) { \ -+ /* temp &= (((JLONG)1) << nbits) - 1; */ \ -+ /* temp |= code << nbits; */ \ -+ temp = __builtin_e2k_insfd(code, 
__builtin_e2k_insfd(nbits, 6 * 63 + 64, -nbits), temp); \ -+ nbits += size; \ -+ PUT_BITS(temp, nbits) \ -+} -+ -+#define KLOOP_PREPARE(mask, i) \ -+ t0 = _mm_cmpeq_epi8(_mm_packs_epi16(v0, v1), zero); \ -+ t1 = _mm_cmpeq_epi8(_mm_packs_epi16(v2, v3), zero); \ -+ mask = (uint32_t)(_mm_movemask_epi8(t0) | _mm_movemask_epi8(t1) << 16); \ -+ t0 = _mm_add_epi16(_mm_srai_epi16(v0, 15), v0); \ -+ t1 = _mm_add_epi16(_mm_srai_epi16(v1, 15), v1); \ -+ t2 = _mm_add_epi16(_mm_srai_epi16(v2, 15), v2); \ -+ t3 = _mm_add_epi16(_mm_srai_epi16(v3, 15), v3); \ -+ v0 = _mm_abs_epi16(v0); \ -+ v1 = _mm_abs_epi16(v1); \ -+ v2 = _mm_abs_epi16(v2); \ -+ v3 = _mm_abs_epi16(v3); \ -+ VEC_ST(block_nbits + i, v0); \ -+ VEC_ST(block_nbits + i + 8, v1); \ -+ VEC_ST(block_nbits + i + 16, v2); \ -+ VEC_ST(block_nbits + i + 24, v3); \ -+ VEC_ST(block_diff + i, t0); \ -+ VEC_ST(block_diff + i + 8, t1); \ -+ VEC_ST(block_diff + i + 16, t2); \ -+ VEC_ST(block_diff + i + 24, t3); -+ -+#define SHUF16X4(a, b, c, d) _mm_setr_pi8( \ -+ a * 2, a * 2 + 1, b * 2, b * 2 + 1, c * 2, c * 2 + 1, d * 2, d * 2 + 1) -+#define VEC_COMBINE(h0, h1) _mm_unpacklo_epi64( \ -+ _mm_movpi64_epi64(h0), _mm_movpi64_epi64(h1)) -+#define INSFI_M64(a, b, c, d) _mm_cvtsi64_m64(__builtin_e2k_insfd( \ -+ _mm_cvtm64_si64(a), (b & 63) | (d & 63) << 6, _mm_cvtm64_si64(c))) -+ -+GLOBAL(JOCTET *) -+jsimd_huff_encode_one_block_e2k(void *state, JOCTET *buffer, -+ JCOEFPTR block, int last_dc_val, -+ c_derived_tbl *dctbl, c_derived_tbl *actbl) { -+ uint64_t temp, nbits; -+ uint64_t i, r, code, size; -+ uint64_t code_0xf0 = actbl->ehufco[0xf0]; -+ uint64_t size_0xf0 = actbl->ehufsi[0xf0]; -+ -+ working_state *state_ptr = (working_state*)state; -+ bit_buf_type put_buffer = state_ptr->cur.put_buffer; -+ int64_t free_bits = state_ptr->cur.free_bits; -+ -+ __m128i zero = _mm_setzero_si128(); -+ __m128i v0, v1, v2, v3, t0, t1, t2, t3; -+ int64_t mask, mask1; -+ uint16_t __attribute__((aligned(16))) block_nbits[DCTSIZE2]; -+ int16_t __attribute__((aligned(16))) block_diff[DCTSIZE2]; -+ -+#if 1 /* faster this way */ -+ { -+ __m64 d0l, d0h, d1l, d1h, d2l, d2h, d3l, d3h; -+ __m64 d4l, d4h, d5l, d5h, d6l, d6h, d7l, d7h; -+ __m64 h0, h1, h2, h3, r0, r1, c1256 = SHUF16X4(1, 2, 5, 6); -+ -+ d0l = *(__m64*)(block + 8 * 0); d0h = *(__m64*)(block + 8 * 0 + 4); // 0 4 -+ d1l = *(__m64*)(block + 8 * 1); d1h = *(__m64*)(block + 8 * 1 + 4); // 8 12 -+ d2l = *(__m64*)(block + 8 * 2); d2h = *(__m64*)(block + 8 * 2 + 4); // 16 20 -+ d3l = *(__m64*)(block + 8 * 3); d3h = *(__m64*)(block + 8 * 3 + 4); // 24 28 -+ d4l = *(__m64*)(block + 8 * 4); d4h = *(__m64*)(block + 8 * 4 + 4); // 32 36 -+ d5l = *(__m64*)(block + 8 * 5); d5h = *(__m64*)(block + 8 * 5 + 4); // 40 44 -+ d6l = *(__m64*)(block + 8 * 6); d6h = *(__m64*)(block + 8 * 6 + 4); // 48 52 -+ d7l = *(__m64*)(block + 8 * 7); d7h = *(__m64*)(block + 8 * 7 + 4); // 56 60 -+ -+ // d0l[0] d0l[1] d1l[0] d2l[0] -+ // d1l[1] d0l[2] d0l[3] d1l[2] -+ h0 = _mm_unpacklo_pi16(d1l, d2l); -+ r0 = _mm_unpacklo_pi32(d0l, h0); -+ r1 = _mm_shuffle2_pi8(d1l, d0l, SHUF16X4(1, 6, 7, 2)); -+ r0 = _mm_sub_pi16(r0, _mm_cvtsi64_m64((uint16_t)last_dc_val)); -+ v0 = VEC_COMBINE(r0, r1); -+ -+ // d2l[1] d3l[0] d4l[0] d3l[1] -+ // d2l[2] d1l[3] d0h[0] d0h[1] -+ h0 = _mm_srli_si64(_mm_unpacklo_pi32(d2l, d4l), 16); -+ h2 = INSFI_M64(d1l, 0, d2l, 48); -+ r0 = _mm_unpacklo_pi16(h0, d3l); -+ r1 = _mm_alignr_pi8(d0h, h2, 4); -+ v1 = VEC_COMBINE(r0, r1); -+ -+ // d1h[0] d2l[3] d3l[2] d4l[1] -+ // d5l[0] d6l[0] d5l[1] d4l[2] -+ h0 = INSFI_M64(d2l, 32, d1h, 16); -+ h1 = 
INSFI_M64(d4l, -32, d3l, 48); -+ h2 = INSFI_M64(d4l, 16, d6l, 16); -+ r0 = INSFI_M64(h1, 0, h0, 32); -+ r1 = _mm_unpacklo_pi16(d5l, h2); -+ v2 = VEC_COMBINE(r0, r1); -+ -+ // d3l[3] d2h[0] d1h[1] d0h[2] -+ // d0h[3] d1h[2] d2h[1] d3h[0] -+ h0 = _mm_alignr_pi8(d2h, d3l, 6); -+ h1 = INSFI_M64(d0h, 0, d1h, 32); -+ h2 = _mm_unpackhi_pi32(d0h, d1h); -+ h3 = _mm_unpacklo_pi32(d2h, d3h); -+ r0 = INSFI_M64(h1, -16, h0, 32); -+ r1 = _mm_shuffle2_pi8(h2, h3, c1256); -+ v3 = VEC_COMBINE(r0, r1); -+ -+ KLOOP_PREPARE(mask, 0) -+ -+ // d4l[3] d5l[2] d6l[1] d7l[0] -+ // d7l[1] d6l[2] d5l[3] d4h[0] -+ h0 = _mm_unpackhi_pi32(d4l, d5l); -+ h1 = _mm_unpacklo_pi32(d6l, d7l); -+ h2 = INSFI_M64(d6l, 0, d7l, 32); -+ h2 = INSFI_M64(d5l, 0, h2, 48); -+ r0 = _mm_shuffle2_pi8(h0, h1, c1256); -+ r1 = _mm_alignr_pi8(d4h, h2, 2); -+ v0 = VEC_COMBINE(r0, r1); -+ -+ // d3h[1] d2h[2] d1h[3] d2h[3] -+ // d3h[2] d4h[1] d5h[0] d6l[3] -+ h0 = _mm_slli_si64(INSFI_M64(d1h, 16, d3h, 32), 16); -+ h2 = INSFI_M64(d4h, -32, d3h, 48); -+ h3 = INSFI_M64(d6l, 32, d5h, 16); -+ r0 = _mm_unpackhi_pi16(h0, d2h); -+ r1 = _mm_alignr_pi8(h3, h2, 4); -+ v1 = VEC_COMBINE(r0, r1); -+ -+ // d7l[2] d7l[3] d6h[0] d5h[1] -+ // d4h[2] d3h[3] d4h[3] d5h[2] -+ h0 = INSFI_M64(d5h, 0, d6h, 16); -+ h2 = _mm_slli_si64(_mm_unpackhi_pi32(d3h, d5h), 16); -+ r0 = _mm_alignr_pi8(h0, d7l, 4); -+ r1 = _mm_unpackhi_pi16(d4h, h2); -+ v2 = VEC_COMBINE(r0, r1); -+ -+ // d6h[1] d7h[0] d7h[1] d6h[2] -+ // d5h[3] d6h[3] d7h[2] d7h[3] -+ h0 = INSFI_M64(d6h, -16, d7h, 32); -+ h2 = _mm_unpackhi_pi16(d5h, d6h); -+ r0 = _mm_shuffle_pi16(h0, 0xd2); -+ r1 = _mm_unpackhi_pi32(h2, d7h); -+ v3 = VEC_COMBINE(r0, r1); -+ } -+#else -+ v0 = _mm_setr_epi16( -+ block[0] - last_dc_val, block[1], block[8], block[16], -+ block[9], block[2], block[3], block[10]); -+ v1 = _mm_setr_epi16( -+ block[17], block[24], block[32], block[25], -+ block[18], block[11], block[4], block[5]); -+ v2 = _mm_setr_epi16( -+ block[12], block[19], block[26], block[33], -+ block[40], block[48], block[41], block[34]); -+ v3 = _mm_setr_epi16( -+ block[27], block[20], block[13], block[6], -+ block[7], block[14], block[21], block[28]); -+ -+ KLOOP_PREPARE(mask, 0) -+ -+ v0 = _mm_setr_epi16( -+ block[35], block[42], block[49], block[56], -+ block[57], block[50], block[43], block[36]); -+ v1 = _mm_setr_epi16( -+ block[29], block[22], block[15], block[23], -+ block[30], block[37], block[44], block[51]); -+ v2 = _mm_setr_epi16( -+ block[58], block[59], block[52], block[45], -+ block[38], block[31], block[39], block[46]); -+ v3 = _mm_setr_epi16( -+ block[53], block[60], block[61], block[54], -+ block[47], block[55], block[62], block[63]); -+#endif -+ -+ KLOOP_PREPARE(mask1, 32) -+ mask |= mask1 << 32; -+ mask = ~mask; -+ -+ /* Encode the DC coefficient difference per section F.1.2.1 */ -+ -+ nbits = block_nbits[0]; -+ temp = block_diff[0]; -+ nbits = nbits ? 
32 - __builtin_clz(nbits) : 0; -+ -+ /* Emit the Huffman-coded symbol for the number of bits */ -+ code = dctbl->ehufco[nbits]; -+ size = dctbl->ehufsi[nbits]; -+ PUT_CODE(code, size) -+ -+ /* Encode the AC coefficients per section F.1.2.2 */ -+ -+ /* e2k doesn't have a tzcnt instruction */ -+ mask = __builtin_e2k_bitrevd(mask) << 1; -+ -+ for (i = 1; mask; i++, mask <<= 1) { -+ r = __builtin_clzll(mask); -+ mask <<= r; -+ i += r; -+ /* if run length > 15, must emit special run-length-16 codes (0xF0) */ -+ while (r > 15) { -+ PUT_BITS(code_0xf0, size_0xf0) -+ r -= 16; -+ } -+ nbits = block_nbits[i]; -+ temp = block_diff[i]; -+ nbits = 32 - __builtin_clz(nbits); -+ /* Emit Huffman symbol for run length / number of bits */ -+ /* r = r << 4 | nbits; */ -+ r = __builtin_e2k_insfd(r, 4 * 63 + 64, nbits); -+ code = actbl->ehufco[r]; -+ size = actbl->ehufsi[r]; -+ PUT_CODE(code, size) -+ } -+ -+ if (i != 64) { -+ PUT_BITS(actbl->ehufco[0], actbl->ehufsi[0]) -+ } -+ -+ state_ptr->cur.put_buffer = put_buffer; -+ state_ptr->cur.free_bits = free_bits; -+ return buffer; -+} -diff --git a/simd/e2k/jcphuff-e2k.c b/simd/e2k/jcphuff-e2k.c -new file mode 100644 -index 0000000..f69afeb ---- /dev/null -+++ b/simd/e2k/jcphuff-e2k.c -@@ -0,0 +1,145 @@ -+/* -+ * Elbrus optimizations for libjpeg-turbo -+ * -+ * Copyright (C) 2022, Ilya Kurdyukov for BaseALT, Ltd -+ * -+ * This software is provided 'as-is', without any express or implied -+ * warranty. In no event will the authors be held liable for any damages -+ * arising from the use of this software. -+ * -+ * Permission is granted to anyone to use this software for any purpose, -+ * including commercial applications, and to alter it and redistribute it -+ * freely, subject to the following restrictions: -+ * -+ * 1. The origin of this software must not be misrepresented; you must not -+ * claim that you wrote the original software. If you use this software -+ * in a product, an acknowledgment in the product documentation would be -+ * appreciated but is not required. -+ * 2. Altered source versions must be plainly marked as such, and must not be -+ * misrepresented as being the original software. -+ * 3. This notice may not be removed or altered from any source distribution. -+ */ -+ -+#include "jsimd_e2k.h" -+ -+#define X(i) coefs[i] = block[jpeg_natural_order_start[i]]; -+#define Y(i) coefs[i] = i < rem ? 
block[jpeg_natural_order_start[i]] : 0; -+ -+#define LOOP \ -+ for (i = 0; i < Sl >> 4; i++) { \ -+ X(0) X(1) X(2) X(3) X(4) X(5) X(6) X(7) \ -+ X(8) X(9) X(10) X(11) X(12) X(13) X(14) X(15) \ -+ BLOCK16 \ -+ jpeg_natural_order_start += 16; \ -+ } \ -+ rem = Sl & 15; \ -+ if (Sl & 8) { \ -+ X(0) X(1) X(2) X(3) X(4) X(5) X(6) X(7) \ -+ Y(8) Y(9) Y(10) Y(11) Y(12) Y(13) Y(14) \ -+ coefs[15] = 0; \ -+ BLOCK16 \ -+ } else if (rem > 0) { \ -+ Y(0) Y(1) Y(2) Y(3) Y(4) Y(5) Y(6) Y(7) \ -+ BLOCK8 \ -+ } -+ -+void jsimd_encode_mcu_AC_first_prepare_e2k -+ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al, -+ JCOEF *values, size_t *zerobits) -+{ -+ JCOEF *diff = values + DCTSIZE2; -+ int16_t __attribute__((aligned(16))) coefs[16]; -+ __m128i v0, v1, v2, v3; -+ __m128i c0 = _mm_setzero_si128(), shr = _mm_cvtsi32_si128(Al); -+ int i, rem; -+ -+#define BLOCK16 \ -+ v0 = _mm_load_si128((__m128i*)coefs); \ -+ v1 = _mm_load_si128((__m128i*)coefs + 1); \ -+ v2 = _mm_srai_epi16(v0, 15); \ -+ v3 = _mm_srai_epi16(v1, 15); \ -+ v0 = _mm_sra_epi16(_mm_abs_epi16(v0), shr); \ -+ v1 = _mm_sra_epi16(_mm_abs_epi16(v1), shr); \ -+ v2 = _mm_xor_si128(v0, v2); \ -+ v3 = _mm_xor_si128(v1, v3); \ -+ _mm_store_si128((__m128i*)values, v0); \ -+ _mm_store_si128((__m128i*)values + 1, v1); \ -+ _mm_store_si128((__m128i*)diff, v2); \ -+ _mm_store_si128((__m128i*)diff + 1, v3); \ -+ v2 = _mm_cmpeq_epi8(_mm_packus_epi16(v0, v1), c0); \ -+ ((uint16_t*)zerobits)[i] = ~_mm_movemask_epi8(v2); \ -+ values += 16; diff += 16; -+ -+#define BLOCK8 \ -+ v0 = _mm_load_si128((__m128i*)coefs); \ -+ v2 = _mm_srai_epi16(v0, 15); \ -+ v0 = _mm_sra_epi16(_mm_abs_epi16(v0), shr); \ -+ v2 = _mm_xor_si128(v0, v2); \ -+ _mm_store_si128((__m128i*)values, v0); \ -+ _mm_store_si128((__m128i*)diff, v2); \ -+ v2 = _mm_cmpeq_epi8(_mm_packus_epi16(v0, c0), c0); \ -+ ((uint16_t*)zerobits)[i] = ~_mm_movemask_epi8(v2); \ -+ values += 8; diff += 8; -+ -+ ((uint64_t*)zerobits)[0] = 0; -+ LOOP -+#undef BLOCK16 -+#undef BLOCK8 -+ -+ for (i = (64 - Sl) >> 3; i; i--) { -+ _mm_store_si128((__m128i*)values, c0); -+ _mm_store_si128((__m128i*)diff, c0); -+ values += 8; diff += 8; -+ } -+} -+ -+int jsimd_encode_mcu_AC_refine_prepare_e2k -+ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al, -+ JCOEF *absvalues, size_t *bits) -+{ -+ union { uint64_t q; uint16_t w[4]; } mask1 = { 0 }; -+ int16_t __attribute__((aligned(16))) coefs[16]; -+ __m128i v0, v1, v2, c1 = _mm_set1_epi8(1); -+ __m128i c0 = _mm_setzero_si128(), shr = _mm_cvtsi32_si128(Al); -+ int i, rem; -+ -+#define BLOCK16 \ -+ v0 = _mm_load_si128((__m128i*)coefs); \ -+ v1 = _mm_load_si128((__m128i*)coefs + 1); \ -+ v2 = _mm_packs_epi16(v0, v1); \ -+ ((uint16_t*)bits)[4 + i] = ~_mm_movemask_epi8(v2); \ -+ v0 = _mm_sra_epi16(_mm_abs_epi16(v0), shr); \ -+ v1 = _mm_sra_epi16(_mm_abs_epi16(v1), shr); \ -+ v2 = _mm_cmpeq_epi8(_mm_packus_epi16(v0, v1), c0); \ -+ ((uint16_t*)bits)[i] = ~_mm_movemask_epi8(v2); \ -+ _mm_store_si128((__m128i*)absvalues, v0); \ -+ _mm_store_si128((__m128i*)absvalues + 1, v1); \ -+ v2 = _mm_cmpeq_epi8(_mm_packus_epi16(v0, v1), c1); \ -+ mask1.w[i] = _mm_movemask_epi8(v2); \ -+ absvalues += 16; -+ -+#define BLOCK8 \ -+ v0 = _mm_load_si128((__m128i*)coefs); \ -+ v2 = _mm_packs_epi16(v0, c0); \ -+ ((uint16_t*)bits)[4 + i] = ~_mm_movemask_epi8(v2); \ -+ v0 = _mm_sra_epi16(_mm_abs_epi16(v0), shr); \ -+ v2 = _mm_cmpeq_epi8(_mm_packus_epi16(v0, c0), c0); \ -+ ((uint16_t*)bits)[i] = ~_mm_movemask_epi8(v2); \ -+ _mm_store_si128((__m128i*)absvalues, v0); \ -+ 
v2 = _mm_cmpeq_epi8(_mm_packus_epi16(v0, c0), c1); \ -+ mask1.w[i] = _mm_movemask_epi8(v2); \ -+ absvalues += 8; -+ -+ ((uint64_t*)bits)[0] = 0; /* zero */ -+ ((uint64_t*)bits)[1] = 0; /* sign */ -+ LOOP -+#undef BLOCK16 -+#undef BLOCK8 -+ -+ for (i = (64 - Sl) >> 3; i; i--) { -+ _mm_store_si128((__m128i*)absvalues, c0); -+ absvalues += 8; -+ } -+ -+ return 63 - __builtin_clzll(mask1.q | 1); -+} -diff --git a/simd/e2k/jcsample-e2k.c b/simd/e2k/jcsample-e2k.c -new file mode 100644 -index 0000000..cac8897 ---- /dev/null -+++ b/simd/e2k/jcsample-e2k.c -@@ -0,0 +1,203 @@ -+/* -+ * Elbrus optimizations for libjpeg-turbo -+ * -+ * Copyright (C) 2015, D. R. Commander. All Rights Reserved. -+ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd -+ * -+ * This software is provided 'as-is', without any express or implied -+ * warranty. In no event will the authors be held liable for any damages -+ * arising from the use of this software. -+ * -+ * Permission is granted to anyone to use this software for any purpose, -+ * including commercial applications, and to alter it and redistribute it -+ * freely, subject to the following restrictions: -+ * -+ * 1. The origin of this software must not be misrepresented; you must not -+ * claim that you wrote the original software. If you use this software -+ * in a product, an acknowledgment in the product documentation would be -+ * appreciated but is not required. -+ * 2. Altered source versions must be plainly marked as such, and must not be -+ * misrepresented as being the original software. -+ * 3. This notice may not be removed or altered from any source distribution. -+ */ -+ -+/* CHROMA DOWNSAMPLING */ -+ -+#include "jsimd_e2k.h" -+#include "jcsample.h" -+ -+void jsimd_h2v1_downsample_e2k(JDIMENSION image_width, -+ int max_v_samp_factor, -+ JDIMENSION v_samp_factor, -+ JDIMENSION width_in_blocks, -+ JSAMPARRAY input_data, -+ JSAMPARRAY output_data) -+{ -+ int outcol; -+ JDIMENSION output_cols = width_in_blocks * DCTSIZE, outrow; -+ JSAMPROW inptr, outptr; -+ -+ __m128i this0, next0, out; -+ __m128i this0e, this0o, next0e, next0o, outl, outh; -+ -+ /* Constants */ -+ __m128i pw_bias = _mm_set1_epi32(1 << 16), -+ even_mask = _mm_set1_epi16(255); -+#ifdef NEED_ALIGN8 -+ ALIGN8_COMMON -+ ALIGN8_VARS(src) -+#endif -+ -+ expand_right_edge(input_data, max_v_samp_factor, -+ image_width, output_cols * 2); -+ -+ if (output_cols > 0) -+ for (outrow = 0; outrow < v_samp_factor; outrow++) { -+ outptr = output_data[outrow]; -+ inptr = input_data[outrow]; -+ -+#ifdef NEED_ALIGN8 -+ ALIGN8_START(inptr, src) -+#endif -+ PRAGMA_E2K("ivdep") -+ for (outcol = output_cols; outcol > 8; -+ outcol -= 16, outptr += 16) { -+ -+#ifdef NEED_ALIGN8 -+ ALIGN8_READ16(this0, src, 0) -+ ALIGN8_READ16(next0, src, 1) -+ src_ptr += 4; -+#else -+ this0 = VEC_LD(inptr); -+ next0 = VEC_LD(inptr + 16); -+ inptr += 32; -+#endif -+ this0e = _mm_and_si128(this0, even_mask); -+ this0o = _mm_srli_epi16(this0, 8); -+ outl = _mm_add_epi16(this0e, this0o); -+ outl = _mm_srli_epi16(_mm_add_epi16(outl, pw_bias), 1); -+ next0e = _mm_and_si128(next0, even_mask); -+ next0o = _mm_srli_epi16(next0, 8); -+ outh = _mm_add_epi16(next0e, next0o); -+ outh = _mm_srli_epi16(_mm_add_epi16(outh, pw_bias), 1); -+ -+ out = _mm_packus_epi16(outl, outh); -+ VEC_ST(outptr, out); -+ } -+ if (outcol > 0) { -+#ifdef NEED_ALIGN8 -+ ALIGN8_READ16(this0, src, 0) -+#else -+ this0 = VEC_LD(inptr); -+#endif -+ this0e = _mm_and_si128(this0, even_mask); -+ this0o = _mm_srli_epi16(this0, 8); -+ outl = _mm_add_epi16(this0e, 
this0o); -+ outl = _mm_srli_epi16(_mm_add_epi16(outl, pw_bias), 1); -+ -+ out = _mm_packus_epi16(outl, outl); -+ VEC_STL(outptr, out); -+ } -+ } -+} -+ -+ -+void jsimd_h2v2_downsample_e2k(JDIMENSION image_width, int max_v_samp_factor, -+ JDIMENSION v_samp_factor, -+ JDIMENSION width_in_blocks, -+ JSAMPARRAY input_data, JSAMPARRAY output_data) -+{ -+ int outcol; -+ JDIMENSION output_cols = width_in_blocks * DCTSIZE, outrow; -+ JSAMPROW inptr0, inptr1, outptr; -+ -+ __m128i this0, next0, this1, next1, out; -+ __m128i this0e, this0o, next0e, next0o, this1e, this1o, -+ next1e, next1o, out0l, out0h, out1l, out1h, outl, outh; -+ -+ /* Constants */ -+ __m128i pw_bias = _mm_set1_epi32(1 | 2 << 16), -+ even_mask = _mm_set1_epi16(255); -+#ifdef NEED_ALIGN8 -+ ALIGN8_COMMON -+ ALIGN8_VARS(src0) -+ ALIGN8_VARS(src1) -+#endif -+ -+ expand_right_edge(input_data, max_v_samp_factor, -+ image_width, output_cols * 2); -+ -+ if (output_cols > 0) -+ for (outrow = 0; outrow < v_samp_factor; outrow++) { -+ inptr0 = input_data[outrow * 2]; -+ inptr1 = input_data[outrow * 2 + 1]; -+ outptr = output_data[outrow]; -+ -+#ifdef NEED_ALIGN8 -+ ALIGN8_START(inptr0, src0) -+ ALIGN8_START(inptr1, src1) -+#endif -+ PRAGMA_E2K("ivdep") -+ for (outcol = output_cols; outcol > 8; -+ outcol -= 16, outptr += 16) { -+ -+#ifdef NEED_ALIGN8 -+ ALIGN8_READ16(this0, src0, 0) src0_ptr += 2; -+ ALIGN8_READ16(this1, src1, 0) src1_ptr += 2; -+#else -+ this0 = VEC_LD(inptr0); inptr0 += 16; -+ this1 = VEC_LD(inptr1); inptr1 += 16; -+#endif -+ this0e = _mm_and_si128(this0, even_mask); -+ this1e = _mm_and_si128(this1, even_mask); -+ this0o = _mm_srli_epi16(this0, 8); -+ this1o = _mm_srli_epi16(this1, 8); -+ out0l = _mm_add_epi16(this0e, this0o); -+ out1l = _mm_add_epi16(this1e, this1o); -+ -+ outl = _mm_add_epi16(out0l, out1l); -+ outl = _mm_srli_epi16(_mm_add_epi16(outl, pw_bias), 2); -+ -+#ifdef NEED_ALIGN8 -+ ALIGN8_READ16(next0, src0, 0) src0_ptr += 2; -+ ALIGN8_READ16(next1, src1, 0) src1_ptr += 2; -+#else -+ next0 = VEC_LD(inptr0); inptr0 += 16; -+ next1 = VEC_LD(inptr1); inptr1 += 16; -+#endif -+ next0e = _mm_and_si128(next0, even_mask); -+ next1e = _mm_and_si128(next1, even_mask); -+ next0o = _mm_srli_epi16(next0, 8); -+ next1o = _mm_srli_epi16(next1, 8); -+ out0h = _mm_add_epi16(next0e, next0o); -+ out1h = _mm_add_epi16(next1e, next1o); -+ -+ outh = _mm_add_epi16(out0h, out1h); -+ outh = _mm_srli_epi16(_mm_add_epi16(outh, pw_bias), 2); -+ -+ out = _mm_packus_epi16(outl, outh); -+ VEC_ST(outptr, out); -+ } -+ if (outcol > 0) { -+#ifdef NEED_ALIGN8 -+ ALIGN8_READ16(this0, src0, 0) -+ ALIGN8_READ16(this1, src1, 0) -+#else -+ this0 = VEC_LD(inptr0); -+ this1 = VEC_LD(inptr1); -+#endif -+ this0e = _mm_and_si128(this0, even_mask); -+ this1e = _mm_and_si128(this1, even_mask); -+ this0o = _mm_srli_epi16(this0, 8); -+ this1o = _mm_srli_epi16(this1, 8); -+ out0l = _mm_add_epi16(this0e, this0o); -+ out1l = _mm_add_epi16(this1e, this1o); -+ -+ outl = _mm_add_epi16(out0l, out1l); -+ outl = _mm_srli_epi16(_mm_add_epi16(outl, pw_bias), 2); -+ -+ out = _mm_packus_epi16(outl, outl); -+ VEC_STL(outptr, out); -+ } -+ } -+} -diff --git a/simd/e2k/jcsample.h b/simd/e2k/jcsample.h -new file mode 100644 -index 0000000..2ac4816 ---- /dev/null -+++ b/simd/e2k/jcsample.h -@@ -0,0 +1,28 @@ -+/* -+ * jcsample.h -+ * -+ * This file was part of the Independent JPEG Group's software: -+ * Copyright (C) 1991-1996, Thomas G. Lane. -+ * For conditions of distribution and use, see the accompanying README.ijg -+ * file. 
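-+ *
-+ * NOTE: expand_right_edge() below pads each row by replicating its last
-+ * column, so the vectorized downsamplers can read a full 16-byte block
-+ * past the nominal image width without hitting uninitialized samples.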
-+ */ -+ -+LOCAL(void) -+expand_right_edge(JSAMPARRAY image_data, int num_rows, JDIMENSION input_cols, -+ JDIMENSION output_cols) -+{ -+ register JSAMPROW ptr; -+ register JSAMPLE pixval; -+ register int count; -+ int row; -+ int numcols = (int)(output_cols - input_cols); -+ -+ if (numcols > 0) { -+ for (row = 0; row < num_rows; row++) { -+ ptr = image_data[row] + input_cols; -+ pixval = ptr[-1]; /* don't need GETJSAMPLE() here */ -+ for (count = numcols; count > 0; count--) -+ *ptr++ = pixval; -+ } -+ } -+} -diff --git a/simd/e2k/jdcolext-e2k.c b/simd/e2k/jdcolext-e2k.c -new file mode 100644 -index 0000000..4f12aef ---- /dev/null -+++ b/simd/e2k/jdcolext-e2k.c -@@ -0,0 +1,258 @@ -+/* -+ * Elbrus optimizations for libjpeg-turbo -+ * -+ * Copyright (C) 2015, D. R. Commander. All Rights Reserved. -+ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd -+ * -+ * This software is provided 'as-is', without any express or implied -+ * warranty. In no event will the authors be held liable for any damages -+ * arising from the use of this software. -+ * -+ * Permission is granted to anyone to use this software for any purpose, -+ * including commercial applications, and to alter it and redistribute it -+ * freely, subject to the following restrictions: -+ * -+ * 1. The origin of this software must not be misrepresented; you must not -+ * claim that you wrote the original software. If you use this software -+ * in a product, an acknowledgment in the product documentation would be -+ * appreciated but is not required. -+ * 2. Altered source versions must be plainly marked as such, and must not be -+ * misrepresented as being the original software. -+ * 3. This notice may not be removed or altered from any source distribution. -+ */ -+ -+/* This file is included by jdcolor-e2k.c */ -+ -+void ycc_rgbn_convert(JDIMENSION out_width, JSAMPIMAGE input_buf, -+ JDIMENSION input_row, JSAMPARRAY output_buf, -+ int num_rows, int shuf_idx) -+{ -+ JSAMPROW outptr, inptr0, inptr1, inptr2; -+ uint8_t __attribute__((aligned(16))) tmpbuf[PIXELSIZE * 16]; -+ -+ __m128i rgb0, rgb1, rgb2, rgb3, y, cb, cr; -+ __m128i rg0, rg1, bx0, bx1, yl, yh, cbl, cbh, -+ crl, crh, rl, rh, gl, gh, bl, bh, g0w, g1w, g2w, g3w; -+ __m128i g0, g1, g2, g3; -+ -+ /* Constants -+ * NOTE: The >> 1 is to compensate for the fact that vec_madds() returns 17 -+ * high-order bits, not 16. 
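-+ * Here the scaling comes from _mm_mulhrs_epi16(), which computes
-+ * (a * b + 0x4000) >> 15; pre-shifting the Q16 FIX() constants right by
-+ * one bit turns them into the Q15 multipliers this intrinsic expects.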
-+ */ -+ __m128i pw_f0402 = _mm_set1_epi16(F_0_402 >> 1), -+ pw_mf0228 = _mm_set1_epi16(-F_0_228 >> 1), -+ pw_mf0344_f0285 = _mm_setr_epi16(__4X2(-F_0_344, F_0_285)), -+ pb_255 = _mm_set1_epi8(-1), -+ pw_cj = _mm_set1_epi16(CENTERJSAMPLE), -+ pd_onehalf = _mm_set1_epi32(ONE_HALF), -+ pb_zero = _mm_setzero_si128(); -+ RGB_SHUFFLE_INIT -+#ifdef NEED_ALIGN8 -+ ALIGN8_COMMON -+ ALIGN8_VARS(src0) -+ ALIGN8_VARS(src1) -+ ALIGN8_VARS(src2) -+#endif -+ -+ if (out_width > 0) -+ while (--num_rows >= 0) { -+ int num_cols; -+ inptr0 = input_buf[0][input_row]; -+ inptr1 = input_buf[1][input_row]; -+ inptr2 = input_buf[2][input_row]; -+ input_row++; -+ outptr = *output_buf++; -+ -+ if (out_width >= 16) { -+#ifdef NEED_ALIGN8 -+ ALIGN8_START(inptr0, src0) -+ ALIGN8_START(inptr1, src1) -+ ALIGN8_START(inptr2, src2) -+ inptr0 += out_width & -16; -+ inptr1 += out_width & -16; -+ inptr2 += out_width & -16; -+#endif -+ PRAGMA_E2K("ivdep") -+ for (num_cols = out_width; num_cols >= 16; -+ num_cols -= 16, outptr += PIXELSIZE * 16) { -+#ifdef NEED_ALIGN8 -+ ALIGN8_READ16(y, src0, 0) src0_ptr += 2; -+ ALIGN8_READ16(cb, src1, 0) src1_ptr += 2; -+ ALIGN8_READ16(cr, src2, 0) src2_ptr += 2; -+#else -+ y = VEC_LD(inptr0); inptr0 += 16; -+ cb = VEC_LD(inptr1); inptr1 += 16; -+ cr = VEC_LD(inptr2); inptr2 += 16; -+#endif -+ CALC_RGB -+ RGB_SHUFFLE -+ VEC_ST(outptr, rgb0); -+ VEC_ST(outptr + 16, rgb1); -+ VEC_ST(outptr + 32, rgb2); -+#if PIXELSIZE == 4 -+ VEC_ST(outptr + 48, rgb3); -+#endif -+ } -+ } -+ -+ num_cols = out_width & 15; -+ if (num_cols) { -+ int i; -+ for (i = 0; i < num_cols; i++) { -+ tmpbuf[i] = inptr0[i]; -+ tmpbuf[i + 16] = inptr1[i]; -+ tmpbuf[i + 32] = inptr2[i]; -+ } -+ y = VEC_LD(tmpbuf); -+ cb = VEC_LD(tmpbuf + 16); -+ cr = VEC_LD(tmpbuf + 32); -+ CALC_RGB -+ RGB_SHUFFLE -+ VEC_ST(tmpbuf, rgb0); -+ VEC_ST(tmpbuf + 16, rgb1); -+ VEC_ST(tmpbuf + 32, rgb2); -+#if PIXELSIZE == 4 -+ VEC_ST(tmpbuf + 48, rgb3); -+#endif -+ memcpy(outptr, tmpbuf, num_cols * PIXELSIZE); -+ } -+ } -+} -+ -+void ycc_rgbn_merged(JDIMENSION out_width, JSAMPIMAGE input_buf, -+ JDIMENSION in_row_group_ctr, -+ JDIMENSION in_row_group_ctr_y, -+ JSAMPARRAY output_buf, int shuf_idx) -+{ -+ JSAMPROW outptr, inptr0, inptr1, inptr2; -+ int num_cols; -+ uint8_t __attribute__((aligned(16))) tmpbuf[4 * 16]; -+ -+ __m128i rgb0, rgb1, rgb2, rgb3, y, cb, cr; -+ __m128i rg0, rg1, bx0, bx1, yl, yh, cbl, cbh, -+ crl, crh, r_yl, r_yh, g_yl, g_yh, b_yl, b_yh, g_y0w, g_y1w, g_y2w, g_y3w, -+ rl, rh, gl, gh, bl, bh; -+ __m128i g_y0, g_y1, g_y2, g_y3; -+ -+ /* Constants -+ * NOTE: The >> 1 is to compensate for the fact that vec_madds() returns 17 -+ * high-order bits, not 16. 
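-+ *
-+ * The merged path evaluates the chroma terms once per Cb/Cr vector
-+ * (CALC_MERGED1) and reuses them for two adjacent luma vectors
-+ * (CALC_MERGED2), widening each chroma word to a pixel pair with
-+ * _mm_unpacklo/hi_epi16.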
-+ */ -+ __m128i pw_f0402 = _mm_set1_epi16(F_0_402 >> 1), -+ pw_mf0228 = _mm_set1_epi16(-F_0_228 >> 1), -+ pw_mf0344_f0285 = _mm_setr_epi16(__4X2(-F_0_344, F_0_285)), -+ pb_255 = _mm_set1_epi8(-1), -+ pw_cj = _mm_set1_epi16(CENTERJSAMPLE), -+ pd_onehalf = _mm_set1_epi32(ONE_HALF), -+ pb_zero = _mm_setzero_si128(); -+ RGB_SHUFFLE_INIT -+#ifdef NEED_ALIGN8 -+ ALIGN8_COMMON -+ ALIGN8_VARS(src0) -+ ALIGN8_VARS(src1) -+ ALIGN8_VARS(src2) -+#endif -+ -+ inptr0 = input_buf[0][in_row_group_ctr_y]; -+ inptr1 = input_buf[1][in_row_group_ctr]; -+ inptr2 = input_buf[2][in_row_group_ctr]; -+ outptr = output_buf[0]; -+ -+ if (out_width >= 32) { -+#ifdef NEED_ALIGN8 -+ ALIGN8_START(inptr0, src0) -+ ALIGN8_START(inptr1, src1) -+ ALIGN8_START(inptr2, src2) -+ inptr0 += out_width & -32; -+ inptr1 += (out_width & -32) >> 1; -+ inptr2 += (out_width & -32) >> 1; -+#endif -+ PRAGMA_E2K("ivdep") -+ for (num_cols = out_width; num_cols >= 32; num_cols -= 32) { -+ -+#ifdef NEED_ALIGN8 -+ ALIGN8_READ16(cb, src1, 0) src1_ptr += 2; -+ ALIGN8_READ16(cr, src2, 0) src2_ptr += 2; -+#else -+ cb = VEC_LD(inptr1); inptr1 += 16; -+ cr = VEC_LD(inptr2); inptr2 += 16; -+#endif -+ CALC_MERGED1 -+ -+#ifdef NEED_ALIGN8 -+ ALIGN8_READ16(y, src0, 0) src0_ptr += 2; -+#else -+ y = VEC_LD(inptr0); inptr0 += 16; -+#endif -+ CALC_MERGED2(r_yl, g_yl, b_yl) -+ RGB_SHUFFLE -+ VEC_ST(outptr, rgb0); -+ VEC_ST(outptr + 16, rgb1); -+ VEC_ST(outptr + 32, rgb2); -+#if PIXELSIZE == 4 -+ VEC_ST(outptr + 48, rgb3); -+#endif -+ outptr += PIXELSIZE * 16; -+ -+#ifdef NEED_ALIGN8 -+ ALIGN8_READ16(y, src0, 0) src0_ptr += 2; -+#else -+ y = VEC_LD(inptr0); inptr0 += 16; -+#endif -+ CALC_MERGED2(r_yh, g_yh, b_yh) -+ RGB_SHUFFLE -+ VEC_ST(outptr, rgb0); -+ VEC_ST(outptr + 16, rgb1); -+ VEC_ST(outptr + 32, rgb2); -+#if PIXELSIZE == 4 -+ VEC_ST(outptr + 48, rgb3); -+#endif -+ outptr += PIXELSIZE * 16; -+ } -+ } -+ -+ num_cols = out_width & 31; -+ if (num_cols) { -+ int i; -+ for (i = 0; i < (num_cols + 1) >> 1; i++) { -+ tmpbuf[i] = inptr1[i]; -+ tmpbuf[i + 16] = inptr2[i]; -+ tmpbuf[i * 2 + 32] = inptr0[i * 2]; -+ tmpbuf[i * 2 + 32 + 1] = inptr0[i * 2 + 1]; -+ } -+ cb = VEC_LD(tmpbuf); -+ cr = VEC_LD(tmpbuf + 16); -+ CALC_MERGED1 -+ -+ y = VEC_LD(tmpbuf + 32); -+ CALC_MERGED2(r_yl, g_yl, b_yl) -+ RGB_SHUFFLE -+ if (num_cols >= 16) { -+ VEC_ST(outptr, rgb0); -+ VEC_ST(outptr + 16, rgb1); -+ VEC_ST(outptr + 32, rgb2); -+#if PIXELSIZE == 4 -+ VEC_ST(outptr + 48, rgb3); -+#endif -+ outptr += PIXELSIZE * 16; -+ -+ y = VEC_LD(tmpbuf + 48); -+ CALC_MERGED2(r_yh, g_yh, b_yh) -+ RGB_SHUFFLE -+ } -+ VEC_ST(tmpbuf, rgb0); -+ VEC_ST(tmpbuf + 16, rgb1); -+ VEC_ST(tmpbuf + 32, rgb2); -+#if PIXELSIZE == 4 -+ VEC_ST(tmpbuf + 48, rgb3); -+#endif -+ memcpy(outptr, tmpbuf, (out_width & 15) * PIXELSIZE); -+ } -+} -+ -+#undef RGB_SHUFFLE_INIT -+#undef RGB_SHUFFLE -+#undef PIXELSIZE -+#undef ycc_rgbn_convert -+#undef ycc_rgbn_merged -+ -diff --git a/simd/e2k/jdcolor-e2k.c b/simd/e2k/jdcolor-e2k.c -new file mode 100644 -index 0000000..94c80e9 ---- /dev/null -+++ b/simd/e2k/jdcolor-e2k.c -@@ -0,0 +1,289 @@ -+/* -+ * Elbrus optimizations for libjpeg-turbo -+ * -+ * Copyright (C) 2015, D. R. Commander. All Rights Reserved. -+ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd -+ * -+ * This software is provided 'as-is', without any express or implied -+ * warranty. In no event will the authors be held liable for any damages -+ * arising from the use of this software. 
-+ * -+ * Permission is granted to anyone to use this software for any purpose, -+ * including commercial applications, and to alter it and redistribute it -+ * freely, subject to the following restrictions: -+ * -+ * 1. The origin of this software must not be misrepresented; you must not -+ * claim that you wrote the original software. If you use this software -+ * in a product, an acknowledgment in the product documentation would be -+ * appreciated but is not required. -+ * 2. Altered source versions must be plainly marked as such, and must not be -+ * misrepresented as being the original software. -+ * 3. This notice may not be removed or altered from any source distribution. -+ */ -+ -+/* YCC --> RGB CONVERSION */ -+ -+#include "jsimd_e2k.h" -+ -+#define F_0_344 22554 /* FIX(0.34414) */ -+#define F_0_714 46802 /* FIX(0.71414) */ -+#define F_1_402 91881 /* FIX(1.40200) */ -+#define F_1_772 116130 /* FIX(1.77200) */ -+#define F_0_402 (F_1_402 - 65536) /* FIX(1.40200) - FIX(1) */ -+#define F_0_285 (65536 - F_0_714) /* FIX(1) - FIX(0.71414) */ -+#define F_0_228 (131072 - F_1_772) /* FIX(2) - FIX(1.77200) */ -+ -+#define SCALEBITS 16 -+#define ONE_HALF (1 << (SCALEBITS - 1)) -+ -+static const uint8_t __attribute__((aligned(16))) -+#if defined(__iset__) && __iset__ >= 5 -+ycc_rgb_shuf_const[7][48] = { -+#define SHUF_CONST3 \ -+ C0, C1, C2, \ -+ C0 + 4, C1 + 4, C2 + 4, \ -+ C0 + 8, C1 + 8, C2 + 8, \ -+ C0 + 12, C1 + 12, C2 + 12, \ -+ C0 + 16, C1 + 16, C2 + 16, \ -+ C0 + 20, C1 + 4, C2 + 4, \ -+ C0 + 8, C1 + 8, C2 + 8, \ -+ C0 + 12, C1 + 12, C2 + 12, \ -+ C0 + 16, C1 + 16, C2 + 16, \ -+ C0 + 20, C1 + 20, C2 + 20, \ -+ C0 + 24, C1 + 24, C2 + 8, \ -+ C0 + 12, C1 + 12, C2 + 12, \ -+ C0 + 16, C1 + 16, C2 + 16, \ -+ C0 + 20, C1 + 20, C2 + 20, \ -+ C0 + 24, C1 + 24, C2 + 24, \ -+ C0 + 28, C1 + 28, C2 + 28 -+#else -+ycc_rgb_shuf_const[7][24] = { -+#define SHUF_CONST3 \ -+ C0, C1, C2, \ -+ C0 + 4, C1 + 4, C2 + 4, \ -+ C0 + 8, C1 + 8, C2, \ -+ C0 + 4, C1 + 4, C2 + 4, \ -+ C0 + 8, C1 + 8, C2 + 8, \ -+ C0 + 12, C1 + 4, C2 + 4, \ -+ C0 + 8, C1 + 8, C2 + 8, \ -+ C0 + 12, C1 + 12, C2 + 12 -+#endif -+ -+#define SHUF_CONST4 C0, C1, C2, C3, C0 + 4, C1 + 4, C2 + 4, C3 + 4, \ -+ C0 + 8, C1 + 8, C2 + 8, C3 + 8, C0 + 12, C1 + 12, C2 + 12, C3 + 12 -+ -+#define TMP_RED RGB_RED -+#define TMP_GREEN RGB_GREEN -+#define TMP_BLUE RGB_BLUE -+#define PIXELSIZE RGB_PIXELSIZE -+#include "jdcoltab-e2k.c" -+ , -+#define TMP_RED EXT_RGB_RED -+#define TMP_GREEN EXT_RGB_GREEN -+#define TMP_BLUE EXT_RGB_BLUE -+#define PIXELSIZE EXT_RGB_PIXELSIZE -+#include "jdcoltab-e2k.c" -+ , -+#define TMP_RED EXT_RGBX_RED -+#define TMP_GREEN EXT_RGBX_GREEN -+#define TMP_BLUE EXT_RGBX_BLUE -+#define PIXELSIZE EXT_RGBX_PIXELSIZE -+#include "jdcoltab-e2k.c" -+ , -+#define TMP_RED EXT_BGR_RED -+#define TMP_GREEN EXT_BGR_GREEN -+#define TMP_BLUE EXT_BGR_BLUE -+#define PIXELSIZE EXT_BGR_PIXELSIZE -+#include "jdcoltab-e2k.c" -+ , -+#define TMP_RED EXT_BGRX_RED -+#define TMP_GREEN EXT_BGRX_GREEN -+#define TMP_BLUE EXT_BGRX_BLUE -+#define PIXELSIZE EXT_BGRX_PIXELSIZE -+#include "jdcoltab-e2k.c" -+ , -+#define TMP_RED EXT_XBGR_RED -+#define TMP_GREEN EXT_XBGR_GREEN -+#define TMP_BLUE EXT_XBGR_BLUE -+#define PIXELSIZE EXT_XBGR_PIXELSIZE -+#include "jdcoltab-e2k.c" -+ , -+#define TMP_RED EXT_XRGB_RED -+#define TMP_GREEN EXT_XRGB_GREEN -+#define TMP_BLUE EXT_XRGB_BLUE -+#define PIXELSIZE EXT_XRGB_PIXELSIZE -+#include "jdcoltab-e2k.c" -+}; -+ -+ /* (Original) -+ * R = Y + 1.40200 * Cr -+ * G = Y - 0.34414 * Cb - 0.71414 * Cr -+ * B = Y + 1.77200 * Cb -+ * 
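-+ * The rewrite below keeps every multiplier small enough for its scaled
-+ * fixed-point constant to fit in a signed 16-bit word; the integer parts
-+ * split off from 1.40200, 0.71414 and 1.77200 are restored with plain
-+ * additions (and subtractions) of Cr and Cb.
-+ *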
-+ * (This implementation) -+ * R = Y + 0.40200 * Cr + Cr -+ * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr -+ * B = Y - 0.22800 * Cb + Cb + Cb -+ */ -+ -+#define CALC_RGB \ -+ yl = _mm_unpacklo_epi8(y, pb_zero); \ -+ yh = _mm_unpackhi_epi8(y, pb_zero); \ -+ \ -+ cbl = _mm_unpacklo_epi8(cb, pb_zero); \ -+ cbh = _mm_unpackhi_epi8(cb, pb_zero); \ -+ cbl = _mm_sub_epi16(cbl, pw_cj); \ -+ cbh = _mm_sub_epi16(cbh, pw_cj); \ -+ \ -+ crl = _mm_unpacklo_epi8(cr, pb_zero); \ -+ crh = _mm_unpackhi_epi8(cr, pb_zero); \ -+ crl = _mm_sub_epi16(crl, pw_cj); \ -+ crh = _mm_sub_epi16(crh, pw_cj); \ -+ \ -+ bl = _mm_mulhrs_epi16(cbl, pw_mf0228); \ -+ bh = _mm_mulhrs_epi16(cbh, pw_mf0228); \ -+ bl = _mm_add_epi16(bl, _mm_add_epi16(cbl, cbl)); \ -+ bh = _mm_add_epi16(bh, _mm_add_epi16(cbh, cbh)); \ -+ bl = _mm_add_epi16(bl, yl); \ -+ bh = _mm_add_epi16(bh, yh); \ -+ \ -+ rl = _mm_mulhrs_epi16(crl, pw_f0402); \ -+ rh = _mm_mulhrs_epi16(crh, pw_f0402); \ -+ rl = _mm_add_epi16(rl, crl); \ -+ rh = _mm_add_epi16(rh, crh); \ -+ rl = _mm_add_epi16(rl, yl); \ -+ rh = _mm_add_epi16(rh, yh); \ -+ \ -+ g0w = _mm_unpacklo_epi16(cbl, crl); \ -+ g1w = _mm_unpackhi_epi16(cbl, crl); \ -+ g0 = _mm_add_epi32(_mm_madd_epi16(g0w, pw_mf0344_f0285), pd_onehalf); \ -+ g1 = _mm_add_epi32(_mm_madd_epi16(g1w, pw_mf0344_f0285), pd_onehalf); \ -+ g2w = _mm_unpacklo_epi16(cbh, crh); \ -+ g3w = _mm_unpackhi_epi16(cbh, crh); \ -+ g2 = _mm_add_epi32(_mm_madd_epi16(g2w, pw_mf0344_f0285), pd_onehalf); \ -+ g3 = _mm_add_epi32(_mm_madd_epi16(g3w, pw_mf0344_f0285), pd_onehalf); \ -+ \ -+ gl = _mm_packhi_epi32(g0, g1); \ -+ gh = _mm_packhi_epi32(g2, g3); \ -+ gl = _mm_sub_epi16(gl, crl); \ -+ gh = _mm_sub_epi16(gh, crh); \ -+ gl = _mm_add_epi16(gl, yl); \ -+ gh = _mm_add_epi16(gh, yh); \ -+ \ -+ rl = _mm_packus_epi16(rl, rh); \ -+ gl = _mm_packus_epi16(gl, gh); \ -+ bl = _mm_packus_epi16(bl, bh); \ -+ \ -+ rg0 = _mm_unpacklo_epi8(rl, gl); \ -+ rg1 = _mm_unpackhi_epi8(rl, gl); \ -+ bx0 = _mm_unpacklo_epi8(bl, pb_255); \ -+ bx1 = _mm_unpackhi_epi8(bl, pb_255); \ -+ \ -+ rgb0 = _mm_unpacklo_epi16(rg0, bx0); \ -+ rgb1 = _mm_unpackhi_epi16(rg0, bx0); \ -+ rgb2 = _mm_unpacklo_epi16(rg1, bx1); \ -+ rgb3 = _mm_unpackhi_epi16(rg1, bx1); -+ -+#define CALC_MERGED1 \ -+ cbl = _mm_unpacklo_epi8(cb, pb_zero); \ -+ cbh = _mm_unpackhi_epi8(cb, pb_zero); \ -+ cbl = _mm_sub_epi16(cbl, pw_cj); \ -+ cbh = _mm_sub_epi16(cbh, pw_cj); \ -+ \ -+ crl = _mm_unpacklo_epi8(cr, pb_zero); \ -+ crh = _mm_unpackhi_epi8(cr, pb_zero); \ -+ crl = _mm_sub_epi16(crl, pw_cj); \ -+ crh = _mm_sub_epi16(crh, pw_cj); \ -+ \ -+ b_yl = _mm_mulhrs_epi16(cbl, pw_mf0228); \ -+ b_yh = _mm_mulhrs_epi16(cbh, pw_mf0228); \ -+ b_yl = _mm_add_epi16(b_yl, _mm_add_epi16(cbl, cbl)); \ -+ b_yh = _mm_add_epi16(b_yh, _mm_add_epi16(cbh, cbh)); \ -+ \ -+ r_yl = _mm_mulhrs_epi16(crl, pw_f0402); \ -+ r_yh = _mm_mulhrs_epi16(crh, pw_f0402); \ -+ r_yl = _mm_add_epi16(r_yl, crl); \ -+ r_yh = _mm_add_epi16(r_yh, crh); \ -+ \ -+ g_y0w = _mm_unpacklo_epi16(cbl, crl); \ -+ g_y1w = _mm_unpackhi_epi16(cbl, crl); \ -+ g_y0 = _mm_add_epi32(_mm_madd_epi16(g_y0w, pw_mf0344_f0285), pd_onehalf); \ -+ g_y1 = _mm_add_epi32(_mm_madd_epi16(g_y1w, pw_mf0344_f0285), pd_onehalf); \ -+ g_y2w = _mm_unpacklo_epi16(cbh, crh); \ -+ g_y3w = _mm_unpackhi_epi16(cbh, crh); \ -+ g_y2 = _mm_add_epi32(_mm_madd_epi16(g_y2w, pw_mf0344_f0285), pd_onehalf); \ -+ g_y3 = _mm_add_epi32(_mm_madd_epi16(g_y3w, pw_mf0344_f0285), pd_onehalf); \ -+ \ -+ g_yl = _mm_packhi_epi32(g_y0, g_y1); \ -+ g_yh = _mm_packhi_epi32(g_y2, g_y3); \ -+ g_yl = 
_mm_sub_epi16(g_yl, crl); \ -+ g_yh = _mm_sub_epi16(g_yh, crh); -+ -+#define CALC_MERGED2(r_yl, g_yl, b_yl) \ -+ yl = _mm_unpacklo_epi8(y, pb_zero); \ -+ yh = _mm_unpackhi_epi8(y, pb_zero); \ -+ bl = _mm_add_epi16(_mm_unpacklo_epi16(b_yl, b_yl), yl); \ -+ bh = _mm_add_epi16(_mm_unpackhi_epi16(b_yl, b_yl), yh); \ -+ rl = _mm_add_epi16(_mm_unpacklo_epi16(r_yl, r_yl), yl); \ -+ rh = _mm_add_epi16(_mm_unpackhi_epi16(r_yl, r_yl), yh); \ -+ gl = _mm_add_epi16(_mm_unpacklo_epi16(g_yl, g_yl), yl); \ -+ gh = _mm_add_epi16(_mm_unpackhi_epi16(g_yl, g_yl), yh); \ -+ rl = _mm_packus_epi16(rl, rh); \ -+ gl = _mm_packus_epi16(gl, gh); \ -+ bl = _mm_packus_epi16(bl, bh); \ -+ \ -+ rg0 = _mm_unpacklo_epi8(rl, gl); \ -+ rg1 = _mm_unpackhi_epi8(rl, gl); \ -+ bx0 = _mm_unpacklo_epi8(bl, pb_255); \ -+ bx1 = _mm_unpackhi_epi8(bl, pb_255); \ -+ \ -+ rgb0 = _mm_unpacklo_epi16(rg0, bx0); \ -+ rgb1 = _mm_unpackhi_epi16(rg0, bx0); \ -+ rgb2 = _mm_unpacklo_epi16(rg1, bx1); \ -+ rgb3 = _mm_unpackhi_epi16(rg1, bx1); -+ -+#define PIXELSIZE 3 -+#if defined(__iset__) && __iset__ >= 5 -+#define RGB_SHUFFLE_INIT __m128i \ -+ rgb_index0 = VEC_LD(ycc_rgb_shuf_const[shuf_idx]), \ -+ rgb_index1 = VEC_LD(ycc_rgb_shuf_const[shuf_idx] + 16), \ -+ rgb_index2 = VEC_LD(ycc_rgb_shuf_const[shuf_idx] + 32); -+#define RGB_SHUFFLE \ -+ rgb0 = _mm_shuffle2_epi8(rgb0, rgb1, rgb_index0); \ -+ rgb1 = _mm_shuffle2_epi8(rgb1, rgb2, rgb_index1); \ -+ rgb2 = _mm_shuffle2_epi8(rgb2, rgb3, rgb_index2); -+#else -+#define RGB_SHUFFLE_INIT __m64 \ -+ rgb_index0 = *(__m64*)ycc_rgb_shuf_const[shuf_idx], \ -+ rgb_index1 = *(__m64*)(ycc_rgb_shuf_const[shuf_idx] + 8), \ -+ rgb_index2 = *(__m64*)(ycc_rgb_shuf_const[shuf_idx] + 16); -+#define RGB_SHUFFLE { \ -+ union { __m128i v; __m64 d[2]; } a = { rgb0 }, \ -+ b = { rgb1 }, c = { rgb2 }, d = { rgb3 }; \ -+ a.d[0] = _mm_shuffle2_pi8(a.d[0], a.d[1], rgb_index0); \ -+ a.d[1] = _mm_shuffle2_pi8(a.d[1], b.d[0], rgb_index1); \ -+ b.d[0] = _mm_shuffle2_pi8(b.d[0], b.d[1], rgb_index2); \ -+ b.d[1] = _mm_shuffle2_pi8(c.d[0], c.d[1], rgb_index0); \ -+ c.d[0] = _mm_shuffle2_pi8(c.d[1], d.d[0], rgb_index1); \ -+ c.d[1] = _mm_shuffle2_pi8(d.d[0], d.d[1], rgb_index2); \ -+ rgb0 = a.v; rgb1 = b.v; rgb2 = c.v; \ -+} -+#endif -+ -+#define ycc_rgbn_convert jsimd_ycc_rgb3_convert_e2k -+#define ycc_rgbn_merged jsimd_ycc_rgb3_merged_upsample_e2k -+#include "jdcolext-e2k.c" -+ -+#define PIXELSIZE 4 -+#define RGB_SHUFFLE_INIT __m128i \ -+ rgb_index0 = VEC_LD(ycc_rgb_shuf_const[shuf_idx]); -+#define RGB_SHUFFLE \ -+ rgb0 = _mm_shuffle_epi8(rgb0, rgb_index0); \ -+ rgb1 = _mm_shuffle_epi8(rgb1, rgb_index0); \ -+ rgb2 = _mm_shuffle_epi8(rgb2, rgb_index0); \ -+ rgb3 = _mm_shuffle_epi8(rgb3, rgb_index0); -+ -+#define ycc_rgbn_convert jsimd_ycc_rgb4_convert_e2k -+#define ycc_rgbn_merged jsimd_ycc_rgb4_merged_upsample_e2k -+#include "jdcolext-e2k.c" -+ -diff --git a/simd/e2k/jdcoltab-e2k.c b/simd/e2k/jdcoltab-e2k.c -new file mode 100644 -index 0000000..e19666d ---- /dev/null -+++ b/simd/e2k/jdcoltab-e2k.c -@@ -0,0 +1,80 @@ -+/* -+ * Elbrus optimizations for libjpeg-turbo -+ * -+ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd -+ * -+ * This software is provided 'as-is', without any express or implied -+ * warranty. In no event will the authors be held liable for any damages -+ * arising from the use of this software. 
-+ * -+ * Permission is granted to anyone to use this software for any purpose, -+ * including commercial applications, and to alter it and redistribute it -+ * freely, subject to the following restrictions: -+ * -+ * 1. The origin of this software must not be misrepresented; you must not -+ * claim that you wrote the original software. If you use this software -+ * in a product, an acknowledgment in the product documentation would be -+ * appreciated but is not required. -+ * 2. Altered source versions must be plainly marked as such, and must not be -+ * misrepresented as being the original software. -+ * 3. This notice may not be removed or altered from any source distribution. -+ */ -+ -+/* This file is included by jdcolor-e2k.c */ -+ -+#if TMP_RED == 0 -+#define C0 0 -+#elif TMP_GREEN == 0 -+#define C0 1 -+#elif TMP_BLUE == 0 -+#define C0 2 -+#else -+#define C0 3 -+#endif -+ -+#if TMP_RED == 1 -+#define C1 0 -+#elif TMP_GREEN == 1 -+#define C1 1 -+#elif TMP_BLUE == 1 -+#define C1 2 -+#else -+#define C1 3 -+#endif -+ -+#if TMP_RED == 2 -+#define C2 0 -+#elif TMP_GREEN == 2 -+#define C2 1 -+#elif TMP_BLUE == 2 -+#define C2 2 -+#else -+#define C2 3 -+#endif -+ -+#if TMP_RED == 3 -+#define C3 0 -+#elif TMP_GREEN == 3 -+#define C3 1 -+#elif TMP_BLUE == 3 -+#define C3 2 -+#else -+#define C3 3 -+#endif -+ -+#if PIXELSIZE == 3 -+{ SHUF_CONST3 } -+#else -+{ SHUF_CONST4 } -+#endif -+ -+#undef C0 -+#undef C1 -+#undef C2 -+#undef C3 -+ -+#undef TMP_RED -+#undef TMP_GREEN -+#undef TMP_BLUE -+#undef PIXELSIZE -+ -diff --git a/simd/e2k/jdsample-e2k.c b/simd/e2k/jdsample-e2k.c -new file mode 100644 -index 0000000..572b3af ---- /dev/null -+++ b/simd/e2k/jdsample-e2k.c -@@ -0,0 +1,389 @@ -+/* -+ * Elbrus optimizations for libjpeg-turbo -+ * -+ * Copyright (C) 2015, D. R. Commander. All Rights Reserved. -+ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd -+ * -+ * This software is provided 'as-is', without any express or implied -+ * warranty. In no event will the authors be held liable for any damages -+ * arising from the use of this software. -+ * -+ * Permission is granted to anyone to use this software for any purpose, -+ * including commercial applications, and to alter it and redistribute it -+ * freely, subject to the following restrictions: -+ * -+ * 1. The origin of this software must not be misrepresented; you must not -+ * claim that you wrote the original software. If you use this software -+ * in a product, an acknowledgment in the product documentation would be -+ * appreciated but is not required. -+ * 2. Altered source versions must be plainly marked as such, and must not be -+ * misrepresented as being the original software. -+ * 3. This notice may not be removed or altered from any source distribution. 
-+ */ -+ -+/* CHROMA UPSAMPLING */ -+ -+#include "jsimd_e2k.h" -+ -+ -+void jsimd_h2v1_fancy_upsample_e2k(int max_v_samp_factor, -+ JDIMENSION downsampled_width, -+ JSAMPARRAY input_data, -+ JSAMPARRAY *output_data_ptr) -+{ -+ JSAMPARRAY output_data = *output_data_ptr; -+ JSAMPROW inptr, outptr; -+ int inrow, incol; -+ -+ __m128i pb_zero = _mm_setzero_si128(); -+ __m128i this0, last0, p_last0, next0 = pb_zero, p_next0, out; -+ __m128i this0l, this0h, last0l, last0h, -+ next0l, next0h, outle, outhe, outlo, outho; -+#ifdef NEED_ALIGN8 -+ ALIGN8_COMMON -+ ALIGN8_VARS(src) -+#endif -+ -+ /* Constants */ -+ __m128i pw_three = _mm_set1_epi16(3), -+ next_index_lastcol = _mm_setr_epi8( -+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15), -+ pw_one = _mm_set1_epi16(1), pw_two = _mm_set1_epi16(2); -+ -+ if (downsampled_width > 0) -+ for (inrow = 0; inrow < max_v_samp_factor; inrow++) { -+ inptr = input_data[inrow]; -+ outptr = output_data[inrow]; -+ -+ if (downsampled_width & 15) -+ inptr[downsampled_width] = inptr[downsampled_width - 1]; -+ -+#ifdef NEED_ALIGN8 -+ ALIGN8_START(inptr, src) -+ ALIGN8_READ16(this0, src, 0) -+#else -+ this0 = VEC_LD(inptr); -+#endif -+ last0 = _mm_bslli_si128(this0, 15); -+ -+ PRAGMA_E2K("ivdep") -+ for (incol = downsampled_width; incol > 0; -+ incol -= 16, outptr += 32) { -+ -+ p_last0 = _mm_alignr_epi8(this0, last0, 15); -+ last0 = this0; -+ -+ if (__builtin_expect(incol <= 16, 0)) -+ p_next0 = _mm_shuffle_epi8(this0, next_index_lastcol); -+ else { -+#ifdef NEED_ALIGN8 -+ ALIGN8_READ16(next0, src, 1) src_ptr += 2; -+#else -+ next0 = VEC_LD(inptr + 16); inptr += 16; -+#endif -+ p_next0 = _mm_alignr_epi8(next0, this0, 1); -+ } -+ -+ this0l = _mm_mullo_epi16(_mm_unpacklo_epi8(this0, pb_zero), pw_three); -+ last0l = _mm_unpacklo_epi8(p_last0, pb_zero); -+ next0l = _mm_unpacklo_epi8(p_next0, pb_zero); -+ last0l = _mm_add_epi16(last0l, pw_one); -+ next0l = _mm_add_epi16(next0l, pw_two); -+ -+ outle = _mm_add_epi16(this0l, last0l); -+ outlo = _mm_add_epi16(this0l, next0l); -+ outle = _mm_srli_epi16(outle, 2); -+ outlo = _mm_srli_epi16(outlo, 2); -+ -+ out = _mm_or_si128(outle, _mm_slli_epi16(outlo, 8)); -+ VEC_ST(outptr, out); -+ -+ if (__builtin_expect(incol <= 8, 0)) break; -+ -+ this0h = _mm_mullo_epi16(_mm_unpackhi_epi8(this0, pb_zero), pw_three); -+ last0h = _mm_unpackhi_epi8(p_last0, pb_zero); -+ next0h = _mm_unpackhi_epi8(p_next0, pb_zero); -+ last0h = _mm_add_epi16(last0h, pw_one); -+ next0h = _mm_add_epi16(next0h, pw_two); -+ -+ outhe = _mm_add_epi16(this0h, last0h); -+ outho = _mm_add_epi16(this0h, next0h); -+ outhe = _mm_srli_epi16(outhe, 2); -+ outho = _mm_srli_epi16(outho, 2); -+ -+ out = _mm_or_si128(outhe, _mm_slli_epi16(outho, 8)); -+ VEC_ST(outptr + 16, out); -+ -+ this0 = next0; -+ } -+ } -+} -+ -+ -+void jsimd_h2v2_fancy_upsample_e2k(int max_v_samp_factor, -+ JDIMENSION downsampled_width, -+ JSAMPARRAY input_data, -+ JSAMPARRAY *output_data_ptr) -+{ -+ JSAMPARRAY output_data = *output_data_ptr; -+ JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1; -+ int inrow, outrow, incol; -+#ifdef NEED_ALIGN8 -+ ALIGN8_COMMON -+ ALIGN8_VARS(src_1) -+ ALIGN8_VARS(src0) -+ ALIGN8_VARS(src1) -+#endif -+ -+ __m128i pb_zero = _mm_setzero_si128(); -+ __m128i this_1, this0, this1, out; -+ __m128i this_1l, this_1h, this0l, this0h, this1l, this1h, -+ lastcolsum_1h, lastcolsum1h, -+ p_lastcolsum_1l, p_lastcolsum_1h, p_lastcolsum1l, p_lastcolsum1h, -+ thiscolsum_1l, thiscolsum_1h, thiscolsum1l, thiscolsum1h, -+ nextcolsum_1l = pb_zero, nextcolsum_1h = pb_zero, -+ 
nextcolsum1l = pb_zero, nextcolsum1h = pb_zero,
-+    p_nextcolsum_1l, p_nextcolsum_1h, p_nextcolsum1l, p_nextcolsum1h,
-+    tmpl, tmph, outle, outhe, outlo, outho;
-+
-+  /* Constants */
-+  __m128i pw_three = _mm_set1_epi16(3),
-+    pw_seven = _mm_set1_epi16(7), pw_eight = _mm_set1_epi16(8),
-+    next_index_lastcol = _mm_setr_epi8(
-+      2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14, 15);
-+
-+  if (downsampled_width > 0)
-+  for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
-+
-+    inptr_1 = input_data[inrow - 1];
-+    inptr0 = input_data[inrow];
-+    inptr1 = input_data[inrow + 1];
-+    outptr0 = output_data[outrow++];
-+    outptr1 = output_data[outrow++];
-+
-+    if (downsampled_width & 15) {
-+      inptr_1[downsampled_width] = inptr_1[downsampled_width - 1];
-+      inptr0[downsampled_width] = inptr0[downsampled_width - 1];
-+      inptr1[downsampled_width] = inptr1[downsampled_width - 1];
-+    }
-+
-+#ifdef NEED_ALIGN8
-+    ALIGN8_START(inptr0, src0)
-+    ALIGN8_START(inptr_1, src_1)
-+    ALIGN8_START(inptr1, src1)
-+    ALIGN8_READ16(this0, src0, 0)
-+    ALIGN8_READ16(this_1, src_1, 0)
-+    ALIGN8_READ16(this1, src1, 0)
-+#else
-+    this0 = VEC_LD(inptr0);
-+    this_1 = VEC_LD(inptr_1);
-+    this1 = VEC_LD(inptr1);
-+#endif
-+
-+    this0l = _mm_unpacklo_epi8(this0, pb_zero);
-+    this0h = _mm_unpackhi_epi8(this0, pb_zero);
-+    this0l = _mm_mullo_epi16(this0l, pw_three);
-+    this0h = _mm_mullo_epi16(this0h, pw_three);
-+
-+    this_1l = _mm_unpacklo_epi8(this_1, pb_zero);
-+    this_1h = _mm_unpackhi_epi8(this_1, pb_zero);
-+    thiscolsum_1l = _mm_add_epi16(this0l, this_1l);
-+    thiscolsum_1h = _mm_add_epi16(this0h, this_1h);
-+    lastcolsum_1h = _mm_bslli_si128(thiscolsum_1l, 14);
-+
-+    this1l = _mm_unpacklo_epi8(this1, pb_zero);
-+    this1h = _mm_unpackhi_epi8(this1, pb_zero);
-+    thiscolsum1l = _mm_add_epi16(this0l, this1l);
-+    thiscolsum1h = _mm_add_epi16(this0h, this1h);
-+    lastcolsum1h = _mm_bslli_si128(thiscolsum1l, 14);
-+
-+    PRAGMA_E2K("ivdep")
-+    for (incol = downsampled_width; incol > 0;
-+        incol -= 16, outptr0 += 32, outptr1 += 32) {
-+
-+      p_lastcolsum_1l = _mm_alignr_epi8(thiscolsum_1l, lastcolsum_1h, 14);
-+      p_lastcolsum_1h = _mm_alignr_epi8(thiscolsum_1h, thiscolsum_1l, 14);
-+      p_lastcolsum1l = _mm_alignr_epi8(thiscolsum1l, lastcolsum1h, 14);
-+      p_lastcolsum1h = _mm_alignr_epi8(thiscolsum1h, thiscolsum1l, 14);
-+      lastcolsum_1h = thiscolsum_1h;
-+      lastcolsum1h = thiscolsum1h;
-+
-+      if (__builtin_expect(incol <= 16, 0)) {
-+        p_nextcolsum_1l = _mm_alignr_epi8(thiscolsum_1h, thiscolsum_1l, 2);
-+        p_nextcolsum_1h = _mm_shuffle_epi8(thiscolsum_1h, next_index_lastcol);
-+        p_nextcolsum1l = _mm_alignr_epi8(thiscolsum1h, thiscolsum1l, 2);
-+        p_nextcolsum1h = _mm_shuffle_epi8(thiscolsum1h, next_index_lastcol);
-+      } else {
-+#ifdef NEED_ALIGN8
-+        ALIGN8_READ16(this0, src0, 1) src0_ptr += 2;
-+        ALIGN8_READ16(this_1, src_1, 1) src_1_ptr += 2;
-+        ALIGN8_READ16(this1, src1, 1) src1_ptr += 2;
-+#else
-+        this0 = VEC_LD(inptr0 + 16); inptr0 += 16;
-+        this_1 = VEC_LD(inptr_1 + 16); inptr_1 += 16;
-+        this1 = VEC_LD(inptr1 + 16); inptr1 += 16;
-+#endif
-+        this0l = _mm_unpacklo_epi8(this0, pb_zero);
-+        this0h = _mm_unpackhi_epi8(this0, pb_zero);
-+        this0l = _mm_mullo_epi16(this0l, pw_three);
-+        this0h = _mm_mullo_epi16(this0h, pw_three);
-+
-+        this_1l = _mm_unpacklo_epi8(this_1, pb_zero);
-+        this_1h = _mm_unpackhi_epi8(this_1, pb_zero);
-+        nextcolsum_1l = _mm_add_epi16(this0l, this_1l);
-+        nextcolsum_1h = _mm_add_epi16(this0h, this_1h);
-+        p_nextcolsum_1l = _mm_alignr_epi8(thiscolsum_1h, thiscolsum_1l, 2);
-+        p_nextcolsum_1h =
_mm_alignr_epi8(nextcolsum_1l, thiscolsum_1h, 2); -+ -+ this1l = _mm_unpacklo_epi8(this1, pb_zero); -+ this1h = _mm_unpackhi_epi8(this1, pb_zero); -+ nextcolsum1l = _mm_add_epi16(this0l, this1l); -+ nextcolsum1h = _mm_add_epi16(this0h, this1h); -+ p_nextcolsum1l = _mm_alignr_epi8(thiscolsum1h, thiscolsum1l, 2); -+ p_nextcolsum1h = _mm_alignr_epi8(nextcolsum1l, thiscolsum1h, 2); -+ } -+ -+ /* Process the upper row */ -+ -+ tmpl = _mm_mullo_epi16(thiscolsum_1l, pw_three); -+ outle = _mm_add_epi16(tmpl, p_lastcolsum_1l); -+ outle = _mm_add_epi16(outle, pw_eight); -+ outle = _mm_srli_epi16(outle, 4); -+ -+ outlo = _mm_add_epi16(tmpl, p_nextcolsum_1l); -+ outlo = _mm_add_epi16(outlo, pw_seven); -+ outlo = _mm_srli_epi16(outlo, 4); -+ -+ out = _mm_or_si128(outle, _mm_slli_epi16(outlo, 8)); -+ VEC_ST(outptr0, out); -+ -+ /* Process the lower row */ -+ -+ tmpl = _mm_mullo_epi16(thiscolsum1l, pw_three); -+ outle = _mm_add_epi16(tmpl, p_lastcolsum1l); -+ outle = _mm_add_epi16(outle, pw_eight); -+ outle = _mm_srli_epi16(outle, 4); -+ -+ outlo = _mm_add_epi16(tmpl, p_nextcolsum1l); -+ outlo = _mm_add_epi16(outlo, pw_seven); -+ outlo = _mm_srli_epi16(outlo, 4); -+ -+ out = _mm_or_si128(outle, _mm_slli_epi16(outlo, 8)); -+ VEC_ST(outptr1, out); -+ -+ if (__builtin_expect(incol <= 8, 0)) break; -+ -+ tmph = _mm_mullo_epi16(thiscolsum_1h, pw_three); -+ outhe = _mm_add_epi16(tmph, p_lastcolsum_1h); -+ outhe = _mm_add_epi16(outhe, pw_eight); -+ outhe = _mm_srli_epi16(outhe, 4); -+ -+ outho = _mm_add_epi16(tmph, p_nextcolsum_1h); -+ outho = _mm_add_epi16(outho, pw_seven); -+ outho = _mm_srli_epi16(outho, 4); -+ -+ out = _mm_or_si128(outhe, _mm_slli_epi16(outho, 8)); -+ VEC_ST(outptr0 + 16, out); -+ -+ tmph = _mm_mullo_epi16(thiscolsum1h, pw_three); -+ outhe = _mm_add_epi16(tmph, p_lastcolsum1h); -+ outhe = _mm_add_epi16(outhe, pw_eight); -+ outhe = _mm_srli_epi16(outhe, 4); -+ -+ outho = _mm_add_epi16(tmph, p_nextcolsum1h); -+ outho = _mm_add_epi16(outho, pw_seven); -+ outho = _mm_srli_epi16(outho, 4); -+ -+ out = _mm_or_si128(outhe, _mm_slli_epi16(outho, 8)); -+ VEC_ST(outptr1 + 16, out); -+ -+ thiscolsum_1l = nextcolsum_1l; thiscolsum_1h = nextcolsum_1h; -+ thiscolsum1l = nextcolsum1l; thiscolsum1h = nextcolsum1h; -+ } -+ } -+} -+ -+ -+/* These are rarely used (mainly just for decompressing YCCK images) */ -+ -+void jsimd_h2v1_upsample_e2k(int max_v_samp_factor, -+ JDIMENSION out_width, -+ JSAMPARRAY input_data, -+ JSAMPARRAY *output_data_ptr) -+{ -+ JSAMPARRAY output_data = *output_data_ptr; -+ JSAMPROW inptr, outptr; -+ int inrow, incol; -+ -+ __m128i in, inl, inh; -+#ifdef NEED_ALIGN8 -+ ALIGN8_COMMON -+ ALIGN8_VARS(src) -+#endif -+ -+ if (out_width > 0) -+ for (inrow = 0; inrow < max_v_samp_factor; inrow++) { -+ inptr = input_data[inrow]; -+ outptr = output_data[inrow]; -+#ifdef NEED_ALIGN8 -+ ALIGN8_START(inptr, src) -+#endif -+ PRAGMA_E2K("ivdep") -+ for (incol = out_width; incol > 0; -+ incol -= 32, outptr += 32) { -+#ifdef NEED_ALIGN8 -+ ALIGN8_READ16(in, src, 0) src_ptr += 2; -+#else -+ in = VEC_LD(inptr); inptr += 16; -+#endif -+ inl = _mm_unpacklo_epi8(in, in); -+ inh = _mm_unpackhi_epi8(in, in); -+ -+ VEC_ST(outptr, inl); -+ VEC_ST(outptr + 16, inh); -+ } -+ } -+} -+ -+ -+void jsimd_h2v2_upsample_e2k(int max_v_samp_factor, -+ JDIMENSION out_width, -+ JSAMPARRAY input_data, -+ JSAMPARRAY *output_data_ptr) -+{ -+ JSAMPARRAY output_data = *output_data_ptr; -+ JSAMPROW inptr, outptr0, outptr1; -+ int inrow, outrow, incol; -+ -+ __m128i in, inl, inh; -+#ifdef NEED_ALIGN8 -+ ALIGN8_COMMON -+ 
ALIGN8_VARS(src) -+#endif -+ -+ if (out_width > 0) -+ for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) { -+ inptr = input_data[inrow]; -+ outptr0 = output_data[outrow++]; -+ outptr1 = output_data[outrow++]; -+#ifdef NEED_ALIGN8 -+ ALIGN8_START(inptr, src) -+#endif -+ PRAGMA_E2K("ivdep") -+ for (incol = out_width; incol > 0; -+ incol -= 32, outptr0 += 32, outptr1 += 32) { -+#ifdef NEED_ALIGN8 -+ ALIGN8_READ16(in, src, 0) src_ptr += 2; -+#else -+ in = VEC_LD(inptr); inptr += 16; -+#endif -+ inl = _mm_unpacklo_epi8(in, in); -+ inh = _mm_unpackhi_epi8(in, in); -+ -+ VEC_ST(outptr0, inl); -+ VEC_ST(outptr1, inl); -+ VEC_ST(outptr0 + 16, inh); -+ VEC_ST(outptr1 + 16, inh); -+ } -+ } -+} -diff --git a/simd/e2k/jfdctflt-e2k.c b/simd/e2k/jfdctflt-e2k.c -new file mode 100644 -index 0000000..e3c4d94 ---- /dev/null -+++ b/simd/e2k/jfdctflt-e2k.c -@@ -0,0 +1,127 @@ -+/* -+ * Elbrus optimizations for libjpeg-turbo -+ * -+ * Copyright (C) 2014, D. R. Commander. All Rights Reserved. -+ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd -+ * -+ * This software is provided 'as-is', without any express or implied -+ * warranty. In no event will the authors be held liable for any damages -+ * arising from the use of this software. -+ * -+ * Permission is granted to anyone to use this software for any purpose, -+ * including commercial applications, and to alter it and redistribute it -+ * freely, subject to the following restrictions: -+ * -+ * 1. The origin of this software must not be misrepresented; you must not -+ * claim that you wrote the original software. If you use this software -+ * in a product, an acknowledgment in the product documentation would be -+ * appreciated but is not required. -+ * 2. Altered source versions must be plainly marked as such, and must not be -+ * misrepresented as being the original software. -+ * 3. This notice may not be removed or altered from any source distribution. 
-+ */ -+ -+/* FLOAT FORWARD DCT */ -+ -+#include "jsimd_e2k.h" -+ -+#define DO_FDCT(in, out) { \ -+ tmp0 = _mm_add_ps(in##0, in##7); \ -+ tmp7 = _mm_sub_ps(in##0, in##7); \ -+ tmp1 = _mm_add_ps(in##1, in##6); \ -+ tmp6 = _mm_sub_ps(in##1, in##6); \ -+ tmp2 = _mm_add_ps(in##2, in##5); \ -+ tmp5 = _mm_sub_ps(in##2, in##5); \ -+ tmp3 = _mm_add_ps(in##3, in##4); \ -+ tmp4 = _mm_sub_ps(in##3, in##4); \ -+ \ -+ /* Even part */ \ -+ \ -+ tmp10 = _mm_add_ps(tmp0, tmp3); \ -+ tmp13 = _mm_sub_ps(tmp0, tmp3); \ -+ tmp11 = _mm_add_ps(tmp1, tmp2); \ -+ tmp12 = _mm_sub_ps(tmp1, tmp2); \ -+ \ -+ out##0 = _mm_add_ps(tmp10, tmp11); \ -+ out##4 = _mm_sub_ps(tmp10, tmp11); \ -+ \ -+ z1 = _mm_mul_ps(_mm_add_ps(tmp12, tmp13), pd_f0707); \ -+ out##2 = _mm_add_ps(tmp13, z1); \ -+ out##6 = _mm_sub_ps(tmp13, z1); \ -+ \ -+ /* Odd part */ \ -+ \ -+ tmp10 = _mm_add_ps(tmp4, tmp5); \ -+ tmp11 = _mm_add_ps(tmp5, tmp6); \ -+ tmp12 = _mm_add_ps(tmp6, tmp7); \ -+ \ -+ z5 = _mm_mul_ps(_mm_sub_ps(tmp10, tmp12), pd_f0382); \ -+ z2 = _mm_add_ps(_mm_mul_ps(tmp10, pd_f0541), z5); \ -+ z4 = _mm_add_ps(_mm_mul_ps(tmp12, pd_f1306), z5); \ -+ z3 = _mm_mul_ps(tmp11, pd_f0707); \ -+ \ -+ z11 = _mm_add_ps(tmp7, z3); \ -+ z13 = _mm_sub_ps(tmp7, z3); \ -+ \ -+ out##5 = _mm_add_ps(z13, z2); \ -+ out##3 = _mm_sub_ps(z13, z2); \ -+ out##1 = _mm_add_ps(z11, z4); \ -+ out##7 = _mm_sub_ps(z11, z4); \ -+} -+ -+#define LOAD_DATA(a, b, c, d, l, i) \ -+ l##a = _mm_loadu_ps(data + a * 8 + i); \ -+ l##b = _mm_loadu_ps(data + b * 8 + i); \ -+ l##c = _mm_loadu_ps(data + c * 8 + i); \ -+ l##d = _mm_loadu_ps(data + d * 8 + i); -+ -+#define STORE_DATA(a, b, c, d, l, i) \ -+ _mm_storeu_ps(data + a * 8 + i, l##a); \ -+ _mm_storeu_ps(data + b * 8 + i, l##b); \ -+ _mm_storeu_ps(data + c * 8 + i, l##c); \ -+ _mm_storeu_ps(data + d * 8 + i, l##d); -+ -+ -+void jsimd_fdct_float_e2k(FAST_FLOAT *data) -+{ -+ __m128 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, -+ tmp10, tmp11, tmp12, tmp13, z1, z2, z3, z4, z5, z11, z13; -+ __m128 l0, l1, l2, l3, l4, l5, l6, l7; -+ __m128 h0, h1, h2, h3, h4, h5, h6, h7; -+ __m128 x0, x1, x2, x3, x4, x5, x6, x7; -+ __m128 y0, y1, y2, y3, y4, y5, y6, y7; -+ -+ /* Constants */ -+ __m128 pd_f0382 = _mm_set1_ps(0.382683433f), -+ pd_f0541 = _mm_set1_ps(0.541196100f), -+ pd_f0707 = _mm_set1_ps(0.707106781f), -+ pd_f1306 = _mm_set1_ps(1.306562965f); -+ -+ /* Pass 1: process columns */ -+ -+ LOAD_DATA(0, 1, 2, 3, x, 0) -+ LOAD_DATA(0, 1, 2, 3, y, 4) -+ TRANSPOSE_FLOAT(x0, x1, x2, x3, l0, l1, l2, l3) -+ TRANSPOSE_FLOAT(y0, y1, y2, y3, l4, l5, l6, l7) -+ DO_FDCT(l, l); -+ -+ LOAD_DATA(4, 5, 6, 7, x, 0) -+ LOAD_DATA(4, 5, 6, 7, y, 4) -+ TRANSPOSE_FLOAT(x4, x5, x6, x7, h0, h1, h2, h3) -+ TRANSPOSE_FLOAT(y4, y5, y6, y7, h4, h5, h6, h7) -+ DO_FDCT(h, h); -+ -+ /* Pass 2: process rows */ -+ -+ TRANSPOSE_FLOAT(l0, l1, l2, l3, x0, x1, x2, x3) -+ TRANSPOSE_FLOAT(h0, h1, h2, h3, x4, x5, x6, x7) -+ DO_FDCT(x, x); -+ STORE_DATA(0, 1, 2, 3, x, 0) -+ STORE_DATA(4, 5, 6, 7, x, 0) -+ -+ TRANSPOSE_FLOAT(l4, l5, l6, l7, y0, y1, y2, y3) -+ TRANSPOSE_FLOAT(h4, h5, h6, h7, y4, y5, y6, y7) -+ DO_FDCT(y, y); -+ STORE_DATA(0, 1, 2, 3, y, 4) -+ STORE_DATA(4, 5, 6, 7, y, 4) -+} -diff --git a/simd/e2k/jfdctfst-e2k.c b/simd/e2k/jfdctfst-e2k.c -new file mode 100644 -index 0000000..9e58f05 ---- /dev/null -+++ b/simd/e2k/jfdctfst-e2k.c -@@ -0,0 +1,145 @@ -+/* -+ * Elbrus optimizations for libjpeg-turbo -+ * -+ * Copyright (C) 2014, D. R. Commander. All Rights Reserved. 
-+ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd -+ * -+ * This software is provided 'as-is', without any express or implied -+ * warranty. In no event will the authors be held liable for any damages -+ * arising from the use of this software. -+ * -+ * Permission is granted to anyone to use this software for any purpose, -+ * including commercial applications, and to alter it and redistribute it -+ * freely, subject to the following restrictions: -+ * -+ * 1. The origin of this software must not be misrepresented; you must not -+ * claim that you wrote the original software. If you use this software -+ * in a product, an acknowledgment in the product documentation would be -+ * appreciated but is not required. -+ * 2. Altered source versions must be plainly marked as such, and must not be -+ * misrepresented as being the original software. -+ * 3. This notice may not be removed or altered from any source distribution. -+ */ -+ -+/* FAST INTEGER FORWARD DCT */ -+ -+#include "jsimd_e2k.h" -+ -+ -+#define F_0_382 98 /* FIX(0.382683433) */ -+#define F_0_541 139 /* FIX(0.541196100) */ -+#define F_0_707 181 /* FIX(0.707106781) */ -+#define F_1_306 334 /* FIX(1.306562965) */ -+ -+#define CONST_BITS 8 -+#define PRE_MULTIPLY_SCALE_BITS 2 -+#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) -+ -+ -+#define DO_FDCT() { \ -+ /* Even part */ \ -+ \ -+ tmp10 = _mm_add_epi16(tmp0, tmp3); \ -+ tmp13 = _mm_sub_epi16(tmp0, tmp3); \ -+ tmp11 = _mm_add_epi16(tmp1, tmp2); \ -+ tmp12 = _mm_sub_epi16(tmp1, tmp2); \ -+ \ -+ out0 = _mm_add_epi16(tmp10, tmp11); \ -+ out4 = _mm_sub_epi16(tmp10, tmp11); \ -+ \ -+ z1 = _mm_add_epi16(tmp12, tmp13); \ -+ z1 = _mm_slli_epi16(z1, PRE_MULTIPLY_SCALE_BITS); \ -+ z1 = _mm_mulhi_epi16(z1, pw_0707); \ -+ \ -+ out2 = _mm_add_epi16(tmp13, z1); \ -+ out6 = _mm_sub_epi16(tmp13, z1); \ -+ \ -+ /* Odd part */ \ -+ \ -+ tmp10 = _mm_add_epi16(tmp4, tmp5); \ -+ tmp11 = _mm_add_epi16(tmp5, tmp6); \ -+ tmp12 = _mm_add_epi16(tmp6, tmp7); \ -+ \ -+ tmp10 = _mm_slli_epi16(tmp10, PRE_MULTIPLY_SCALE_BITS); \ -+ tmp12 = _mm_slli_epi16(tmp12, PRE_MULTIPLY_SCALE_BITS); \ -+ z5 = _mm_sub_epi16(tmp10, tmp12); \ -+ z5 = _mm_mulhi_epi16(z5, pw_0382); \ -+ \ -+ z2 = _mm_add_epi16(_mm_mulhi_epi16(tmp10, pw_0541), z5); \ -+ z4 = _mm_add_epi16(_mm_mulhi_epi16(tmp12, pw_1306), z5); \ -+ \ -+ tmp11 = _mm_slli_epi16(tmp11, PRE_MULTIPLY_SCALE_BITS); \ -+ z3 = _mm_mulhi_epi16(tmp11, pw_0707); \ -+ \ -+ z11 = _mm_add_epi16(tmp7, z3); \ -+ z13 = _mm_sub_epi16(tmp7, z3); \ -+ \ -+ out5 = _mm_add_epi16(z13, z2); \ -+ out3 = _mm_sub_epi16(z13, z2); \ -+ out1 = _mm_add_epi16(z11, z4); \ -+ out7 = _mm_sub_epi16(z11, z4); \ -+} -+ -+ -+void jsimd_fdct_ifast_e2k(DCTELEM *data) -+{ -+ __m128i row0, row1, row2, row3, row4, row5, row6, row7, -+ col0, col1, col2, col3, col4, col5, col6, col7, -+ tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13, -+ z1, z2, z3, z4, z5, z11, z13, -+ out0, out1, out2, out3, out4, out5, out6, out7; -+ -+ /* Constants */ -+ __m128i pw_0382 = _mm_set1_epi16(F_0_382 << CONST_SHIFT), -+ pw_0541 = _mm_set1_epi16(F_0_541 << CONST_SHIFT), -+ pw_0707 = _mm_set1_epi16(F_0_707 << CONST_SHIFT), -+ pw_1306 = _mm_set1_epi16(F_1_306 << CONST_SHIFT); -+ -+ /* Pass 1: process rows */ -+ -+ row0 = VEC_LD(data + 0 * 8); -+ row1 = VEC_LD(data + 1 * 8); -+ row2 = VEC_LD(data + 2 * 8); -+ row3 = VEC_LD(data + 3 * 8); -+ row4 = VEC_LD(data + 4 * 8); -+ row5 = VEC_LD(data + 5 * 8); -+ row6 = VEC_LD(data + 6 * 8); -+ row7 = VEC_LD(data + 7 * 8); -+ -+ TRANSPOSE(row, 
col); -+ -+ tmp0 = _mm_add_epi16(col0, col7); -+ tmp7 = _mm_sub_epi16(col0, col7); -+ tmp1 = _mm_add_epi16(col1, col6); -+ tmp6 = _mm_sub_epi16(col1, col6); -+ tmp2 = _mm_add_epi16(col2, col5); -+ tmp5 = _mm_sub_epi16(col2, col5); -+ tmp3 = _mm_add_epi16(col3, col4); -+ tmp4 = _mm_sub_epi16(col3, col4); -+ -+ DO_FDCT(); -+ -+ /* Pass 2: process columns */ -+ -+ TRANSPOSE(out, row); -+ -+ tmp0 = _mm_add_epi16(row0, row7); -+ tmp7 = _mm_sub_epi16(row0, row7); -+ tmp1 = _mm_add_epi16(row1, row6); -+ tmp6 = _mm_sub_epi16(row1, row6); -+ tmp2 = _mm_add_epi16(row2, row5); -+ tmp5 = _mm_sub_epi16(row2, row5); -+ tmp3 = _mm_add_epi16(row3, row4); -+ tmp4 = _mm_sub_epi16(row3, row4); -+ -+ DO_FDCT(); -+ -+ VEC_ST(data + 0 * 8, out0); -+ VEC_ST(data + 1 * 8, out1); -+ VEC_ST(data + 2 * 8, out2); -+ VEC_ST(data + 3 * 8, out3); -+ VEC_ST(data + 4 * 8, out4); -+ VEC_ST(data + 5 * 8, out5); -+ VEC_ST(data + 6 * 8, out6); -+ VEC_ST(data + 7 * 8, out7); -+} -diff --git a/simd/e2k/jfdctint-e2k.c b/simd/e2k/jfdctint-e2k.c -new file mode 100644 -index 0000000..2200852 ---- /dev/null -+++ b/simd/e2k/jfdctint-e2k.c -@@ -0,0 +1,255 @@ -+/* -+ * Elbrus optimizations for libjpeg-turbo -+ * -+ * Copyright (C) 2014, 2020, D. R. Commander. All Rights Reserved. -+ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd -+ * -+ * This software is provided 'as-is', without any express or implied -+ * warranty. In no event will the authors be held liable for any damages -+ * arising from the use of this software. -+ * -+ * Permission is granted to anyone to use this software for any purpose, -+ * including commercial applications, and to alter it and redistribute it -+ * freely, subject to the following restrictions: -+ * -+ * 1. The origin of this software must not be misrepresented; you must not -+ * claim that you wrote the original software. If you use this software -+ * in a product, an acknowledgment in the product documentation would be -+ * appreciated but is not required. -+ * 2. Altered source versions must be plainly marked as such, and must not be -+ * misrepresented as being the original software. -+ * 3. This notice may not be removed or altered from any source distribution. 
-+ */ -+ -+/* ACCURATE INTEGER FORWARD DCT */ -+ -+#include "jsimd_e2k.h" -+ -+ -+#define F_0_298 2446 /* FIX(0.298631336) */ -+#define F_0_390 3196 /* FIX(0.390180644) */ -+#define F_0_541 4433 /* FIX(0.541196100) */ -+#define F_0_765 6270 /* FIX(0.765366865) */ -+#define F_0_899 7373 /* FIX(0.899976223) */ -+#define F_1_175 9633 /* FIX(1.175875602) */ -+#define F_1_501 12299 /* FIX(1.501321110) */ -+#define F_1_847 15137 /* FIX(1.847759065) */ -+#define F_1_961 16069 /* FIX(1.961570560) */ -+#define F_2_053 16819 /* FIX(2.053119869) */ -+#define F_2_562 20995 /* FIX(2.562915447) */ -+#define F_3_072 25172 /* FIX(3.072711026) */ -+ -+#define CONST_BITS 13 -+#define PASS1_BITS 2 -+#define DESCALE_P1 (CONST_BITS - PASS1_BITS) -+#define DESCALE_P2 (CONST_BITS + PASS1_BITS) -+ -+ -+#define DO_FDCT_COMMON(PASS) { \ -+ /* (Original) \ -+ * z1 = (tmp12 + tmp13) * 0.541196100; \ -+ * data2 = z1 + tmp13 * 0.765366865; \ -+ * data6 = z1 + tmp12 * -1.847759065; \ -+ * \ -+ * (This implementation) \ -+ * data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; \ -+ * data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); \ -+ */ \ -+ \ -+ tmp1312l = _mm_unpacklo_epi16(tmp13, tmp12); \ -+ tmp1312h = _mm_unpackhi_epi16(tmp13, tmp12); \ -+ \ -+ out2l = _mm_add_epi32(_mm_madd_epi16(tmp1312l, pw_f130_f054), pd_descale_p##PASS); \ -+ out2h = _mm_add_epi32(_mm_madd_epi16(tmp1312h, pw_f130_f054), pd_descale_p##PASS); \ -+ out6l = _mm_add_epi32(_mm_madd_epi16(tmp1312l, pw_f054_mf130), pd_descale_p##PASS); \ -+ out6h = _mm_add_epi32(_mm_madd_epi16(tmp1312h, pw_f054_mf130), pd_descale_p##PASS); \ -+ \ -+ out2l = _mm_srai_epi32(out2l, DESCALE_P##PASS); \ -+ out2h = _mm_srai_epi32(out2h, DESCALE_P##PASS); \ -+ out6l = _mm_srai_epi32(out6l, DESCALE_P##PASS); \ -+ out6h = _mm_srai_epi32(out6h, DESCALE_P##PASS); \ -+ \ -+ out2 = _mm_packs_epi32(out2l, out2h); \ -+ out6 = _mm_packs_epi32(out6l, out6h); \ -+ \ -+ /* Odd part */ \ -+ \ -+ z3 = _mm_add_epi16(tmp4, tmp6); \ -+ z4 = _mm_add_epi16(tmp5, tmp7); \ -+ \ -+ /* (Original) \ -+ * z5 = (z3 + z4) * 1.175875602; \ -+ * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; \ -+ * z3 += z5; z4 += z5; \ -+ * \ -+ * (This implementation) \ -+ * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \ -+ * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \ -+ */ \ -+ \ -+ z34l = _mm_unpacklo_epi16(z3, z4); \ -+ z34h = _mm_unpackhi_epi16(z3, z4); \ -+ \ -+ z3l = _mm_add_epi32(_mm_madd_epi16(z34l, pw_mf078_f117), pd_descale_p##PASS); \ -+ z3h = _mm_add_epi32(_mm_madd_epi16(z34h, pw_mf078_f117), pd_descale_p##PASS); \ -+ z4l = _mm_add_epi32(_mm_madd_epi16(z34l, pw_f117_f078), pd_descale_p##PASS); \ -+ z4h = _mm_add_epi32(_mm_madd_epi16(z34h, pw_f117_f078), pd_descale_p##PASS); \ -+ \ -+ /* (Original) \ -+ * z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; \ -+ * tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; \ -+ * tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; \ -+ * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; \ -+ * data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; \ -+ * data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; \ -+ * \ -+ * (This implementation) \ -+ * tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; \ -+ * tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; \ -+ * tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); \ -+ * tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); \ -+ * data7 = tmp4 + z3; data5 = tmp5 + z4; \ -+ * data3 = tmp6 + z3; data1 = tmp7 + z4; \ -+ */ \ -+ \ 
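-+ /* _mm_madd_epi16() multiplies interleaved 16-bit pairs and sums each \
-+  * pair into one 32-bit lane, which is why each pw_* constant above \
-+  * packs the two coefficients of a rewritten product. */ \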
-+ tmp47l = _mm_unpacklo_epi16(tmp4, tmp7); \ -+ tmp47h = _mm_unpackhi_epi16(tmp4, tmp7); \ -+ \ -+ out7l = _mm_add_epi32(_mm_madd_epi16(tmp47l, pw_mf060_mf089), z3l); \ -+ out7h = _mm_add_epi32(_mm_madd_epi16(tmp47h, pw_mf060_mf089), z3h); \ -+ out1l = _mm_add_epi32(_mm_madd_epi16(tmp47l, pw_mf089_f060), z4l); \ -+ out1h = _mm_add_epi32(_mm_madd_epi16(tmp47h, pw_mf089_f060), z4h); \ -+ \ -+ out7l = _mm_srai_epi32(out7l, DESCALE_P##PASS); \ -+ out7h = _mm_srai_epi32(out7h, DESCALE_P##PASS); \ -+ out1l = _mm_srai_epi32(out1l, DESCALE_P##PASS); \ -+ out1h = _mm_srai_epi32(out1h, DESCALE_P##PASS); \ -+ \ -+ out7 = _mm_packs_epi32(out7l, out7h); \ -+ out1 = _mm_packs_epi32(out1l, out1h); \ -+ \ -+ tmp56l = _mm_unpacklo_epi16(tmp5, tmp6); \ -+ tmp56h = _mm_unpackhi_epi16(tmp5, tmp6); \ -+ \ -+ out5l = _mm_add_epi32(_mm_madd_epi16(tmp56l, pw_mf050_mf256), z4l); \ -+ out5h = _mm_add_epi32(_mm_madd_epi16(tmp56h, pw_mf050_mf256), z4h); \ -+ out3l = _mm_add_epi32(_mm_madd_epi16(tmp56l, pw_mf256_f050), z3l); \ -+ out3h = _mm_add_epi32(_mm_madd_epi16(tmp56h, pw_mf256_f050), z3h); \ -+ \ -+ out5l = _mm_srai_epi32(out5l, DESCALE_P##PASS); \ -+ out5h = _mm_srai_epi32(out5h, DESCALE_P##PASS); \ -+ out3l = _mm_srai_epi32(out3l, DESCALE_P##PASS); \ -+ out3h = _mm_srai_epi32(out3h, DESCALE_P##PASS); \ -+ \ -+ out5 = _mm_packs_epi32(out5l, out5h); \ -+ out3 = _mm_packs_epi32(out3l, out3h); \ -+} -+ -+#define DO_FDCT_PASS1() { \ -+ /* Even part */ \ -+ \ -+ tmp10 = _mm_add_epi16(tmp0, tmp3); \ -+ tmp13 = _mm_sub_epi16(tmp0, tmp3); \ -+ tmp11 = _mm_add_epi16(tmp1, tmp2); \ -+ tmp12 = _mm_sub_epi16(tmp1, tmp2); \ -+ \ -+ out0 = _mm_add_epi16(tmp10, tmp11); \ -+ out0 = _mm_slli_epi16(out0, PASS1_BITS); \ -+ out4 = _mm_sub_epi16(tmp10, tmp11); \ -+ out4 = _mm_slli_epi16(out4, PASS1_BITS); \ -+ \ -+ DO_FDCT_COMMON(1); \ -+} -+ -+#define DO_FDCT_PASS2() { \ -+ /* Even part */ \ -+ \ -+ tmp10 = _mm_add_epi16(tmp0, tmp3); \ -+ tmp13 = _mm_sub_epi16(tmp0, tmp3); \ -+ tmp11 = _mm_add_epi16(tmp1, tmp2); \ -+ tmp12 = _mm_sub_epi16(tmp1, tmp2); \ -+ \ -+ out0 = _mm_add_epi16(tmp10, tmp11); \ -+ out0 = _mm_add_epi16(out0, pw_descale_p2x); \ -+ out0 = _mm_srai_epi16(out0, PASS1_BITS); \ -+ out4 = _mm_sub_epi16(tmp10, tmp11); \ -+ out4 = _mm_add_epi16(out4, pw_descale_p2x); \ -+ out4 = _mm_srai_epi16(out4, PASS1_BITS); \ -+ \ -+ DO_FDCT_COMMON(2); \ -+} -+ -+ -+void jsimd_fdct_islow_e2k(DCTELEM *data) -+{ -+ __m128i row0, row1, row2, row3, row4, row5, row6, row7, -+ col0, col1, col2, col3, col4, col5, col6, col7, -+ tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13, -+ tmp47l, tmp47h, tmp56l, tmp56h, tmp1312l, tmp1312h, -+ z3, z4, z34l, z34h, -+ out0, out1, out2, out3, out4, out5, out6, out7; -+ __m128i z3l, z3h, z4l, z4h, -+ out1l, out1h, out2l, out2h, out3l, out3h, out5l, out5h, out6l, out6h, -+ out7l, out7h; -+ -+ /* Constants */ -+ __m128i pw_f130_f054 = _mm_setr_epi16(__4X2(F_0_541 + F_0_765, F_0_541)), -+ pw_f054_mf130 = _mm_setr_epi16(__4X2(F_0_541, F_0_541 - F_1_847)), -+ pw_mf078_f117 = _mm_setr_epi16(__4X2(F_1_175 - F_1_961, F_1_175)), -+ pw_f117_f078 = _mm_setr_epi16(__4X2(F_1_175, F_1_175 - F_0_390)), -+ pw_mf060_mf089 = _mm_setr_epi16(__4X2(F_0_298 - F_0_899, -F_0_899)), -+ pw_mf089_f060 = _mm_setr_epi16(__4X2(-F_0_899, F_1_501 - F_0_899)), -+ pw_mf050_mf256 = _mm_setr_epi16(__4X2(F_2_053 - F_2_562, -F_2_562)), -+ pw_mf256_f050 = _mm_setr_epi16(__4X2(-F_2_562, F_3_072 - F_2_562)), -+ pw_descale_p2x = _mm_set1_epi16(1 << (PASS1_BITS - 1)), -+ pd_descale_p1 = _mm_set1_epi32(1 << (DESCALE_P1 
- 1)), -+ pd_descale_p2 = _mm_set1_epi32(1 << (DESCALE_P2 - 1)); -+ -+ /* Pass 1: process rows */ -+ -+ row0 = VEC_LD(data + 0 * 8); -+ row1 = VEC_LD(data + 1 * 8); -+ row2 = VEC_LD(data + 2 * 8); -+ row3 = VEC_LD(data + 3 * 8); -+ row4 = VEC_LD(data + 4 * 8); -+ row5 = VEC_LD(data + 5 * 8); -+ row6 = VEC_LD(data + 6 * 8); -+ row7 = VEC_LD(data + 7 * 8); -+ -+ TRANSPOSE(row, col); -+ -+ tmp0 = _mm_add_epi16(col0, col7); -+ tmp7 = _mm_sub_epi16(col0, col7); -+ tmp1 = _mm_add_epi16(col1, col6); -+ tmp6 = _mm_sub_epi16(col1, col6); -+ tmp2 = _mm_add_epi16(col2, col5); -+ tmp5 = _mm_sub_epi16(col2, col5); -+ tmp3 = _mm_add_epi16(col3, col4); -+ tmp4 = _mm_sub_epi16(col3, col4); -+ -+ DO_FDCT_PASS1(); -+ -+ /* Pass 2: process columns */ -+ -+ TRANSPOSE(out, row); -+ -+ tmp0 = _mm_add_epi16(row0, row7); -+ tmp7 = _mm_sub_epi16(row0, row7); -+ tmp1 = _mm_add_epi16(row1, row6); -+ tmp6 = _mm_sub_epi16(row1, row6); -+ tmp2 = _mm_add_epi16(row2, row5); -+ tmp5 = _mm_sub_epi16(row2, row5); -+ tmp3 = _mm_add_epi16(row3, row4); -+ tmp4 = _mm_sub_epi16(row3, row4); -+ -+ DO_FDCT_PASS2(); -+ -+ VEC_ST(data + 0 * 8, out0); -+ VEC_ST(data + 1 * 8, out1); -+ VEC_ST(data + 2 * 8, out2); -+ VEC_ST(data + 3 * 8, out3); -+ VEC_ST(data + 4 * 8, out4); -+ VEC_ST(data + 5 * 8, out5); -+ VEC_ST(data + 6 * 8, out6); -+ VEC_ST(data + 7 * 8, out7); -+} -diff --git a/simd/e2k/jidctflt-e2k.c b/simd/e2k/jidctflt-e2k.c -new file mode 100644 -index 0000000..7682965 ---- /dev/null -+++ b/simd/e2k/jidctflt-e2k.c -@@ -0,0 +1,215 @@ -+/* -+ * Elbrus optimizations for libjpeg-turbo -+ * -+ * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved. -+ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd -+ * -+ * This software is provided 'as-is', without any express or implied -+ * warranty. In no event will the authors be held liable for any damages -+ * arising from the use of this software. -+ * -+ * Permission is granted to anyone to use this software for any purpose, -+ * including commercial applications, and to alter it and redistribute it -+ * freely, subject to the following restrictions: -+ * -+ * 1. The origin of this software must not be misrepresented; you must not -+ * claim that you wrote the original software. If you use this software -+ * in a product, an acknowledgment in the product documentation would be -+ * appreciated but is not required. -+ * 2. Altered source versions must be plainly marked as such, and must not be -+ * misrepresented as being the original software. -+ * 3. This notice may not be removed or altered from any source distribution. 
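The fixed-point kernels above lean on one core idiom: interleave two 16-bit vectors with _mm_unpacklo/hi_epi16, then feed the result to _mm_madd_epi16 against a constant vector of interleaved coefficient pairs (pw_f130_f054 and friends), which yields a[i]*c1 + b[i]*c2 as full 32-bit lanes with no intermediate overflow. A standalone sketch of that idiom with illustrative values (compiles with any SSE2-capable compiler; on e2k the same intrinsics come from the SSE emulation headers this patch includes):

    #include <stdio.h>
    #include <emmintrin.h>

    int main(void)
    {
        __m128i a = _mm_set1_epi16(1000);
        __m128i b = _mm_set1_epi16(-2000);
        /* interleaved coefficient pair, like pw_f130_f054 above */
        __m128i c = _mm_setr_epi16(4433, 6270, 4433, 6270,
                                   4433, 6270, 4433, 6270);
        __m128i lo = _mm_unpacklo_epi16(a, b);  /* a0,b0,a1,b1,... */
        __m128i r = _mm_madd_epi16(lo, c);      /* a[i]*4433 + b[i]*6270 */
        int out[4];
        _mm_storeu_si128((__m128i *)out, r);
        printf("%d\n", out[0]);  /* 1000*4433 - 2000*6270 = -8107000 */
        return 0;
    }

The 32-bit accumulators are then rounded and narrowed back to 16 bits by the _mm_srai_epi32/_mm_packs_epi32 sequences seen in DO_FDCT_COMMON.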
-+ */ -+ -+/* FLOAT INVERSE DCT */ -+ -+#include "jsimd_e2k.h" -+ -+#define DO_IDCT(in, out) { \ -+ /* Even part */ \ -+ \ -+ tmp10 = _mm_add_ps(in##0, in##4); \ -+ tmp11 = _mm_sub_ps(in##0, in##4); \ -+ \ -+ tmp13 = _mm_add_ps(in##2, in##6); \ -+ tmp12 = _mm_sub_ps(in##2, in##6); \ -+ tmp12 = _mm_sub_ps(_mm_mul_ps(tmp12, pd_f1414), tmp13); \ -+ \ -+ tmp0 = _mm_add_ps(tmp10, tmp13); \ -+ tmp3 = _mm_sub_ps(tmp10, tmp13); \ -+ tmp1 = _mm_add_ps(tmp11, tmp12); \ -+ tmp2 = _mm_sub_ps(tmp11, tmp12); \ -+ \ -+ /* Odd part */ \ -+ \ -+ z13 = _mm_add_ps(in##5, in##3); \ -+ z10 = _mm_sub_ps(in##5, in##3); \ -+ z11 = _mm_add_ps(in##1, in##7); \ -+ z12 = _mm_sub_ps(in##1, in##7); \ -+ \ -+ tmp7 = _mm_add_ps(z11, z13); \ -+ tmp11 = _mm_sub_ps(z11, z13); \ -+ tmp11 = _mm_mul_ps(tmp11, pd_f1414); \ -+ \ -+ z5 = _mm_mul_ps(_mm_add_ps(z10, z12), pd_f1847); \ -+ tmp10 = _mm_sub_ps(z5, _mm_mul_ps(z12, pd_f1082)); \ -+ tmp12 = _mm_sub_ps(z5, _mm_mul_ps(z10, pd_f2613)); \ -+ \ -+ tmp6 = _mm_sub_ps(tmp12, tmp7); \ -+ tmp5 = _mm_sub_ps(tmp11, tmp6); \ -+ tmp4 = _mm_sub_ps(tmp10, tmp5); \ -+ \ -+ out##0 = _mm_add_ps(tmp0, tmp7); \ -+ out##7 = _mm_sub_ps(tmp0, tmp7); \ -+ out##1 = _mm_add_ps(tmp1, tmp6); \ -+ out##6 = _mm_sub_ps(tmp1, tmp6); \ -+ out##2 = _mm_add_ps(tmp2, tmp5); \ -+ out##5 = _mm_sub_ps(tmp2, tmp5); \ -+ out##3 = _mm_add_ps(tmp3, tmp4); \ -+ out##4 = _mm_sub_ps(tmp3, tmp4); \ -+} -+ -+#define QUANT_MUL(a, b, c, d, l, lo, i) \ -+ out0 = _mm_srai_epi32(_mm_unpack##lo##_epi16(col##a, col##a), 16); \ -+ out1 = _mm_srai_epi32(_mm_unpack##lo##_epi16(col##b, col##b), 16); \ -+ out2 = _mm_srai_epi32(_mm_unpack##lo##_epi16(col##c, col##c), 16); \ -+ out3 = _mm_srai_epi32(_mm_unpack##lo##_epi16(col##d, col##d), 16); \ -+ l##a = _mm_cvtepi32_ps(out0); \ -+ l##b = _mm_cvtepi32_ps(out1); \ -+ l##c = _mm_cvtepi32_ps(out2); \ -+ l##d = _mm_cvtepi32_ps(out3); \ -+ l##a = _mm_mul_ps(l##a, _mm_load_ps(dct_table + a * 8 + i)); \ -+ l##b = _mm_mul_ps(l##b, _mm_load_ps(dct_table + b * 8 + i)); \ -+ l##c = _mm_mul_ps(l##c, _mm_load_ps(dct_table + c * 8 + i)); \ -+ l##d = _mm_mul_ps(l##d, _mm_load_ps(dct_table + d * 8 + i)); -+ -+ -+void jsimd_idct_float_e2k(void *dct_table_, JCOEFPTR coef_block, -+ JSAMPARRAY output_buf, JDIMENSION output_col) -+{ -+ float *dct_table = (float *)dct_table_; -+ -+ __m128i col0, col1, col2, col3, col4, col5, col6, col7, -+ out0, out1, out2, out3, out4, out5, out6, out7, row0, row1, row2, row3; -+ __m128 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, -+ tmp10, tmp11, tmp12, tmp13, z5, z10, z11, z12, z13; -+ __m128 l0, l1, l2, l3, l4, l5, l6, l7; -+ __m128 h0, h1, h2, h3, h4, h5, h6, h7; -+ __m128 x0, x1, x2, x3, x4, x5, x6, x7; -+ __m128 y0, y1, y2, y3, y4, y5, y6, y7; -+ -+ /* Constants */ -+ __m128 pd_f1414 = _mm_set1_ps(1.414213562f), -+ pd_f1847 = _mm_set1_ps(1.847759065f), -+ pd_f1082 = _mm_set1_ps(1.082392200f), -+ pd_f2613 = _mm_set1_ps(2.613125930f); -+ -+ /* Pass 1: process columns */ -+ -+ col0 = VEC_LD(coef_block + 0 * 8); -+ col1 = VEC_LD(coef_block + 1 * 8); -+ col2 = VEC_LD(coef_block + 2 * 8); -+ col3 = VEC_LD(coef_block + 3 * 8); -+ col4 = VEC_LD(coef_block + 4 * 8); -+ col5 = VEC_LD(coef_block + 5 * 8); -+ col6 = VEC_LD(coef_block + 6 * 8); -+ col7 = VEC_LD(coef_block + 7 * 8); -+ -+ out1 = _mm_or_si128(col1, col2); -+ out2 = _mm_or_si128(col3, col4); -+ out1 = _mm_or_si128(out1, out2); -+ out3 = _mm_or_si128(col5, col6); -+ out3 = _mm_or_si128(out3, col7); -+ out1 = _mm_or_si128(out1, out3); -+ -+ if (VEC_ISZERO(out1)) { -+ /* AC terms all zero */ -+ -+ out0 = 
_mm_srai_epi32(_mm_unpacklo_epi16(col0, col0), 16);
-+  out1 = _mm_srai_epi32(_mm_unpackhi_epi16(col0, col0), 16);
-+  tmp0 = _mm_cvtepi32_ps(out0);
-+  tmp1 = _mm_cvtepi32_ps(out1);
-+  tmp0 = _mm_mul_ps(tmp0, _mm_load_ps(dct_table));
-+  tmp1 = _mm_mul_ps(tmp1, _mm_load_ps(dct_table + 4));
-+
-+  l0 = h0 = _mm_shuffle_ps(tmp0, tmp0, 0x00);
-+  l1 = h1 = _mm_shuffle_ps(tmp0, tmp0, 0x55);
-+  l2 = h2 = _mm_shuffle_ps(tmp0, tmp0, 0xaa);
-+  l3 = h3 = _mm_shuffle_ps(tmp0, tmp0, 0xff);
-+  l4 = h4 = _mm_shuffle_ps(tmp1, tmp1, 0x00);
-+  l5 = h5 = _mm_shuffle_ps(tmp1, tmp1, 0x55);
-+  l6 = h6 = _mm_shuffle_ps(tmp1, tmp1, 0xaa);
-+  l7 = h7 = _mm_shuffle_ps(tmp1, tmp1, 0xff);
-+
-+  } else {
-+
-+  QUANT_MUL(0, 2, 4, 6, l, lo, 0)
-+  QUANT_MUL(1, 3, 5, 7, l, lo, 0)
-+  DO_IDCT(l, x);
-+
-+  QUANT_MUL(0, 2, 4, 6, h, hi, 4)
-+  QUANT_MUL(1, 3, 5, 7, h, hi, 4)
-+  DO_IDCT(h, y);
-+
-+  TRANSPOSE_FLOAT(x0, x1, x2, x3, l0, l1, l2, l3)
-+  TRANSPOSE_FLOAT(x4, x5, x6, x7, h0, h1, h2, h3)
-+  TRANSPOSE_FLOAT(y0, y1, y2, y3, l4, l5, l6, l7)
-+  TRANSPOSE_FLOAT(y4, y5, y6, y7, h4, h5, h6, h7)
-+  }
-+
-+  /* Pass 2: process rows */
-+
-+  DO_IDCT(l, x);
-+  DO_IDCT(h, y);
-+
-+#ifdef JSIMD_SAME_ROUNDING
-+#define OUT_ROUND(i) \
-+  tmp0 = _mm_add_ps(_mm_mul_ps(x##i, pd_f0125), pd_cj_rnd); \
-+  tmp1 = _mm_add_ps(_mm_mul_ps(y##i, pd_f0125), pd_cj_rnd); \
-+  out##i = _mm_packs_epi32(_mm_cvttps_epi32(tmp0), _mm_cvttps_epi32(tmp1));
-+
-+  {
-+  __m128 pd_cj_rnd = _mm_set1_ps(0.5f + CENTERJSAMPLE),
-+  pd_f0125 = _mm_set1_ps(0.125f);
-+
-+  OUT_ROUND(0) OUT_ROUND(1)
-+  OUT_ROUND(2) OUT_ROUND(3)
-+  OUT_ROUND(4) OUT_ROUND(5)
-+  OUT_ROUND(6) OUT_ROUND(7)
-+  }
-+  row0 = _mm_packus_epi16(out0, out1);
-+  row1 = _mm_packus_epi16(out2, out3);
-+  row2 = _mm_packus_epi16(out4, out5);
-+  row3 = _mm_packus_epi16(out6, out7);
-+
-+  TRANSPOSE8(row, col) TRANSPOSE8(col, row) TRANSPOSE8(row, col)
-+#else /* faster, but rounding differs slightly */
-+#define OUT_ROUND(i, a, b) out##i = _mm_blendv_epi8( \
-+  _mm_slli_epi32(_mm_castps_si128(_mm_add_ps(b, pd_round)), 16), \
-+  _mm_castps_si128(_mm_add_ps(a, pd_round)), pd_mask);
-+
-+  {
-+  __m128i pd_mask = _mm_set1_epi32(0xffff);
-+  __m128 pd_round = _mm_set1_ps((3 << 22 | CENTERJSAMPLE) * 8);
-+
-+  OUT_ROUND(0, x0, x4) OUT_ROUND(1, y0, y4)
-+  OUT_ROUND(2, x1, x5) OUT_ROUND(3, y1, y5)
-+  OUT_ROUND(4, x2, x6) OUT_ROUND(5, y2, y6)
-+  OUT_ROUND(6, x3, x7) OUT_ROUND(7, y3, y7)
-+  }
-+  row0 = _mm_packus_epi16(out0, out1);
-+  row1 = _mm_packus_epi16(out2, out3);
-+  row2 = _mm_packus_epi16(out4, out5);
-+  row3 = _mm_packus_epi16(out6, out7);
-+
-+  TRANSPOSE8(row, out) TRANSPOSE8(out, col)
-+#endif
-+  VEC_STL(output_buf[0] + output_col, col0);
-+  VEC_STH(output_buf[1] + output_col, col0);
-+  VEC_STL(output_buf[2] + output_col, col1);
-+  VEC_STH(output_buf[3] + output_col, col1);
-+  VEC_STL(output_buf[4] + output_col, col2);
-+  VEC_STH(output_buf[5] + output_col, col2);
-+  VEC_STL(output_buf[6] + output_col, col3);
-+  VEC_STH(output_buf[7] + output_col, col3);
-+}
-diff --git a/simd/e2k/jidctfst-e2k.c b/simd/e2k/jidctfst-e2k.c
-new file mode 100644
-index 0000000..18bc425
---- /dev/null
-+++ b/simd/e2k/jidctfst-e2k.c
-@@ -0,0 +1,187 @@
-+/*
-+ * Elbrus optimizations for libjpeg-turbo
-+ *
-+ * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved.
-+ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd
-+ *
-+ * This software is provided 'as-is', without any express or implied
-+ * warranty. In no event will the authors be held liable for any damages
-+ * arising from the use of this software.
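The non-JSIMD_SAME_ROUNDING branch of OUT_ROUND above relies on the classic magic-bias conversion: adding a large constant to a float forces the rounded integer result into the low mantissa bits, so no convert instruction is needed. A scalar sketch of the core step only (the SIMD constant above additionally folds in CENTERJSAMPLE and the final divide-by-8 scaling, and the packing via _mm_slli_epi32/_mm_blendv_epi8 is specific to that code):

    #include <stdint.h>
    #include <stdio.h>

    /* For |x| < 2^21: add 1.5 * 2^23 so the float's ULP becomes 1.0;
       the nearest integer then sits in the low mantissa bits. */
    static int32_t round_via_bias(float x)
    {
        union { float f; int32_t i; } u;
        u.f = x + 12582912.0f;    /* 1.5 * 2^23 */
        return u.i - 0x4B400000;  /* bit pattern of 12582912.0f */
    }

    int main(void)
    {
        /* rounds to nearest, ties to even: prints 1 -4 128 */
        printf("%d %d %d\n", round_via_bias(1.25f),
               round_via_bias(-3.75f), round_via_bias(127.5f));
        return 0;
    }

This is also why the branch is commented as rounding slightly differently: the bias trick rounds ties to even, while the JSIMD_SAME_ROUNDING path truncates after adding 0.5.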
-+ * -+ * Permission is granted to anyone to use this software for any purpose, -+ * including commercial applications, and to alter it and redistribute it -+ * freely, subject to the following restrictions: -+ * -+ * 1. The origin of this software must not be misrepresented; you must not -+ * claim that you wrote the original software. If you use this software -+ * in a product, an acknowledgment in the product documentation would be -+ * appreciated but is not required. -+ * 2. Altered source versions must be plainly marked as such, and must not be -+ * misrepresented as being the original software. -+ * 3. This notice may not be removed or altered from any source distribution. -+ */ -+ -+/* FAST INTEGER INVERSE DCT */ -+ -+#include "jsimd_e2k.h" -+ -+ -+#define F_1_082 277 /* FIX(1.082392200) */ -+#define F_1_414 362 /* FIX(1.414213562) */ -+#define F_1_847 473 /* FIX(1.847759065) */ -+#define F_2_613 669 /* FIX(2.613125930) */ -+#define F_1_613 (F_2_613 - 256) /* FIX(2.613125930) - FIX(1) */ -+ -+#define CONST_BITS 8 -+#define PASS1_BITS 2 -+#define PRE_MULTIPLY_SCALE_BITS 2 -+#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) -+ -+ -+#define DO_IDCT(in) { \ -+ /* Even part */ \ -+ \ -+ tmp10 = _mm_add_epi16(in##0, in##4); \ -+ tmp11 = _mm_sub_epi16(in##0, in##4); \ -+ tmp13 = _mm_add_epi16(in##2, in##6); \ -+ \ -+ tmp12 = _mm_sub_epi16(in##2, in##6); \ -+ tmp12 = _mm_slli_epi16(tmp12, PRE_MULTIPLY_SCALE_BITS); \ -+ tmp12 = _mm_mulhi_epi16(tmp12, pw_F1414); \ -+ tmp12 = _mm_sub_epi16(tmp12, tmp13); \ -+ \ -+ tmp0 = _mm_add_epi16(tmp10, tmp13); \ -+ tmp3 = _mm_sub_epi16(tmp10, tmp13); \ -+ tmp1 = _mm_add_epi16(tmp11, tmp12); \ -+ tmp2 = _mm_sub_epi16(tmp11, tmp12); \ -+ \ -+ /* Odd part */ \ -+ \ -+ z13 = _mm_add_epi16(in##5, in##3); \ -+ z10 = _mm_sub_epi16(in##5, in##3); \ -+ z10s = _mm_slli_epi16(z10, PRE_MULTIPLY_SCALE_BITS); \ -+ z11 = _mm_add_epi16(in##1, in##7); \ -+ z12s = _mm_sub_epi16(in##1, in##7); \ -+ z12s = _mm_slli_epi16(z12s, PRE_MULTIPLY_SCALE_BITS); \ -+ \ -+ tmp11 = _mm_sub_epi16(z11, z13); \ -+ tmp11 = _mm_slli_epi16(tmp11, PRE_MULTIPLY_SCALE_BITS); \ -+ tmp11 = _mm_mulhi_epi16(tmp11, pw_F1414); \ -+ \ -+ tmp7 = _mm_add_epi16(z11, z13); \ -+ \ -+ /* To avoid overflow... 
\ -+ * \ -+ * (Original) \ -+ * tmp12 = -2.613125930 * z10 + z5; \ -+ * \ -+ * (This implementation) \ -+ * tmp12 = (-1.613125930 - 1) * z10 + z5; \ -+ * = -1.613125930 * z10 - z10 + z5; \ -+ */ \ -+ \ -+ z5 = _mm_add_epi16(z10s, z12s); \ -+ z5 = _mm_mulhi_epi16(z5, pw_F1847); \ -+ \ -+ tmp10 = _mm_mulhi_epi16(z12s, pw_F1082); \ -+ tmp10 = _mm_sub_epi16(tmp10, z5); \ -+ tmp12 = _mm_add_epi16(_mm_mulhi_epi16(z10s, pw_MF1613), z5); \ -+ tmp12 = _mm_sub_epi16(tmp12, z10); \ -+ \ -+ tmp6 = _mm_sub_epi16(tmp12, tmp7); \ -+ tmp5 = _mm_sub_epi16(tmp11, tmp6); \ -+ tmp4 = _mm_add_epi16(tmp10, tmp5); \ -+ \ -+ out0 = _mm_add_epi16(tmp0, tmp7); \ -+ out1 = _mm_add_epi16(tmp1, tmp6); \ -+ out2 = _mm_add_epi16(tmp2, tmp5); \ -+ out3 = _mm_sub_epi16(tmp3, tmp4); \ -+ out4 = _mm_add_epi16(tmp3, tmp4); \ -+ out5 = _mm_sub_epi16(tmp2, tmp5); \ -+ out6 = _mm_sub_epi16(tmp1, tmp6); \ -+ out7 = _mm_sub_epi16(tmp0, tmp7); \ -+} -+ -+ -+void jsimd_idct_ifast_e2k(void *dct_table_, JCOEFPTR coef_block, -+ JSAMPARRAY output_buf, JDIMENSION output_col) -+{ -+ short *dct_table = (short *)dct_table_; -+ -+ __m128i row0, row1, row2, row3, row4, row5, row6, row7, -+ col0, col1, col2, col3, col4, col5, col6, col7, -+ quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7, -+ tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13, -+ z5, z10, z10s, z11, z12s, z13, -+ out0, out1, out2, out3, out4, out5, out6, out7; -+ -+ /* Constants */ -+ __m128i pw_F1414 = _mm_set1_epi16(F_1_414 << CONST_SHIFT), -+ pw_F1847 = _mm_set1_epi16(F_1_847 << CONST_SHIFT), -+ pw_MF1613 = _mm_set1_epi16(-F_1_613 << CONST_SHIFT), -+ pw_F1082 = _mm_set1_epi16(F_1_082 << CONST_SHIFT); -+ -+ /* Pass 1: process columns */ -+ -+ col0 = VEC_LD(coef_block + 0 * 8); -+ col1 = VEC_LD(coef_block + 1 * 8); -+ col2 = VEC_LD(coef_block + 2 * 8); -+ col3 = VEC_LD(coef_block + 3 * 8); -+ col4 = VEC_LD(coef_block + 4 * 8); -+ col5 = VEC_LD(coef_block + 5 * 8); -+ col6 = VEC_LD(coef_block + 6 * 8); -+ col7 = VEC_LD(coef_block + 7 * 8); -+ -+ tmp1 = _mm_or_si128(col1, col2); -+ tmp2 = _mm_or_si128(col3, col4); -+ tmp1 = _mm_or_si128(tmp1, tmp2); -+ tmp3 = _mm_or_si128(col5, col6); -+ tmp3 = _mm_or_si128(tmp3, col7); -+ tmp1 = _mm_or_si128(tmp1, tmp3); -+ -+ quant0 = VEC_LD(dct_table); -+ col0 = _mm_mullo_epi16(col0, quant0); -+ -+ if (VEC_ISZERO(tmp1)) { -+ /* AC terms all zero */ -+ -+ IDCT_SPLAT8(col0); -+ -+ } else { -+ -+ quant1 = VEC_LD(dct_table + 1 * 8); -+ quant2 = VEC_LD(dct_table + 2 * 8); -+ quant3 = VEC_LD(dct_table + 3 * 8); -+ quant4 = VEC_LD(dct_table + 4 * 8); -+ quant5 = VEC_LD(dct_table + 5 * 8); -+ quant6 = VEC_LD(dct_table + 6 * 8); -+ quant7 = VEC_LD(dct_table + 7 * 8); -+ -+ col1 = _mm_mullo_epi16(col1, quant1); -+ col2 = _mm_mullo_epi16(col2, quant2); -+ col3 = _mm_mullo_epi16(col3, quant3); -+ col4 = _mm_mullo_epi16(col4, quant4); -+ col5 = _mm_mullo_epi16(col5, quant5); -+ col6 = _mm_mullo_epi16(col6, quant6); -+ col7 = _mm_mullo_epi16(col7, quant7); -+ -+ DO_IDCT(col); -+ -+ TRANSPOSE(out, row); -+ } -+ -+ /* Pass 2: process rows */ -+ -+ DO_IDCT(row); -+ -+ out0 = _mm_srai_epi16(out0, PASS1_BITS + 3); -+ out1 = _mm_srai_epi16(out1, PASS1_BITS + 3); -+ out2 = _mm_srai_epi16(out2, PASS1_BITS + 3); -+ out3 = _mm_srai_epi16(out3, PASS1_BITS + 3); -+ out4 = _mm_srai_epi16(out4, PASS1_BITS + 3); -+ out5 = _mm_srai_epi16(out5, PASS1_BITS + 3); -+ out6 = _mm_srai_epi16(out6, PASS1_BITS + 3); -+ out7 = _mm_srai_epi16(out7, PASS1_BITS + 3); -+ -+ IDCT_SAVE(); -+} -diff --git a/simd/e2k/jidctint-e2k.c 
b/simd/e2k/jidctint-e2k.c -new file mode 100644 -index 0000000..7bb79c0 ---- /dev/null -+++ b/simd/e2k/jidctint-e2k.c -@@ -0,0 +1,294 @@ -+/* -+ * Elbrus optimizations for libjpeg-turbo -+ * -+ * Copyright (C) 2014-2015, 2020, D. R. Commander. All Rights Reserved. -+ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd -+ * -+ * This software is provided 'as-is', without any express or implied -+ * warranty. In no event will the authors be held liable for any damages -+ * arising from the use of this software. -+ * -+ * Permission is granted to anyone to use this software for any purpose, -+ * including commercial applications, and to alter it and redistribute it -+ * freely, subject to the following restrictions: -+ * -+ * 1. The origin of this software must not be misrepresented; you must not -+ * claim that you wrote the original software. If you use this software -+ * in a product, an acknowledgment in the product documentation would be -+ * appreciated but is not required. -+ * 2. Altered source versions must be plainly marked as such, and must not be -+ * misrepresented as being the original software. -+ * 3. This notice may not be removed or altered from any source distribution. -+ */ -+ -+/* ACCURATE INTEGER INVERSE DCT */ -+ -+#include "jsimd_e2k.h" -+ -+ -+#define F_0_298 2446 /* FIX(0.298631336) */ -+#define F_0_390 3196 /* FIX(0.390180644) */ -+#define F_0_541 4433 /* FIX(0.541196100) */ -+#define F_0_765 6270 /* FIX(0.765366865) */ -+#define F_0_899 7373 /* FIX(0.899976223) */ -+#define F_1_175 9633 /* FIX(1.175875602) */ -+#define F_1_501 12299 /* FIX(1.501321110) */ -+#define F_1_847 15137 /* FIX(1.847759065) */ -+#define F_1_961 16069 /* FIX(1.961570560) */ -+#define F_2_053 16819 /* FIX(2.053119869) */ -+#define F_2_562 20995 /* FIX(2.562915447) */ -+#define F_3_072 25172 /* FIX(3.072711026) */ -+ -+#define CONST_BITS 13 -+#define PASS1_BITS 2 -+#define DESCALE_P1 (CONST_BITS - PASS1_BITS) -+#define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3) -+ -+ -+#define DO_IDCT(in, PASS) { \ -+ /* Even part \ -+ * \ -+ * (Original) \ -+ * z1 = (z2 + z3) * 0.541196100; \ -+ * tmp2 = z1 + z3 * -1.847759065; \ -+ * tmp3 = z1 + z2 * 0.765366865; \ -+ * \ -+ * (This implementation) \ -+ * tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); \ -+ * tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; \ -+ */ \ -+ \ -+ in##26l = _mm_unpacklo_epi16(in##2, in##6); \ -+ in##26h = _mm_unpackhi_epi16(in##2, in##6); \ -+ \ -+ tmp3l = _mm_madd_epi16(in##26l, pw_f130_f054); \ -+ tmp3h = _mm_madd_epi16(in##26h, pw_f130_f054); \ -+ tmp2l = _mm_madd_epi16(in##26l, pw_f054_mf130); \ -+ tmp2h = _mm_madd_epi16(in##26h, pw_f054_mf130); \ -+ \ -+ tmp0 = _mm_add_epi16(in##0, in##4); \ -+ tmp1 = _mm_sub_epi16(in##0, in##4); \ -+ \ -+ tmp0l = _mm_unpacklo_epi16(pw_zero, tmp0); \ -+ tmp0h = _mm_unpackhi_epi16(pw_zero, tmp0); \ -+ tmp0l = _mm_srai_epi32(tmp0l, 16 - CONST_BITS); \ -+ tmp0h = _mm_srai_epi32(tmp0h, 16 - CONST_BITS); \ -+ tmp0l = _mm_add_epi32(tmp0l, pd_descale_p##PASS); \ -+ tmp0h = _mm_add_epi32(tmp0h, pd_descale_p##PASS); \ -+ \ -+ tmp10l = _mm_add_epi32(tmp0l, tmp3l); \ -+ tmp10h = _mm_add_epi32(tmp0h, tmp3h); \ -+ tmp13l = _mm_sub_epi32(tmp0l, tmp3l); \ -+ tmp13h = _mm_sub_epi32(tmp0h, tmp3h); \ -+ \ -+ tmp1l = _mm_unpacklo_epi16(pw_zero, tmp1); \ -+ tmp1h = _mm_unpackhi_epi16(pw_zero, tmp1); \ -+ tmp1l = _mm_srai_epi32(tmp1l, 16 - CONST_BITS); \ -+ tmp1h = _mm_srai_epi32(tmp1h, 16 - CONST_BITS); \ -+ tmp1l = _mm_add_epi32(tmp1l, pd_descale_p##PASS); \ -+ tmp1h = 
_mm_add_epi32(tmp1h, pd_descale_p##PASS); \ -+ \ -+ tmp11l = _mm_add_epi32(tmp1l, tmp2l); \ -+ tmp11h = _mm_add_epi32(tmp1h, tmp2h); \ -+ tmp12l = _mm_sub_epi32(tmp1l, tmp2l); \ -+ tmp12h = _mm_sub_epi32(tmp1h, tmp2h); \ -+ \ -+ /* Odd part */ \ -+ \ -+ z3 = _mm_add_epi16(in##3, in##7); \ -+ z4 = _mm_add_epi16(in##1, in##5); \ -+ \ -+ /* (Original) \ -+ * z5 = (z3 + z4) * 1.175875602; \ -+ * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; \ -+ * z3 += z5; z4 += z5; \ -+ * \ -+ * (This implementation) \ -+ * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \ -+ * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \ -+ */ \ -+ \ -+ z34l = _mm_unpacklo_epi16(z3, z4); \ -+ z34h = _mm_unpackhi_epi16(z3, z4); \ -+ \ -+ z3l = _mm_madd_epi16(z34l, pw_mf078_f117); \ -+ z3h = _mm_madd_epi16(z34h, pw_mf078_f117); \ -+ z4l = _mm_madd_epi16(z34l, pw_f117_f078); \ -+ z4h = _mm_madd_epi16(z34h, pw_f117_f078); \ -+ \ -+ /* (Original) \ -+ * z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; \ -+ * tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; \ -+ * tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; \ -+ * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; \ -+ * tmp0 += z1 + z3; tmp1 += z2 + z4; \ -+ * tmp2 += z2 + z3; tmp3 += z1 + z4; \ -+ * \ -+ * (This implementation) \ -+ * tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; \ -+ * tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; \ -+ * tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); \ -+ * tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); \ -+ * tmp0 += z3; tmp1 += z4; \ -+ * tmp2 += z3; tmp3 += z4; \ -+ */ \ -+ \ -+ in##71l = _mm_unpacklo_epi16(in##7, in##1); \ -+ in##71h = _mm_unpackhi_epi16(in##7, in##1); \ -+ \ -+ tmp0l = _mm_add_epi32(_mm_madd_epi16(in##71l, pw_mf060_mf089), z3l); \ -+ tmp0h = _mm_add_epi32(_mm_madd_epi16(in##71h, pw_mf060_mf089), z3h); \ -+ tmp3l = _mm_add_epi32(_mm_madd_epi16(in##71l, pw_mf089_f060), z4l); \ -+ tmp3h = _mm_add_epi32(_mm_madd_epi16(in##71h, pw_mf089_f060), z4h); \ -+ \ -+ in##53l = _mm_unpacklo_epi16(in##5, in##3); \ -+ in##53h = _mm_unpackhi_epi16(in##5, in##3); \ -+ \ -+ tmp1l = _mm_add_epi32(_mm_madd_epi16(in##53l, pw_mf050_mf256), z4l); \ -+ tmp1h = _mm_add_epi32(_mm_madd_epi16(in##53h, pw_mf050_mf256), z4h); \ -+ tmp2l = _mm_add_epi32(_mm_madd_epi16(in##53l, pw_mf256_f050), z3l); \ -+ tmp2h = _mm_add_epi32(_mm_madd_epi16(in##53h, pw_mf256_f050), z3h); \ -+ \ -+ /* Final output stage */ \ -+ \ -+ out0l = _mm_add_epi32(tmp10l, tmp3l); \ -+ out0h = _mm_add_epi32(tmp10h, tmp3h); \ -+ out7l = _mm_sub_epi32(tmp10l, tmp3l); \ -+ out7h = _mm_sub_epi32(tmp10h, tmp3h); \ -+ \ -+ out0l = _mm_srai_epi32(out0l, DESCALE_P##PASS); \ -+ out0h = _mm_srai_epi32(out0h, DESCALE_P##PASS); \ -+ out7l = _mm_srai_epi32(out7l, DESCALE_P##PASS); \ -+ out7h = _mm_srai_epi32(out7h, DESCALE_P##PASS); \ -+ \ -+ out0 = _mm_packs_epi32(out0l, out0h); \ -+ out7 = _mm_packs_epi32(out7l, out7h); \ -+ \ -+ out1l = _mm_add_epi32(tmp11l, tmp2l); \ -+ out1h = _mm_add_epi32(tmp11h, tmp2h); \ -+ out6l = _mm_sub_epi32(tmp11l, tmp2l); \ -+ out6h = _mm_sub_epi32(tmp11h, tmp2h); \ -+ \ -+ out1l = _mm_srai_epi32(out1l, DESCALE_P##PASS); \ -+ out1h = _mm_srai_epi32(out1h, DESCALE_P##PASS); \ -+ out6l = _mm_srai_epi32(out6l, DESCALE_P##PASS); \ -+ out6h = _mm_srai_epi32(out6h, DESCALE_P##PASS); \ -+ \ -+ out1 = _mm_packs_epi32(out1l, out1h); \ -+ out6 = _mm_packs_epi32(out6l, out6h); \ -+ \ -+ out2l = _mm_add_epi32(tmp12l, tmp1l); \ -+ out2h = _mm_add_epi32(tmp12h, tmp1h); \ -+ out5l 
= _mm_sub_epi32(tmp12l, tmp1l); \ -+ out5h = _mm_sub_epi32(tmp12h, tmp1h); \ -+ \ -+ out2l = _mm_srai_epi32(out2l, DESCALE_P##PASS); \ -+ out2h = _mm_srai_epi32(out2h, DESCALE_P##PASS); \ -+ out5l = _mm_srai_epi32(out5l, DESCALE_P##PASS); \ -+ out5h = _mm_srai_epi32(out5h, DESCALE_P##PASS); \ -+ \ -+ out2 = _mm_packs_epi32(out2l, out2h); \ -+ out5 = _mm_packs_epi32(out5l, out5h); \ -+ \ -+ out3l = _mm_add_epi32(tmp13l, tmp0l); \ -+ out3h = _mm_add_epi32(tmp13h, tmp0h); \ -+ out4l = _mm_sub_epi32(tmp13l, tmp0l); \ -+ out4h = _mm_sub_epi32(tmp13h, tmp0h); \ -+ \ -+ out3l = _mm_srai_epi32(out3l, DESCALE_P##PASS); \ -+ out3h = _mm_srai_epi32(out3h, DESCALE_P##PASS); \ -+ out4l = _mm_srai_epi32(out4l, DESCALE_P##PASS); \ -+ out4h = _mm_srai_epi32(out4h, DESCALE_P##PASS); \ -+ \ -+ out3 = _mm_packs_epi32(out3l, out3h); \ -+ out4 = _mm_packs_epi32(out4l, out4h); \ -+} -+ -+ -+void jsimd_idct_islow_e2k(void *dct_table_, JCOEFPTR coef_block, -+ JSAMPARRAY output_buf, JDIMENSION output_col) -+{ -+ short *dct_table = (short *)dct_table_; -+ -+ __m128i row0, row1, row2, row3, row4, row5, row6, row7, -+ col0, col1, col2, col3, col4, col5, col6, col7, -+ quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7, -+ tmp0, tmp1, tmp2, tmp3, z3, z4, -+ z34l, z34h, col71l, col71h, col26l, col26h, col53l, col53h, -+ row71l, row71h, row26l, row26h, row53l, row53h, -+ out0, out1, out2, out3, out4, out5, out6, out7; -+ __m128i tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h, -+ tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h, -+ z3l, z3h, z4l, z4h, -+ out0l, out0h, out1l, out1h, out2l, out2h, out3l, out3h, out4l, out4h, -+ out5l, out5h, out6l, out6h, out7l, out7h; -+ -+ /* Constants */ -+ __m128i pw_zero = _mm_setzero_si128(), -+ pw_f130_f054 = _mm_setr_epi16(__4X2(F_0_541 + F_0_765, F_0_541)), -+ pw_f054_mf130 = _mm_setr_epi16(__4X2(F_0_541, F_0_541 - F_1_847)), -+ pw_mf078_f117 = _mm_setr_epi16(__4X2(F_1_175 - F_1_961, F_1_175)), -+ pw_f117_f078 = _mm_setr_epi16(__4X2(F_1_175, F_1_175 - F_0_390)), -+ pw_mf060_mf089 = _mm_setr_epi16(__4X2(F_0_298 - F_0_899, -F_0_899)), -+ pw_mf089_f060 = _mm_setr_epi16(__4X2(-F_0_899, F_1_501 - F_0_899)), -+ pw_mf050_mf256 = _mm_setr_epi16(__4X2(F_2_053 - F_2_562, -F_2_562)), -+ pw_mf256_f050 = _mm_setr_epi16(__4X2(-F_2_562, F_3_072 - F_2_562)), -+ pd_descale_p1 = _mm_set1_epi32(1 << (DESCALE_P1 - 1)), -+ pd_descale_p2 = _mm_set1_epi32(1 << (DESCALE_P2 - 1)); -+ -+ /* Pass 1: process columns */ -+ -+ col0 = VEC_LD(coef_block + 0 * 8); -+ col1 = VEC_LD(coef_block + 1 * 8); -+ col2 = VEC_LD(coef_block + 2 * 8); -+ col3 = VEC_LD(coef_block + 3 * 8); -+ col4 = VEC_LD(coef_block + 4 * 8); -+ col5 = VEC_LD(coef_block + 5 * 8); -+ col6 = VEC_LD(coef_block + 6 * 8); -+ col7 = VEC_LD(coef_block + 7 * 8); -+ -+ tmp1 = _mm_or_si128(col1, col2); -+ tmp2 = _mm_or_si128(col3, col4); -+ tmp1 = _mm_or_si128(tmp1, tmp2); -+ tmp3 = _mm_or_si128(col5, col6); -+ tmp3 = _mm_or_si128(tmp3, col7); -+ tmp1 = _mm_or_si128(tmp1, tmp3); -+ -+ quant0 = VEC_LD(dct_table); -+ col0 = _mm_mullo_epi16(col0, quant0); -+ -+ if (VEC_ISZERO(tmp1)) { -+ /* AC terms all zero */ -+ -+ col0 = _mm_slli_epi16(col0, PASS1_BITS); -+ IDCT_SPLAT8(col0); -+ -+ } else { -+ -+ quant1 = VEC_LD(dct_table + 1 * 8); -+ quant2 = VEC_LD(dct_table + 2 * 8); -+ quant3 = VEC_LD(dct_table + 3 * 8); -+ quant4 = VEC_LD(dct_table + 4 * 8); -+ quant5 = VEC_LD(dct_table + 5 * 8); -+ quant6 = VEC_LD(dct_table + 6 * 8); -+ quant7 = VEC_LD(dct_table + 7 * 8); -+ -+ col1 = _mm_mullo_epi16(col1, quant1); -+ col2 = 
_mm_mullo_epi16(col2, quant2); -+ col3 = _mm_mullo_epi16(col3, quant3); -+ col4 = _mm_mullo_epi16(col4, quant4); -+ col5 = _mm_mullo_epi16(col5, quant5); -+ col6 = _mm_mullo_epi16(col6, quant6); -+ col7 = _mm_mullo_epi16(col7, quant7); -+ -+ DO_IDCT(col, 1); -+ -+ TRANSPOSE(out, row); -+ } -+ -+ /* Pass 2: process rows */ -+ -+ DO_IDCT(row, 2); -+ -+ IDCT_SAVE(); -+} -diff --git a/simd/e2k/jquantf-e2k.c b/simd/e2k/jquantf-e2k.c -new file mode 100644 -index 0000000..106e99a ---- /dev/null -+++ b/simd/e2k/jquantf-e2k.c -@@ -0,0 +1,121 @@ -+/* -+ * Elbrus optimizations for libjpeg-turbo -+ * -+ * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved. -+ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd -+ * -+ * This software is provided 'as-is', without any express or implied -+ * warranty. In no event will the authors be held liable for any damages -+ * arising from the use of this software. -+ * -+ * Permission is granted to anyone to use this software for any purpose, -+ * including commercial applications, and to alter it and redistribute it -+ * freely, subject to the following restrictions: -+ * -+ * 1. The origin of this software must not be misrepresented; you must not -+ * claim that you wrote the original software. If you use this software -+ * in a product, an acknowledgment in the product documentation would be -+ * appreciated but is not required. -+ * 2. Altered source versions must be plainly marked as such, and must not be -+ * misrepresented as being the original software. -+ * 3. This notice may not be removed or altered from any source distribution. -+ */ -+ -+/* FLOAT QUANTIZATION AND SAMPLE CONVERSION */ -+ -+#include "jsimd_e2k.h" -+ -+#define LOAD_ROW(row) in##row = VEC_LD8(sample_data[row] + start_col) -+#define STORE_ROW(i) \ -+ in0 = _mm_unpacklo_epi16(out##i, pb_zero); \ -+ in1 = _mm_unpackhi_epi16(out##i, pb_zero); \ -+ in0 = _mm_sub_epi32(in0, pd_cj); \ -+ in1 = _mm_sub_epi32(in1, pd_cj); \ -+ _mm_storeu_ps(workspace + i * 8, _mm_cvtepi32_ps(in0)); \ -+ _mm_storeu_ps(workspace + i * 8 + 4, _mm_cvtepi32_ps(in1)); -+ -+void jsimd_convsamp_float_e2k(JSAMPARRAY sample_data, JDIMENSION start_col, -+ FAST_FLOAT *workspace) -+{ -+ __m128i in0, in1, in2, in3, in4, in5, in6, in7; -+ __m128i out0, out1, out2, out3, out4, out5, out6, out7; -+ -+ /* Constants */ -+ __m128i pd_cj = _mm_set1_epi32(CENTERJSAMPLE), -+ pb_zero = _mm_setzero_si128(); -+ -+ LOAD_ROW(0); -+ LOAD_ROW(1); -+ LOAD_ROW(2); -+ LOAD_ROW(3); -+ LOAD_ROW(4); -+ LOAD_ROW(5); -+ LOAD_ROW(6); -+ LOAD_ROW(7); -+ -+ out0 = _mm_unpacklo_epi8(in0, pb_zero); -+ out1 = _mm_unpacklo_epi8(in1, pb_zero); -+ out2 = _mm_unpacklo_epi8(in2, pb_zero); -+ out3 = _mm_unpacklo_epi8(in3, pb_zero); -+ out4 = _mm_unpacklo_epi8(in4, pb_zero); -+ out5 = _mm_unpacklo_epi8(in5, pb_zero); -+ out6 = _mm_unpacklo_epi8(in6, pb_zero); -+ out7 = _mm_unpacklo_epi8(in7, pb_zero); -+ -+ STORE_ROW(0) -+ STORE_ROW(1) -+ STORE_ROW(2) -+ STORE_ROW(3) -+ STORE_ROW(4) -+ STORE_ROW(5) -+ STORE_ROW(6) -+ STORE_ROW(7) -+} -+ -+void jsimd_quantize_float_e2k(JCOEFPTR coef_block, FAST_FLOAT *divisors, -+ FAST_FLOAT *workspace) -+{ -+ int i = 0; -+ __m128 row0, row1, row2, row3, recip0, recip1, recip2, recip3; -+ __m128i out0, out1; -+#ifdef JSIMD_SAME_ROUNDING -+ __m128 pd_f16k5 = _mm_set1_ps(16384.5f); -+ __m128i pw_m16k = _mm_set1_epi16(-16384); -+#endif -+ -+ PRAGMA_E2K("ivdep") -+ for (; i < 4; i++, workspace += 16, divisors += 16, coef_block += 16) { -+ row0 = _mm_loadu_ps(workspace + 0 * 4); -+ row1 = _mm_loadu_ps(workspace + 1 * 4); -+ 
row2 = _mm_loadu_ps(workspace + 2 * 4); -+ row3 = _mm_loadu_ps(workspace + 3 * 4); -+ -+ recip0 = _mm_loadu_ps(divisors + 0 * 4); -+ recip1 = _mm_loadu_ps(divisors + 1 * 4); -+ recip2 = _mm_loadu_ps(divisors + 2 * 4); -+ recip3 = _mm_loadu_ps(divisors + 3 * 4); -+ -+ row0 = _mm_mul_ps(row0, recip0); -+ row1 = _mm_mul_ps(row1, recip1); -+ row2 = _mm_mul_ps(row2, recip2); -+ row3 = _mm_mul_ps(row3, recip3); -+ -+#ifdef JSIMD_SAME_ROUNDING -+ row0 = _mm_add_ps(row0, pd_f16k5); -+ row1 = _mm_add_ps(row1, pd_f16k5); -+ row2 = _mm_add_ps(row2, pd_f16k5); -+ row3 = _mm_add_ps(row3, pd_f16k5); -+ -+ out0 = _mm_packs_epi32(_mm_cvttps_epi32(row0), _mm_cvttps_epi32(row1)); -+ out1 = _mm_packs_epi32(_mm_cvttps_epi32(row2), _mm_cvttps_epi32(row3)); -+ -+ out0 = _mm_add_epi16(out0, pw_m16k); -+ out1 = _mm_add_epi16(out1, pw_m16k); -+#else -+ out0 = _mm_packs_epi32(_mm_cvtps_epi32(row0), _mm_cvtps_epi32(row1)); -+ out1 = _mm_packs_epi32(_mm_cvtps_epi32(row2), _mm_cvtps_epi32(row3)); -+#endif -+ VEC_ST(coef_block, out0); -+ VEC_ST(coef_block + 8, out1); -+ } -+} -diff --git a/simd/e2k/jquanti-e2k.c b/simd/e2k/jquanti-e2k.c -new file mode 100644 -index 0000000..a3e1ff1 ---- /dev/null -+++ b/simd/e2k/jquanti-e2k.c -@@ -0,0 +1,178 @@ -+/* -+ * Elbrus optimizations for libjpeg-turbo -+ * -+ * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved. -+ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd -+ * -+ * This software is provided 'as-is', without any express or implied -+ * warranty. In no event will the authors be held liable for any damages -+ * arising from the use of this software. -+ * -+ * Permission is granted to anyone to use this software for any purpose, -+ * including commercial applications, and to alter it and redistribute it -+ * freely, subject to the following restrictions: -+ * -+ * 1. The origin of this software must not be misrepresented; you must not -+ * claim that you wrote the original software. If you use this software -+ * in a product, an acknowledgment in the product documentation would be -+ * appreciated but is not required. -+ * 2. Altered source versions must be plainly marked as such, and must not be -+ * misrepresented as being the original software. -+ * 3. This notice may not be removed or altered from any source distribution. 
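The JSIMD_SAME_ROUNDING branch of jsimd_quantize_float_e2k above gets rounding-mode-independent behaviour from a bias as well: truncating v + 16384.5 and then subtracting 16384 equals floor(v + 0.5) for any |v| < 16384 (the pw_m16k add after _mm_packs_epi32 performs the same subtraction in the vector domain). A scalar model:

    #include <stdio.h>

    /* floor(v + 0.5) via truncating conversion, valid for |v| < 16384 */
    static short quantize_one(float sample, float recip)
    {
        float v = sample * recip;
        return (short)((int)(v + 16384.5f) - 16384);
    }

    int main(void)
    {
        printf("%d %d %d\n",
               quantize_one(10.0f, 0.25f),    /*  2.5 ->  3 */
               quantize_one(-10.0f, 0.25f),   /* -2.5 -> -2 */
               quantize_one(7.0f, 0.5f));     /*  3.5 ->  4 */
        return 0;
    }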
-+ */ -+ -+/* INTEGER QUANTIZATION AND SAMPLE CONVERSION */ -+ -+#include "jsimd_e2k.h" -+ -+#define LOAD_ROW(row) in##row = VEC_LD8(sample_data[row] + start_col) -+ -+void jsimd_convsamp_e2k(JSAMPARRAY sample_data, JDIMENSION start_col, -+ DCTELEM *workspace) -+{ -+ __m128i in0, in1, in2, in3, in4, in5, in6, in7; -+ __m128i out0, out1, out2, out3, out4, out5, out6, out7; -+ -+ /* Constants */ -+ __m128i pw_cj = _mm_set1_epi16(CENTERJSAMPLE), -+ pb_zero = _mm_setzero_si128(); -+ -+ LOAD_ROW(0); -+ LOAD_ROW(1); -+ LOAD_ROW(2); -+ LOAD_ROW(3); -+ LOAD_ROW(4); -+ LOAD_ROW(5); -+ LOAD_ROW(6); -+ LOAD_ROW(7); -+ -+ out0 = _mm_unpacklo_epi8(in0, pb_zero); -+ out1 = _mm_unpacklo_epi8(in1, pb_zero); -+ out2 = _mm_unpacklo_epi8(in2, pb_zero); -+ out3 = _mm_unpacklo_epi8(in3, pb_zero); -+ out4 = _mm_unpacklo_epi8(in4, pb_zero); -+ out5 = _mm_unpacklo_epi8(in5, pb_zero); -+ out6 = _mm_unpacklo_epi8(in6, pb_zero); -+ out7 = _mm_unpacklo_epi8(in7, pb_zero); -+ -+ out0 = _mm_sub_epi16(out0, pw_cj); -+ out1 = _mm_sub_epi16(out1, pw_cj); -+ out2 = _mm_sub_epi16(out2, pw_cj); -+ out3 = _mm_sub_epi16(out3, pw_cj); -+ out4 = _mm_sub_epi16(out4, pw_cj); -+ out5 = _mm_sub_epi16(out5, pw_cj); -+ out6 = _mm_sub_epi16(out6, pw_cj); -+ out7 = _mm_sub_epi16(out7, pw_cj); -+ -+ VEC_ST(workspace + 0 * 8, out0); -+ VEC_ST(workspace + 1 * 8, out1); -+ VEC_ST(workspace + 2 * 8, out2); -+ VEC_ST(workspace + 3 * 8, out3); -+ VEC_ST(workspace + 4 * 8, out4); -+ VEC_ST(workspace + 5 * 8, out5); -+ VEC_ST(workspace + 6 * 8, out6); -+ VEC_ST(workspace + 7 * 8, out7); -+} -+ -+ -+#define WORD_BIT 16 -+#define MULTIPLY(vs0, vs1, out) out = _mm_mulhi_epu16(vs0, vs1) -+ -+void jsimd_quantize_e2k(JCOEFPTR coef_block, DCTELEM *divisors, -+ DCTELEM *workspace) -+{ -+ __m128i row0, row1, row2, row3, row4, row5, row6, row7, -+ row0s, row1s, row2s, row3s, row4s, row5s, row6s, row7s, -+ corr0, corr1, corr2, corr3, corr4, corr5, corr6, corr7, -+ recip0, recip1, recip2, recip3, recip4, recip5, recip6, recip7, -+ scale0, scale1, scale2, scale3, scale4, scale5, scale6, scale7; -+ -+ row0s = VEC_LD(workspace + 0 * 8); -+ row1s = VEC_LD(workspace + 1 * 8); -+ row2s = VEC_LD(workspace + 2 * 8); -+ row3s = VEC_LD(workspace + 3 * 8); -+ row4s = VEC_LD(workspace + 4 * 8); -+ row5s = VEC_LD(workspace + 5 * 8); -+ row6s = VEC_LD(workspace + 6 * 8); -+ row7s = VEC_LD(workspace + 7 * 8); -+ row0 = _mm_abs_epi16(row0s); -+ row1 = _mm_abs_epi16(row1s); -+ row2 = _mm_abs_epi16(row2s); -+ row3 = _mm_abs_epi16(row3s); -+ row4 = _mm_abs_epi16(row4s); -+ row5 = _mm_abs_epi16(row5s); -+ row6 = _mm_abs_epi16(row6s); -+ row7 = _mm_abs_epi16(row7s); -+ -+ corr0 = VEC_LD(divisors + DCTSIZE2 + 0 * 8); -+ corr1 = VEC_LD(divisors + DCTSIZE2 + 1 * 8); -+ corr2 = VEC_LD(divisors + DCTSIZE2 + 2 * 8); -+ corr3 = VEC_LD(divisors + DCTSIZE2 + 3 * 8); -+ corr4 = VEC_LD(divisors + DCTSIZE2 + 4 * 8); -+ corr5 = VEC_LD(divisors + DCTSIZE2 + 5 * 8); -+ corr6 = VEC_LD(divisors + DCTSIZE2 + 6 * 8); -+ corr7 = VEC_LD(divisors + DCTSIZE2 + 7 * 8); -+ -+ row0 = _mm_add_epi16(row0, corr0); -+ row1 = _mm_add_epi16(row1, corr1); -+ row2 = _mm_add_epi16(row2, corr2); -+ row3 = _mm_add_epi16(row3, corr3); -+ row4 = _mm_add_epi16(row4, corr4); -+ row5 = _mm_add_epi16(row5, corr5); -+ row6 = _mm_add_epi16(row6, corr6); -+ row7 = _mm_add_epi16(row7, corr7); -+ -+ recip0 = VEC_LD(divisors + 0 * 8); -+ recip1 = VEC_LD(divisors + 1 * 8); -+ recip2 = VEC_LD(divisors + 2 * 8); -+ recip3 = VEC_LD(divisors + 3 * 8); -+ recip4 = VEC_LD(divisors + 4 * 8); -+ recip5 = VEC_LD(divisors + 5 * 8); -+ 
recip6 = VEC_LD(divisors + 6 * 8);
-+  recip7 = VEC_LD(divisors + 7 * 8);
-+
-+  MULTIPLY(row0, recip0, row0);
-+  MULTIPLY(row1, recip1, row1);
-+  MULTIPLY(row2, recip2, row2);
-+  MULTIPLY(row3, recip3, row3);
-+  MULTIPLY(row4, recip4, row4);
-+  MULTIPLY(row5, recip5, row5);
-+  MULTIPLY(row6, recip6, row6);
-+  MULTIPLY(row7, recip7, row7);
-+
-+  scale0 = VEC_LD(divisors + DCTSIZE2 * 2 + 0 * 8);
-+  scale1 = VEC_LD(divisors + DCTSIZE2 * 2 + 1 * 8);
-+  scale2 = VEC_LD(divisors + DCTSIZE2 * 2 + 2 * 8);
-+  scale3 = VEC_LD(divisors + DCTSIZE2 * 2 + 3 * 8);
-+  scale4 = VEC_LD(divisors + DCTSIZE2 * 2 + 4 * 8);
-+  scale5 = VEC_LD(divisors + DCTSIZE2 * 2 + 5 * 8);
-+  scale6 = VEC_LD(divisors + DCTSIZE2 * 2 + 6 * 8);
-+  scale7 = VEC_LD(divisors + DCTSIZE2 * 2 + 7 * 8);
-+
-+  MULTIPLY(row0, scale0, row0);
-+  MULTIPLY(row1, scale1, row1);
-+  MULTIPLY(row2, scale2, row2);
-+  MULTIPLY(row3, scale3, row3);
-+  MULTIPLY(row4, scale4, row4);
-+  MULTIPLY(row5, scale5, row5);
-+  MULTIPLY(row6, scale6, row6);
-+  MULTIPLY(row7, scale7, row7);
-+
-+  row0 = _mm_sign_epi16(row0, row0s);
-+  row1 = _mm_sign_epi16(row1, row1s);
-+  row2 = _mm_sign_epi16(row2, row2s);
-+  row3 = _mm_sign_epi16(row3, row3s);
-+  row4 = _mm_sign_epi16(row4, row4s);
-+  row5 = _mm_sign_epi16(row5, row5s);
-+  row6 = _mm_sign_epi16(row6, row6s);
-+  row7 = _mm_sign_epi16(row7, row7s);
-+
-+  VEC_ST(coef_block + 0 * 8, row0);
-+  VEC_ST(coef_block + 1 * 8, row1);
-+  VEC_ST(coef_block + 2 * 8, row2);
-+  VEC_ST(coef_block + 3 * 8, row3);
-+  VEC_ST(coef_block + 4 * 8, row4);
-+  VEC_ST(coef_block + 5 * 8, row5);
-+  VEC_ST(coef_block + 6 * 8, row6);
-+  VEC_ST(coef_block + 7 * 8, row7);
-+}
-diff --git a/simd/e2k/jsimd.c b/simd/e2k/jsimd.c
-new file mode 100644
-index 0000000..f8c0465
---- /dev/null
-+++ b/simd/e2k/jsimd.c
-@@ -0,0 +1,761 @@
-+/*
-+ * jsimd_e2k.c
-+ *
-+ * Copyright 2009 Pierre Ossman for Cendio AB
-+ * Copyright (C) 2009-2011, 2014-2016, 2018, D. R. Commander.
-+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
-+ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd
-+ *
-+ * Based on the x86 SIMD extension for IJG JPEG library,
-+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
-+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
-+ *
-+ * This file contains the interface between the "normal" portions
-+ * of the library and the SIMD implementations when running on an
-+ * Elbrus (e2k) architecture.
-+ */
-+
-+#define JPEG_INTERNALS
-+#include "../../jinclude.h"
-+#include "../../jpeglib.h"
-+#include "../../jsimd.h"
-+#include "../../jdct.h"
-+#include "../../jsimddct.h"
-+#include "../jsimd.h"
-+#include "jsimd_api_e2k.h"
-+
-+static unsigned int simd_support = ~0;
-+static unsigned int simd_huffman = 1;
-+
-+/*
-+ * Check what SIMD accelerations are supported.
-+ *
-+ * FIXME: This code is racy under a multi-threaded environment.
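The initialization pattern below (probe once, cache the result in simd_support) carries the same benign data race as the other jsimd back ends, hence the FIXME. A hedged sketch of one race-free alternative using C11 once-initialization (not part of this patch; the flag value is a stand-in):

    #include <threads.h>   /* C11; provided by glibc since 2.28 */

    #define JSIMD_SSE2 1   /* hypothetical stand-in for the real flag */

    static unsigned int simd_support;
    static once_flag simd_once = ONCE_FLAG_INIT;

    static void do_init_simd(void)
    {
        simd_support = JSIMD_SSE2;  /* getenv() overrides would go here */
    }

    static void init_simd(void)
    {
        call_once(&simd_once, do_init_simd);  /* runs exactly once */
    }

    int main(void)
    {
        init_simd();
        return (int)simd_support;
    }

In practice the race is harmless here because every thread computes and stores the same value.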
-+ */ -+LOCAL(void) -+init_simd(void) -+{ -+#ifndef NO_GETENV -+ char *env = NULL; -+#endif -+ -+ if (simd_support != ~0U) -+ return; -+ -+ simd_support = JSIMD_SSE2; -+ -+#ifndef NO_GETENV -+ /* Force different settings through environment variables */ -+ env = getenv("JSIMD_FORCENONE"); -+ if ((env != NULL) && (strcmp(env, "1") == 0)) -+ simd_support = 0; -+ env = getenv("JSIMD_NOHUFFENC"); -+ if ((env != NULL) && (strcmp(env, "1") == 0)) -+ simd_huffman = 0; -+#endif -+} -+ -+static inline int color_space_idx(J_COLOR_SPACE color_space) { -+ switch (color_space) { -+ case JCS_EXT_RGB: -+ return 1 + (EXT_RGB_PIXELSIZE != 3) * 16; -+ case JCS_EXT_RGBX: -+ case JCS_EXT_RGBA: -+ return 2 + (EXT_RGBX_PIXELSIZE != 3) * 16; -+ case JCS_EXT_BGR: -+ return 3 + (EXT_BGR_PIXELSIZE != 3) * 16; -+ case JCS_EXT_BGRX: -+ case JCS_EXT_BGRA: -+ return 4 + (EXT_BGRX_PIXELSIZE != 3) * 16; -+ case JCS_EXT_XBGR: -+ case JCS_EXT_ABGR: -+ return 5 + (EXT_XBGR_PIXELSIZE != 3) * 16; -+ case JCS_EXT_XRGB: -+ case JCS_EXT_ARGB: -+ return 6 + (EXT_XRGB_PIXELSIZE != 3) * 16; -+ default: -+ break; -+ } -+ return 0 + (RGB_PIXELSIZE != 3) * 16; -+} -+ -+GLOBAL(int) -+jsimd_can_rgb_ycc(void) -+{ -+ init_simd(); -+ -+ /* The code is optimised for these values only */ -+ if (BITS_IN_JSAMPLE != 8) -+ return 0; -+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) -+ return 0; -+ -+ if (simd_support & JSIMD_SSE2) -+ return 1; -+ -+ return 0; -+} -+ -+GLOBAL(int) -+jsimd_can_rgb_gray(void) -+{ -+ init_simd(); -+ -+ /* The code is optimised for these values only */ -+ if (BITS_IN_JSAMPLE != 8) -+ return 0; -+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) -+ return 0; -+ -+ if (simd_support & JSIMD_SSE2) -+ return 1; -+ -+ return 0; -+} -+ -+GLOBAL(int) -+jsimd_can_ycc_rgb(void) -+{ -+ init_simd(); -+ -+ /* The code is optimised for these values only */ -+ if (BITS_IN_JSAMPLE != 8) -+ return 0; -+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) -+ return 0; -+ -+ if (simd_support & JSIMD_SSE2) -+ return 1; -+ -+ return 0; -+} -+ -+GLOBAL(int) -+jsimd_can_ycc_rgb565(void) -+{ -+ return 0; -+} -+ -+GLOBAL(void) -+jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, -+ JSAMPIMAGE output_buf, JDIMENSION output_row, -+ int num_rows) -+{ -+ void (*e2kfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int, int); -+ int idx = color_space_idx(cinfo->in_color_space); -+ -+ e2kfct = idx < 16 ? jsimd_rgb3_ycc_convert_e2k : -+ jsimd_rgb4_ycc_convert_e2k; -+ idx &= 15; -+ -+ e2kfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows, idx); -+} -+ -+GLOBAL(void) -+jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, -+ JSAMPIMAGE output_buf, JDIMENSION output_row, -+ int num_rows) -+{ -+ void (*e2kfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int, int); -+ int idx = color_space_idx(cinfo->in_color_space); -+ -+ e2kfct = idx < 16 ? jsimd_rgb3_gray_convert_e2k : -+ jsimd_rgb4_gray_convert_e2k; -+ idx &= 15; -+ -+ e2kfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows, idx); -+} -+ -+GLOBAL(void) -+jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, -+ JDIMENSION input_row, JSAMPARRAY output_buf, -+ int num_rows) -+{ -+ void (*e2kfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int, int); -+ int idx = color_space_idx(cinfo->out_color_space); -+ -+ e2kfct = idx < 16 ? 
jsimd_ycc_rgb3_convert_e2k : -+ jsimd_ycc_rgb4_convert_e2k; -+ idx &= 15; -+ -+ e2kfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows, idx); -+} -+ -+GLOBAL(void) -+jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, -+ JDIMENSION input_row, JSAMPARRAY output_buf, -+ int num_rows) -+{ -+} -+ -+GLOBAL(int) -+jsimd_can_h2v2_downsample(void) -+{ -+ init_simd(); -+ -+ /* The code is optimised for these values only */ -+ if (BITS_IN_JSAMPLE != 8) -+ return 0; -+ -+ if (simd_support & JSIMD_SSE2) -+ return 1; -+ -+ return 0; -+} -+ -+GLOBAL(int) -+jsimd_can_h2v1_downsample(void) -+{ -+ init_simd(); -+ -+ /* The code is optimised for these values only */ -+ if (BITS_IN_JSAMPLE != 8) -+ return 0; -+ -+ if (simd_support & JSIMD_SSE2) -+ return 1; -+ -+ return 0; -+} -+ -+GLOBAL(void) -+jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr, -+ JSAMPARRAY input_data, JSAMPARRAY output_data) -+{ -+ jsimd_h2v2_downsample_e2k(cinfo->image_width, cinfo->max_v_samp_factor, -+ compptr->v_samp_factor, -+ compptr->width_in_blocks, input_data, -+ output_data); -+} -+ -+GLOBAL(void) -+jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr, -+ JSAMPARRAY input_data, JSAMPARRAY output_data) -+{ -+ jsimd_h2v1_downsample_e2k(cinfo->image_width, cinfo->max_v_samp_factor, -+ compptr->v_samp_factor, -+ compptr->width_in_blocks, input_data, -+ output_data); -+} -+ -+GLOBAL(int) -+jsimd_can_h2v2_upsample(void) -+{ -+ init_simd(); -+ -+ /* The code is optimised for these values only */ -+ if (BITS_IN_JSAMPLE != 8) -+ return 0; -+ -+ if (simd_support & JSIMD_SSE2) -+ return 1; -+ -+ return 0; -+} -+ -+GLOBAL(int) -+jsimd_can_h2v1_upsample(void) -+{ -+ init_simd(); -+ -+ /* The code is optimised for these values only */ -+ if (BITS_IN_JSAMPLE != 8) -+ return 0; -+ -+ if (simd_support & JSIMD_SSE2) -+ return 1; -+ -+ return 0; -+} -+ -+GLOBAL(void) -+jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, -+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) -+{ -+ jsimd_h2v2_upsample_e2k(cinfo->max_v_samp_factor, cinfo->output_width, -+ input_data, output_data_ptr); -+} -+ -+GLOBAL(void) -+jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, -+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) -+{ -+ jsimd_h2v1_upsample_e2k(cinfo->max_v_samp_factor, cinfo->output_width, -+ input_data, output_data_ptr); -+} -+ -+GLOBAL(int) -+jsimd_can_h2v2_fancy_upsample(void) -+{ -+ init_simd(); -+ -+ /* The code is optimised for these values only */ -+ if (BITS_IN_JSAMPLE != 8) -+ return 0; -+ -+ if (simd_support & JSIMD_SSE2) -+ return 1; -+ -+ return 0; -+} -+ -+GLOBAL(int) -+jsimd_can_h2v1_fancy_upsample(void) -+{ -+ init_simd(); -+ -+ /* The code is optimised for these values only */ -+ if (BITS_IN_JSAMPLE != 8) -+ return 0; -+ -+ if (simd_support & JSIMD_SSE2) -+ return 1; -+ -+ return 0; -+} -+ -+GLOBAL(void) -+jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, -+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) -+{ -+ jsimd_h2v2_fancy_upsample_e2k(cinfo->max_v_samp_factor, -+ compptr->downsampled_width, input_data, -+ output_data_ptr); -+} -+ -+GLOBAL(void) -+jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, -+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) -+{ -+ jsimd_h2v1_fancy_upsample_e2k(cinfo->max_v_samp_factor, -+ compptr->downsampled_width, input_data, -+ output_data_ptr); -+} -+ -+GLOBAL(int) -+jsimd_can_h2v2_merged_upsample(void) -+{ -+ 
init_simd(); -+ -+ /* The code is optimised for these values only */ -+ if (BITS_IN_JSAMPLE != 8) -+ return 0; -+ -+ if (simd_support & JSIMD_SSE2) -+ return 1; -+ -+ return 0; -+} -+ -+GLOBAL(int) -+jsimd_can_h2v1_merged_upsample(void) -+{ -+ init_simd(); -+ -+ /* The code is optimised for these values only */ -+ if (BITS_IN_JSAMPLE != 8) -+ return 0; -+ -+ if (simd_support & JSIMD_SSE2) -+ return 1; -+ -+ return 0; -+} -+ -+GLOBAL(void) -+jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, -+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf) -+{ -+ void (*e2kfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JDIMENSION, JSAMPARRAY, int); -+ int idx = color_space_idx(cinfo->out_color_space); -+ -+ e2kfct = idx < 16 ? jsimd_ycc_rgb3_merged_upsample_e2k : -+ jsimd_ycc_rgb4_merged_upsample_e2k; -+ idx &= 15; -+ -+ e2kfct(cinfo->output_width, input_buf, in_row_group_ctr, -+ in_row_group_ctr * 2, output_buf, idx); -+ e2kfct(cinfo->output_width, input_buf, in_row_group_ctr, -+ in_row_group_ctr * 2 + 1, output_buf + 1, idx); -+} -+ -+GLOBAL(void) -+jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, -+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf) -+{ -+ void (*e2kfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JDIMENSION, JSAMPARRAY, int); -+ int idx = color_space_idx(cinfo->out_color_space); -+ -+ e2kfct = idx < 16 ? jsimd_ycc_rgb3_merged_upsample_e2k : -+ jsimd_ycc_rgb4_merged_upsample_e2k; -+ idx &= 15; -+ -+ e2kfct(cinfo->output_width, input_buf, in_row_group_ctr, -+ in_row_group_ctr, output_buf, idx); -+} -+ -+GLOBAL(int) -+jsimd_can_convsamp(void) -+{ -+ init_simd(); -+ -+ /* The code is optimised for these values only */ -+ if (DCTSIZE != 8) -+ return 0; -+ if (BITS_IN_JSAMPLE != 8) -+ return 0; -+ if (sizeof(DCTELEM) != 2) -+ return 0; -+ -+ if (simd_support & JSIMD_SSE2) -+ return 1; -+ -+ return 0; -+} -+ -+GLOBAL(int) -+jsimd_can_convsamp_float(void) -+{ -+ init_simd(); -+ -+ /* The code is optimised for these values only */ -+ if (DCTSIZE != 8) -+ return 0; -+ if (BITS_IN_JSAMPLE != 8) -+ return 0; -+ if (sizeof(FAST_FLOAT) != 4) -+ return 0; -+ -+ if (simd_support & JSIMD_SSE2) -+ return 1; -+ -+ return 0; -+} -+ -+GLOBAL(void) -+jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col, -+ DCTELEM *workspace) -+{ -+ jsimd_convsamp_e2k(sample_data, start_col, workspace); -+} -+ -+GLOBAL(void) -+jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col, -+ FAST_FLOAT *workspace) -+{ -+ jsimd_convsamp_float_e2k(sample_data, start_col, workspace); -+} -+ -+GLOBAL(int) -+jsimd_can_fdct_islow(void) -+{ -+ init_simd(); -+ -+ /* The code is optimised for these values only */ -+ if (DCTSIZE != 8) -+ return 0; -+ if (sizeof(DCTELEM) != 2) -+ return 0; -+ -+ if (simd_support & JSIMD_SSE2) -+ return 1; -+ -+ return 0; -+} -+ -+GLOBAL(int) -+jsimd_can_fdct_ifast(void) -+{ -+ init_simd(); -+ -+ /* The code is optimised for these values only */ -+ if (DCTSIZE != 8) -+ return 0; -+ if (sizeof(DCTELEM) != 2) -+ return 0; -+ -+ if (simd_support & JSIMD_SSE2) -+ return 1; -+ -+ return 0; -+} -+ -+GLOBAL(int) -+jsimd_can_fdct_float(void) -+{ -+ init_simd(); -+ -+ /* The code is optimised for these values only */ -+ if (DCTSIZE != 8) -+ return 0; -+ if (sizeof(FAST_FLOAT) != 4) -+ return 0; -+ -+ if (simd_support & JSIMD_SSE2) -+ return 1; -+ -+ return 0; -+} -+ -+GLOBAL(void) -+jsimd_fdct_islow(DCTELEM *data) -+{ -+ jsimd_fdct_islow_e2k(data); -+} -+ -+GLOBAL(void) -+jsimd_fdct_ifast(DCTELEM *data) -+{ -+ jsimd_fdct_ifast_e2k(data); -+} -+ -+GLOBAL(void) 
-+jsimd_fdct_float(FAST_FLOAT *data) -+{ -+ jsimd_fdct_float_e2k(data); -+} -+ -+GLOBAL(int) -+jsimd_can_quantize(void) -+{ -+ init_simd(); -+ -+ /* The code is optimised for these values only */ -+ if (DCTSIZE != 8) -+ return 0; -+ if (sizeof(JCOEF) != 2) -+ return 0; -+ if (sizeof(DCTELEM) != 2) -+ return 0; -+ -+ if (simd_support & JSIMD_SSE2) -+ return 1; -+ -+ return 0; -+} -+ -+GLOBAL(int) -+jsimd_can_quantize_float(void) -+{ -+ init_simd(); -+ -+ /* The code is optimised for these values only */ -+ if (DCTSIZE != 8) -+ return 0; -+ if (sizeof(JCOEF) != 2) -+ return 0; -+ if (sizeof(FAST_FLOAT) != 4) -+ return 0; -+ -+ if (simd_support & JSIMD_SSE2) -+ return 1; -+ -+ return 0; -+} -+ -+GLOBAL(void) -+jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace) -+{ -+ jsimd_quantize_e2k(coef_block, divisors, workspace); -+} -+ -+GLOBAL(void) -+jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors, -+ FAST_FLOAT *workspace) -+{ -+ jsimd_quantize_float_e2k(coef_block, divisors, workspace); -+} -+ -+GLOBAL(int) -+jsimd_can_idct_2x2(void) -+{ -+ return 0; -+} -+ -+GLOBAL(int) -+jsimd_can_idct_4x4(void) -+{ -+ return 0; -+} -+ -+GLOBAL(void) -+jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr, -+ JCOEFPTR coef_block, JSAMPARRAY output_buf, -+ JDIMENSION output_col) -+{ -+} -+ -+GLOBAL(void) -+jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr, -+ JCOEFPTR coef_block, JSAMPARRAY output_buf, -+ JDIMENSION output_col) -+{ -+} -+ -+GLOBAL(int) -+jsimd_can_idct_islow(void) -+{ -+ init_simd(); -+ -+ /* The code is optimised for these values only */ -+ if (DCTSIZE != 8) -+ return 0; -+ if (sizeof(JCOEF) != 2) -+ return 0; -+ if (BITS_IN_JSAMPLE != 8) -+ return 0; -+ if (sizeof(ISLOW_MULT_TYPE) != 2) -+ return 0; -+ -+ if (simd_support & JSIMD_SSE2) -+ return 1; -+ -+ return 0; -+} -+ -+GLOBAL(int) -+jsimd_can_idct_ifast(void) -+{ -+ init_simd(); -+ -+ /* The code is optimised for these values only */ -+ if (DCTSIZE != 8) -+ return 0; -+ if (sizeof(JCOEF) != 2) -+ return 0; -+ if (BITS_IN_JSAMPLE != 8) -+ return 0; -+ if (sizeof(IFAST_MULT_TYPE) != 2) -+ return 0; -+ if (IFAST_SCALE_BITS != 2) -+ return 0; -+ -+ if (simd_support & JSIMD_SSE2) -+ return 1; -+ -+ return 0; -+} -+ -+GLOBAL(int) -+jsimd_can_idct_float(void) -+{ -+ init_simd(); -+ -+ /* The code is optimised for these values only */ -+ if (DCTSIZE != 8) -+ return 0; -+ if (sizeof(JCOEF) != 2) -+ return 0; -+ if (BITS_IN_JSAMPLE != 8) -+ return 0; -+ if (sizeof(FAST_FLOAT) != 4) -+ return 0; -+ if (sizeof(FLOAT_MULT_TYPE) != 4) -+ return 0; -+ -+ if (simd_support & JSIMD_SSE2) -+ return 1; -+ -+ return 0; -+} -+ -+GLOBAL(void) -+jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr, -+ JCOEFPTR coef_block, JSAMPARRAY output_buf, -+ JDIMENSION output_col) -+{ -+ jsimd_idct_islow_e2k(compptr->dct_table, coef_block, output_buf, -+ output_col); -+} -+ -+GLOBAL(void) -+jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr, -+ JCOEFPTR coef_block, JSAMPARRAY output_buf, -+ JDIMENSION output_col) -+{ -+ jsimd_idct_ifast_e2k(compptr->dct_table, coef_block, output_buf, -+ output_col); -+} -+ -+GLOBAL(void) -+jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr, -+ JCOEFPTR coef_block, JSAMPARRAY output_buf, -+ JDIMENSION output_col) -+{ -+ jsimd_idct_float_e2k(compptr->dct_table, coef_block, output_buf, -+ output_col); -+} -+ -+GLOBAL(int) -+jsimd_can_huff_encode_one_block(void) -+{ -+ init_simd(); -+ -+ if (DCTSIZE != 8) -+ return 0; -+ 
if (sizeof(JCOEF) != 2) -+ return 0; -+ -+ if ((simd_support & JSIMD_SSE2) && simd_huffman) -+ return 1; -+ -+ return 0; -+} -+ -+GLOBAL(JOCTET *) -+jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block, -+ int last_dc_val, c_derived_tbl *dctbl, -+ c_derived_tbl *actbl) -+{ -+ return jsimd_huff_encode_one_block_e2k(state, buffer, block, last_dc_val, -+ dctbl, actbl); -+} -+ -+GLOBAL(int) -+jsimd_can_encode_mcu_AC_first_prepare(void) -+{ -+ init_simd(); -+ -+ if (DCTSIZE != 8) -+ return 0; -+ if (sizeof(JCOEF) != 2) -+ return 0; -+ if ((simd_support & JSIMD_SSE2) && simd_huffman) -+ return 1; -+ -+ return 0; -+} -+ -+GLOBAL(void) -+jsimd_encode_mcu_AC_first_prepare(const JCOEF *block, -+ const int *jpeg_natural_order_start, int Sl, -+ int Al, UJCOEF *values, size_t *zerobits) -+{ -+ jsimd_encode_mcu_AC_first_prepare_e2k(block, jpeg_natural_order_start, -+ Sl, Al, (JCOEF*)values, zerobits); -+} -+ -+GLOBAL(int) -+jsimd_can_encode_mcu_AC_refine_prepare(void) -+{ -+ init_simd(); -+ -+ if (DCTSIZE != 8) -+ return 0; -+ if (sizeof(JCOEF) != 2) -+ return 0; -+ if ((simd_support & JSIMD_SSE2) && simd_huffman) -+ return 1; -+ -+ return 0; -+} -+ -+GLOBAL(int) -+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block, -+ const int *jpeg_natural_order_start, int Sl, -+ int Al, UJCOEF *absvalues, size_t *bits) -+{ -+ return jsimd_encode_mcu_AC_refine_prepare_e2k(block, -+ jpeg_natural_order_start, -+ Sl, Al, (JCOEF*)absvalues, bits); -+} -diff --git a/simd/e2k/jsimd_api_e2k.h b/simd/e2k/jsimd_api_e2k.h -new file mode 100644 -index 0000000..d857203 ---- /dev/null -+++ b/simd/e2k/jsimd_api_e2k.h -@@ -0,0 +1,94 @@ -+/* -+ * Elbrus optimizations for libjpeg-turbo -+ * -+ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd -+ * -+ * This software is provided 'as-is', without any express or implied -+ * warranty. In no event will the authors be held liable for any damages -+ * arising from the use of this software. -+ * -+ * Permission is granted to anyone to use this software for any purpose, -+ * including commercial applications, and to alter it and redistribute it -+ * freely, subject to the following restrictions: -+ * -+ * 1. The origin of this software must not be misrepresented; you must not -+ * claim that you wrote the original software. If you use this software -+ * in a product, an acknowledgment in the product documentation would be -+ * appreciated but is not required. -+ * 2. Altered source versions must be plainly marked as such, and must not be -+ * misrepresented as being the original software. -+ * 3. This notice may not be removed or altered from any source distribution. 
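color_space_idx() in jsimd.c above packs two facts into one integer: bits 0-3 select the channel-order shuffle passed down to the kernel, and bit 4 records whether pixels are 4 bytes wide, which the wrappers test with idx < 16 before masking it off with idx &= 15. A standalone model of that encoding (names and values illustrative):

    #include <stdio.h>

    static int encode_idx(int shuf, int pixel_size)
    {
        return shuf + (pixel_size != 3) * 16;  /* bit 4 = "4-byte pixels" */
    }

    int main(void)
    {
        int idx = encode_idx(2, 4);  /* e.g. an RGBX-style layout */
        printf("pixel width %d, shuffle index %d\n",
               idx < 16 ? 3 : 4, idx & 15);  /* -> width 4, shuffle 2 */
        return 0;
    }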
-+ */ -+ -+/* Function declarations */ -+ -+#define CONVERT_DECL(n) \ -+EXTERN(void) jsimd_rgb##n##_ycc_convert_e2k(JDIMENSION img_width, \ -+ JSAMPARRAY input_buf, JSAMPIMAGE output_buf, \ -+ JDIMENSION output_row, int num_rows, int shuf_idx); \ -+EXTERN(void) jsimd_rgb##n##_gray_convert_e2k(JDIMENSION img_width, \ -+ JSAMPARRAY input_buf, JSAMPIMAGE output_buf, \ -+ JDIMENSION output_row, int num_rows, int shuf_idx); \ -+EXTERN(void) jsimd_ycc_rgb##n##_convert_e2k(JDIMENSION out_width, \ -+ JSAMPIMAGE input_buf, JDIMENSION input_row, \ -+ JSAMPARRAY output_buf, int num_rows, int shuf_idx); \ -+EXTERN(void) jsimd_ycc_rgb##n##_convert_e2k(JDIMENSION out_width, \ -+ JSAMPIMAGE input_buf, JDIMENSION input_row, JSAMPARRAY output_buf, \ -+ int num_rows, int shuf_idx); \ -+EXTERN(void) jsimd_ycc_rgb##n##_merged_upsample_e2k(JDIMENSION out_width, \ -+ JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, \ -+ JDIMENSION in_row_group_ctr_y, JSAMPARRAY output_buf, int shuf_idx); \ -+ -+CONVERT_DECL(3) -+CONVERT_DECL(4) -+ -+EXTERN(void) jsimd_h2v1_downsample_e2k -+ (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor, -+ JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data); -+EXTERN(void) jsimd_h2v2_downsample_e2k -+ (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor, -+ JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data); -+ -+#define UPSAMPLE_DECL(name) \ -+EXTERN(void) jsimd_##name##_upsample_e2k \ -+ (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data, \ -+ JSAMPARRAY *output_data_ptr); -+ -+UPSAMPLE_DECL(h2v1) -+UPSAMPLE_DECL(h2v2) -+UPSAMPLE_DECL(h2v1_fancy) -+UPSAMPLE_DECL(h2v2_fancy) -+ -+EXTERN(void) jsimd_convsamp_e2k -+ (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace); -+EXTERN(void) jsimd_convsamp_float_e2k -+ (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace); -+ -+EXTERN(void) jsimd_fdct_islow_e2k(DCTELEM *data); -+EXTERN(void) jsimd_fdct_ifast_e2k(DCTELEM *data); -+EXTERN(void) jsimd_fdct_float_e2k(FAST_FLOAT *data); -+EXTERN(void) jsimd_quantize_e2k -+ (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace); -+EXTERN(void) jsimd_quantize_float_e2k -+ (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace); -+EXTERN(void) jsimd_idct_islow_e2k -+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, -+ JDIMENSION output_col); -+EXTERN(void) jsimd_idct_ifast_e2k -+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, -+ JDIMENSION output_col); -+EXTERN(void) jsimd_idct_float_e2k -+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, -+ JDIMENSION output_col); -+ -+EXTERN(JOCTET *) jsimd_huff_encode_one_block_e2k -+ (void *state, JOCTET *buffer, JCOEFPTR block, -+ int last_dc_val, c_derived_tbl *dctbl, c_derived_tbl *actbl); -+ -+EXTERN(void) jsimd_encode_mcu_AC_first_prepare_e2k -+ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al, -+ JCOEF *values, size_t *zerobits); -+ -+EXTERN(int) jsimd_encode_mcu_AC_refine_prepare_e2k -+ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al, -+ JCOEF *absvalues, size_t *bits); -diff --git a/simd/e2k/jsimd_e2k.h b/simd/e2k/jsimd_e2k.h -new file mode 100644 -index 0000000..15d6262 ---- /dev/null -+++ b/simd/e2k/jsimd_e2k.h -@@ -0,0 +1,207 @@ -+/* -+ * Elbrus optimizations for libjpeg-turbo -+ * -+ * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved. 
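The declaration blocks in jsimd_api_e2k.h above are stamped out by token-pasting macros (CONVERT_DECL, UPSAMPLE_DECL) so the 3-byte and 4-byte pixel variants cannot drift apart. A minimal model of the pattern with illustrative names:

    #include <stdio.h>

    #define CONVERT_DEF(n) \
    static void rgb##n##_to_ycc(const unsigned char *pix, int width) \
    { (void)pix; printf("rgb%d_to_ycc: width %d\n", n, width); }

    CONVERT_DEF(3)  /* defines rgb3_to_ycc */
    CONVERT_DEF(4)  /* defines rgb4_to_ycc */

    int main(void)
    {
        unsigned char pix[4] = { 0, 0, 0, 0 };
        rgb3_to_ycc(pix, 1);
        rgb4_to_ycc(pix, 1);
        return 0;
    }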
-+ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd
-+ *
-+ * This software is provided 'as-is', without any express or implied
-+ * warranty. In no event will the authors be held liable for any damages
-+ * arising from the use of this software.
-+ *
-+ * Permission is granted to anyone to use this software for any purpose,
-+ * including commercial applications, and to alter it and redistribute it
-+ * freely, subject to the following restrictions:
-+ *
-+ * 1. The origin of this software must not be misrepresented; you must not
-+ * claim that you wrote the original software. If you use this software
-+ * in a product, an acknowledgment in the product documentation would be
-+ * appreciated but is not required.
-+ * 2. Altered source versions must be plainly marked as such, and must not be
-+ * misrepresented as being the original software.
-+ * 3. This notice may not be removed or altered from any source distribution.
-+ */
-+
-+#define JPEG_INTERNALS
-+#include "../../jinclude.h"
-+#include "../../jpeglib.h"
-+#include "../../jsimd.h"
-+#include "../../jdct.h"
-+#include "../../jsimddct.h"
-+#include "../jsimd.h"
-+#include "jsimd_api_e2k.h"
-+#include <emmintrin.h>
-+#include <smmintrin.h> /* SSE4.1 */
-+
-+
-+/* Common code */
-+
-+#define __4X2(a, b) a, b, a, b, a, b, a, b
-+#define ALWAYS_INLINE __attribute__((__always_inline__)) inline
-+
-+#ifdef __e2k__
-+#define PRAGMA_E2K _Pragma
-+#define _mm_shuffle2_pi8(a, b, c) \
-+ ((__m64)__builtin_e2k_pshufb((uint64_t)(b), (uint64_t)(a), (uint64_t)(c)))
-+#define _mm_shuffle2_epi8(a, b, c) \
-+ ((__m128i)__builtin_e2k_qppermb((__v2di)(b), (__v2di)(a), (__v2di)(c)))
-+#define _mm_blendv_pi8(a, b, c) \
-+ ((__m64)__builtin_e2k_pmerge((uint64_t)(a), (uint64_t)(b), (uint64_t)(c)))
-+#else
-+#define PRAGMA_E2K(x)
-+#define _mm_shuffle2_pi8(a, b, c) \
-+ _mm_movepi64_pi64(_mm_shuffle_epi8(_mm_unpacklo_epi64( \
-+ _mm_movpi64_epi64(a), _mm_movpi64_epi64(b)), _mm_movpi64_epi64(c)))
-+#define _mm_shuffle2_epi8(a, b, c) \
-+ _mm_blendv_epi8(_mm_shuffle_epi8(a, c), _mm_shuffle_epi8(b, c), \
-+ _mm_slli_epi16(c, 3))
-+#define _mm_blendv_pi8(a, b, c) \
-+ _mm_movepi64_pi64(_mm_blendv_epi8(_mm_movpi64_epi64(a), \
-+ _mm_movpi64_epi64(b), _mm_movpi64_epi64(c)))
-+
-+#define BITREV_ROUND(c, i) a = (a & c) << i | (a >> i & c);
-+static ALWAYS_INLINE uint64_t __builtin_e2k_bitrevd(uint64_t a) {
-+ BITREV_ROUND(0x5555555555555555ll, 1)
-+ BITREV_ROUND(0x3333333333333333ll, 2)
-+ BITREV_ROUND(0x0F0F0F0F0F0F0F0Fll, 4)
-+ BITREV_ROUND(0x00FF00FF00FF00FFll, 8)
-+ BITREV_ROUND(0x0000FFFF0000FFFFll, 16)
-+ return a >> 32 | a << 32;
-+}
-+
-+static ALWAYS_INLINE uint64_t __builtin_e2k_insfd(uint64_t a, uint64_t b, uint64_t c) {
-+ int n = b & 63;
-+ a = a >> n | a << (64 - n);
-+ return c ^ ((a ^ c) & (~0ll << (b >> 6 & 63)));
-+}
-+#endif
-+
-+#if defined(__iset__) && __iset__ >= 5
-+static ALWAYS_INLINE __m128i _mm_packhi_epi32(__m128i a, __m128i b) {
-+ __m128i index = _mm_setr_epi8(
-+ 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31);
-+ return _mm_shuffle2_epi8(a, b, index);
-+}
-+
-+#define VEC_ISZERO(a) !_mm_cvtsi128_si64(_mm_packs_epi16(a, a))
-+#else
-+static ALWAYS_INLINE __m128i _mm_packhi_epi32(__m128i a, __m128i b) {
-+ union { __m128i v; __m64 d[2]; } l = { a }, h = { b }, x;
-+ __m64 index = _mm_setr_pi8(2, 3, 6, 7, 10, 11, 14, 15);
-+ x.d[0] = _mm_shuffle2_pi8(l.d[0], l.d[1], index);
-+ x.d[1] = _mm_shuffle2_pi8(h.d[0], h.d[1], index);
-+ return x.v;
-+}
-+
-+static ALWAYS_INLINE uint64_t vec_isnonzero(__m128i a) {
-+ __v2di x = (__v2di)a;
-+ return x[0] |
x[1]; -+} -+ -+#define VEC_ISZERO(a) !vec_isnonzero(a) -+#endif -+ -+#define VEC_ALIGNR8(a, b) _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(b), _mm_castsi128_pd(a), 1)) -+ -+#define TRANSPOSE_FLOAT(a, b, c, d, e, f, g, h) \ -+ tmp0 = _mm_unpacklo_ps(a, b); \ -+ tmp1 = _mm_unpackhi_ps(a, b); \ -+ tmp2 = _mm_unpacklo_ps(c, d); \ -+ tmp3 = _mm_unpackhi_ps(c, d); \ -+ e = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); \ -+ f = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); \ -+ g = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3))); \ -+ h = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3))); -+ -+#define TRANSPOSE8(a, b) \ -+ b##0 = _mm_unpacklo_epi8(a##0, a##2); \ -+ b##1 = _mm_unpackhi_epi8(a##0, a##2); \ -+ b##2 = _mm_unpacklo_epi8(a##1, a##3); \ -+ b##3 = _mm_unpackhi_epi8(a##1, a##3); -+ -+#define TRANSPOSE16(a, b) \ -+ b##0 = _mm_unpacklo_epi16(a##0, a##2); \ -+ b##1 = _mm_unpackhi_epi16(a##0, a##2); \ -+ b##2 = _mm_unpacklo_epi16(a##1, a##3); \ -+ b##3 = _mm_unpackhi_epi16(a##1, a##3); \ -+ b##4 = _mm_unpacklo_epi16(a##4, a##6); \ -+ b##5 = _mm_unpackhi_epi16(a##4, a##6); \ -+ b##6 = _mm_unpacklo_epi16(a##5, a##7); \ -+ b##7 = _mm_unpackhi_epi16(a##5, a##7); -+ -+#define TRANSPOSE(a, b) \ -+ TRANSPOSE16(a, b) TRANSPOSE16(b, a) \ -+ b##0 = _mm_unpacklo_epi64(a##0, a##4); \ -+ b##1 = _mm_unpackhi_epi64(a##0, a##4); \ -+ b##2 = _mm_unpacklo_epi64(a##1, a##5); \ -+ b##3 = _mm_unpackhi_epi64(a##1, a##5); \ -+ b##4 = _mm_unpacklo_epi64(a##2, a##6); \ -+ b##5 = _mm_unpackhi_epi64(a##2, a##6); \ -+ b##6 = _mm_unpacklo_epi64(a##3, a##7); \ -+ b##7 = _mm_unpackhi_epi64(a##3, a##7); -+ -+#define IDCT_SAVE() { \ -+ __m128i pb_cj = _mm_set1_epi8((int8_t)CENTERJSAMPLE); \ -+ \ -+ row0 = _mm_xor_si128(_mm_packs_epi16(out0, out1), pb_cj); \ -+ row1 = _mm_xor_si128(_mm_packs_epi16(out2, out3), pb_cj); \ -+ row2 = _mm_xor_si128(_mm_packs_epi16(out4, out5), pb_cj); \ -+ row3 = _mm_xor_si128(_mm_packs_epi16(out6, out7), pb_cj); \ -+ \ -+ TRANSPOSE8(row, col) TRANSPOSE8(col, row) TRANSPOSE8(row, col) \ -+ \ -+ VEC_STL(output_buf[0] + output_col, col0); \ -+ VEC_STH(output_buf[1] + output_col, col0); \ -+ VEC_STL(output_buf[2] + output_col, col1); \ -+ VEC_STH(output_buf[3] + output_col, col1); \ -+ VEC_STL(output_buf[4] + output_col, col2); \ -+ VEC_STH(output_buf[5] + output_col, col2); \ -+ VEC_STL(output_buf[6] + output_col, col3); \ -+ VEC_STH(output_buf[7] + output_col, col3); \ -+} -+ -+#define IDCT_SPLAT8(col0) { \ -+ row3 = _mm_unpacklo_epi16(col0, col0); \ -+ row7 = _mm_unpackhi_epi16(col0, col0); \ -+ row1 = _mm_unpacklo_epi16(row3, row3); \ -+ row3 = _mm_unpackhi_epi16(row3, row3); \ -+ row5 = _mm_unpacklo_epi16(row7, row7); \ -+ row7 = _mm_unpackhi_epi16(row7, row7); \ -+ row0 = _mm_unpacklo_epi64(row1, row1); \ -+ row1 = _mm_unpackhi_epi64(row1, row1); \ -+ row2 = _mm_unpacklo_epi64(row3, row3); \ -+ row3 = _mm_unpackhi_epi64(row3, row3); \ -+ row4 = _mm_unpacklo_epi64(row5, row5); \ -+ row5 = _mm_unpackhi_epi64(row5, row5); \ -+ row6 = _mm_unpacklo_epi64(row7, row7); \ -+ row7 = _mm_unpackhi_epi64(row7, row7); \ -+} -+ -+#ifndef min -+#define min(a, b) ((a) < (b) ? 
(a) : (b)) -+#endif -+ -+#define VEC_LD(a) _mm_loadu_si128((const __m128i*)(a)) -+#define VEC_ST(a, b) _mm_storeu_si128((__m128i*)(a), b) -+#define VEC_LD8(a) _mm_loadl_epi64((const __m128i*)(a)) -+#define VEC_STL(a, b) _mm_storel_epi64((__m128i*)(a), b) -+#define VEC_STH(a, b) _mm_storeh_pd((double*)(a), _mm_castsi128_pd(b)); -+#define VEC_SPLAT(v, i) _mm_shuffle_epi8(v, _mm_set1_epi16((i) * 2 | ((i) * 2 + 1) << 8)) -+ -+#if !defined(__iset__) || __iset__ < 5 -+#define NEED_ALIGN8 -+#define ALIGN8_COMMON uint64_t src_shr; __m64 src_tmp0, src_tmp1; -+#define ALIGN8_VARS(src) __m64 *src##_ptr, src##_next, src##_index; -+#define ALIGN8_START(ptr, src) \ -+ src_shr = (intptr_t)(ptr - 1) & 7; \ -+ src##_ptr = (__m64*)((intptr_t)(ptr - 1) & -8); \ -+ src##_next = src##_ptr[src_shr == 7]; \ -+ src##_index = _mm_add_pi8(_mm_set1_pi8(src_shr), \ -+ _mm_setr_pi8(1, 2, 3, 4, 5, 6, 7, 8)); -+#define ALIGN8_READ16(v0, src, i) \ -+ src_tmp1 = src##_ptr[i * 2 + 1]; \ -+ src_tmp0 = _mm_shuffle2_pi8(src##_next, src_tmp1, src##_index); \ -+ src##_next = src##_ptr[i * 2 + 2]; \ -+ src_tmp1 = _mm_shuffle2_pi8(src_tmp1, src##_next, src##_index); \ -+ v0 = _mm_setr_epi64(src_tmp0, src_tmp1); -+#endif -+ --- -2.34.1 - diff --git a/libjpeg-turbo.spec b/libjpeg-turbo.spec index 751c31c..ebcd6c1 100644 --- a/libjpeg-turbo.spec +++ b/libjpeg-turbo.spec @@ -12,12 +12,12 @@ Summary: A MMX/SSE2 accelerated library for manipulating JPEG image files Name: libjpeg-turbo Epoch: 1 -Version: 3.0.2 +Version: 3.0.3 Release: 1 License: wxWindows Library License Group: Graphics Url: http://www.libjpeg-turbo.org -Source0: https://sourceforge.net/projects/libjpeg-turbo/files/%{version}/%{name}-%{version}.tar.gz +Source0: https://github.com/libjpeg-turbo/libjpeg-turbo/archive/refs/tags/%{version}.tar.gz # These two allow automatic lossless rotation of JPEG images from a digital # camera which have orientation markings in the EXIF data. 
After rotation # the orientation markings are reset to avoid duplicate rotation when From e8720c9ebacda09eeb63d7c8b84cfa658ca4d029 Mon Sep 17 00:00:00 2001 From: Alexander Stefanov Date: Sat, 18 May 2024 13:05:28 +0000 Subject: [PATCH 29/32] fix e2k patch --- libjpeg-turbo-3.0.2-e2k.patch | 4840 +++++++++++++++++++++++++++++++++ 1 file changed, 4840 insertions(+) diff --git a/libjpeg-turbo-3.0.2-e2k.patch b/libjpeg-turbo-3.0.2-e2k.patch index 888624d..dd7670b 100644 --- a/libjpeg-turbo-3.0.2-e2k.patch +++ b/libjpeg-turbo-3.0.2-e2k.patch @@ -1,3 +1,55 @@ +From f922bcd2542146dfce9bf3a5e49892c69fe1b0a4 Mon Sep 17 00:00:00 2001 +From: Alexander Stefanov +Date: Sat, 18 May 2024 13:04:48 +0000 +Subject: [PATCH] add e2k port + +--- + CMakeLists.txt | 5 + + simd/CMakeLists.txt | 23 ++ + simd/e2k/jccolext-e2k.c | 213 +++++++++++ + simd/e2k/jccolor-e2k.c | 163 +++++++++ + simd/e2k/jchuff-e2k.c | 307 ++++++++++++++++ + simd/e2k/jcphuff-e2k.c | 145 ++++++++ + simd/e2k/jcsample-e2k.c | 203 +++++++++++ + simd/e2k/jcsample.h | 28 ++ + simd/e2k/jdcolext-e2k.c | 258 +++++++++++++ + simd/e2k/jdcolor-e2k.c | 289 +++++++++++++++ + simd/e2k/jdcoltab-e2k.c | 80 ++++ + simd/e2k/jdsample-e2k.c | 389 ++++++++++++++++++++ + simd/e2k/jfdctflt-e2k.c | 127 +++++++ + simd/e2k/jfdctfst-e2k.c | 145 ++++++++ + simd/e2k/jfdctint-e2k.c | 255 +++++++++++++ + simd/e2k/jidctflt-e2k.c | 215 +++++++++++ + simd/e2k/jidctfst-e2k.c | 187 ++++++++++ + simd/e2k/jidctint-e2k.c | 294 +++++++++++++++ + simd/e2k/jquantf-e2k.c | 121 +++++++ + simd/e2k/jquanti-e2k.c | 178 +++++++++ + simd/e2k/jsimd.c | 761 +++++++++++++++++++++++++++++++++++++++ + simd/e2k/jsimd_api_e2k.h | 94 +++++ + simd/e2k/jsimd_e2k.h | 207 +++++++++++ + 23 files changed, 4687 insertions(+) + create mode 100644 simd/e2k/jccolext-e2k.c + create mode 100644 simd/e2k/jccolor-e2k.c + create mode 100644 simd/e2k/jchuff-e2k.c + create mode 100644 simd/e2k/jcphuff-e2k.c + create mode 100644 simd/e2k/jcsample-e2k.c + create mode 100644 simd/e2k/jcsample.h + create mode 100644 simd/e2k/jdcolext-e2k.c + create mode 100644 simd/e2k/jdcolor-e2k.c + create mode 100644 simd/e2k/jdcoltab-e2k.c + create mode 100644 simd/e2k/jdsample-e2k.c + create mode 100644 simd/e2k/jfdctflt-e2k.c + create mode 100644 simd/e2k/jfdctfst-e2k.c + create mode 100644 simd/e2k/jfdctint-e2k.c + create mode 100644 simd/e2k/jidctflt-e2k.c + create mode 100644 simd/e2k/jidctfst-e2k.c + create mode 100644 simd/e2k/jidctint-e2k.c + create mode 100644 simd/e2k/jquantf-e2k.c + create mode 100644 simd/e2k/jquanti-e2k.c + create mode 100644 simd/e2k/jsimd.c + create mode 100644 simd/e2k/jsimd_api_e2k.h + create mode 100644 simd/e2k/jsimd_e2k.h + diff --git a/CMakeLists.txt b/CMakeLists.txt index ff9c9c2..fa0364c 100644 --- a/CMakeLists.txt @@ -55,3 +107,4791 @@ index 0237955..e61baea 100644 ############################################################################### # None +diff --git a/simd/e2k/jccolext-e2k.c b/simd/e2k/jccolext-e2k.c +new file mode 100644 +index 0000000..49abdb4 +--- /dev/null ++++ b/simd/e2k/jccolext-e2k.c +@@ -0,0 +1,213 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved. ++ * Copyright (C) 2014, Jay Foad. All Rights Reserved. ++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. 
++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. ++ */ ++ ++/* This file is included by jccolor-e2k.c */ ++ ++void rgbn_ycc_convert(JDIMENSION img_width, JSAMPARRAY input_buf, ++ JSAMPIMAGE output_buf, JDIMENSION output_row, ++ int num_rows, int shuf_idx) ++{ ++ JSAMPROW inptr, outptr0, outptr1, outptr2; ++ unsigned char __attribute__((aligned(16))) tmpbuf[PIXELSIZE * 16]; ++ ++ __m128i pb_zero = _mm_setzero_si128(); ++ __m128i pb_shuf0 = VEC_LD(rgb_ycc_shuf_const[shuf_idx]); ++#if PIXELSIZE == 4 ++ __m128i rgb3 = pb_zero; ++#else ++ __m128i pb_shuf4 = VEC_LD(rgb_ycc_shuf_const[shuf_idx] + 16); ++#endif ++ __m128i rgb0, rgb1 = pb_zero, rgb2 = pb_zero, ++ rgbg0, rgbg1, rgbg2, rgbg3, rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3; ++ __m128i y, yl, yh, y0, y1, y2, y3; ++ __m128i cb, cr, crl, crh, cbl, cbh; ++ __m128i cr0, cr1, cr2, cr3, cb0, cb1, cb2, cb3; ++ ++ /* Constants */ ++ __m128i pw_f0299_f0337 = _mm_setr_epi16(__4X2(F_0_299, F_0_337)), ++ pw_f0114_f0250 = _mm_setr_epi16(__4X2(F_0_114, F_0_250)), ++ pw_mf016_mf033 = _mm_setr_epi16(__4X2(-F_0_168, -F_0_331)), ++ pw_mf008_mf041 = _mm_setr_epi16(__4X2(-F_0_081, -F_0_418)), ++ pw_mf050_f000 = _mm_setr_epi16(__4X2(-F_0_500, 0)), ++ pd_onehalf = _mm_set1_epi32(ONE_HALF), ++ pd_onehalfm1_cj = _mm_set1_epi32(ONE_HALF - 1 + (CENTERJSAMPLE << SCALEBITS)); ++#ifdef NEED_ALIGN8 ++ ALIGN8_COMMON ++ ALIGN8_VARS(src) ++#endif ++ ++ if (img_width > 0) ++ while (--num_rows >= 0) { ++ int num_cols; ++ inptr = *input_buf++; ++ outptr0 = output_buf[0][output_row]; ++ outptr1 = output_buf[1][output_row]; ++ outptr2 = output_buf[2][output_row]; ++ output_row++; ++ ++ if (img_width >= 16) { ++#ifdef NEED_ALIGN8 ++ ALIGN8_START(inptr, src) ++ inptr += (img_width & -16) * PIXELSIZE; ++#endif ++ ++ PRAGMA_E2K("ivdep") ++ for (num_cols = img_width; num_cols >= 16; num_cols -= 16, ++ outptr0 += 16, outptr1 += 16, outptr2 += 16) { ++#ifdef NEED_ALIGN8 ++ ALIGN8_READ16(rgb0, src, 0) ++ ALIGN8_READ16(rgb1, src, 1) ++ ALIGN8_READ16(rgb2, src, 2) ++#if PIXELSIZE == 4 ++ ALIGN8_READ16(rgb3, src, 3) ++#endif ++ src_ptr += PIXELSIZE * 2; ++#else ++ rgb0 = VEC_LD(inptr); ++ rgb1 = VEC_LD(inptr + 16); ++ rgb2 = VEC_LD(inptr + 32); ++#if PIXELSIZE == 4 ++ rgb3 = VEC_LD(inptr + 48); ++#endif ++ inptr += PIXELSIZE * 16; ++#endif ++ RGB_SHUFFLE ++ CALC_Y(outptr0) ++ CALC_CC(outptr1, outptr2) ++ } ++ } ++ ++ num_cols = img_width & 15; ++ if (num_cols) { ++ int i; ++ memcpy(tmpbuf, inptr, num_cols * PIXELSIZE); ++ rgb0 = VEC_LD(tmpbuf); ++ rgb1 = VEC_LD(tmpbuf + 16); ++ rgb2 = VEC_LD(tmpbuf + 32); ++#if PIXELSIZE == 4 ++ rgb3 = VEC_LD(tmpbuf + 48); ++#endif ++ RGB_SHUFFLE ++ CALC_Y(tmpbuf) ++ CALC_CC(tmpbuf + 16, tmpbuf + 32) ++ ++ for (i = 0; i < num_cols; i++) { ++ outptr0[i] = tmpbuf[i]; ++ outptr1[i] = tmpbuf[i + 16]; ++ outptr2[i] = tmpbuf[i + 32]; ++ } ++ } ++ } ++} ++ ++void rgbn_gray_convert(JDIMENSION img_width, JSAMPARRAY input_buf, ++ JSAMPIMAGE 
output_buf, JDIMENSION output_row, ++ int num_rows, int shuf_idx) ++{ ++ JSAMPROW inptr, outptr; ++ uint8_t __attribute__((aligned(16))) tmpbuf[PIXELSIZE * 16]; ++ ++ __m128i pb_zero = _mm_setzero_si128(); ++ __m128i pb_shuf0 = VEC_LD(rgb_ycc_shuf_const[shuf_idx]); ++#if PIXELSIZE == 4 ++ __m128i rgb3 = pb_zero; ++#else ++ __m128i pb_shuf4 = VEC_LD(rgb_ycc_shuf_const[shuf_idx] + 16); ++#endif ++ __m128i rgb0, rgb1 = pb_zero, rgb2 = pb_zero, ++ rgbg0, rgbg1, rgbg2, rgbg3, rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3; ++ __m128i y, yl, yh, y0, y1, y2, y3; ++ ++ /* Constants */ ++ __m128i pw_f0299_f0337 = _mm_setr_epi16(__4X2(F_0_299, F_0_337)), ++ pw_f0114_f0250 = _mm_setr_epi16(__4X2(F_0_114, F_0_250)), ++ pd_onehalf = _mm_set1_epi32(ONE_HALF); ++#ifdef NEED_ALIGN8 ++ ALIGN8_COMMON ++ ALIGN8_VARS(src) ++#endif ++ ++ if (img_width > 0) ++ while (--num_rows >= 0) { ++ int num_cols; ++ inptr = *input_buf++; ++ outptr = output_buf[0][output_row]; ++ output_row++; ++ ++ if (img_width >= 16) { ++#ifdef NEED_ALIGN8 ++ ALIGN8_START(inptr, src) ++ inptr += (img_width & -16) * PIXELSIZE; ++#endif ++ ++ PRAGMA_E2K("ivdep") ++ for (num_cols = img_width; num_cols >= 16; num_cols -= 16, ++ outptr += 16) { ++#ifdef NEED_ALIGN8 ++ ALIGN8_READ16(rgb0, src, 0) ++ ALIGN8_READ16(rgb1, src, 1) ++ ALIGN8_READ16(rgb2, src, 2) ++#if PIXELSIZE == 4 ++ ALIGN8_READ16(rgb3, src, 3) ++#endif ++ src_ptr += PIXELSIZE * 2; ++#else ++ rgb0 = VEC_LD(inptr); ++ rgb1 = VEC_LD(inptr + 16); ++ rgb2 = VEC_LD(inptr + 32); ++#if PIXELSIZE == 4 ++ rgb3 = VEC_LD(inptr + 48); ++#endif ++ inptr += PIXELSIZE * 16; ++#endif ++ RGB_SHUFFLE ++ CALC_Y(outptr) ++ } ++ } ++ ++ num_cols = img_width & 15; ++ if (num_cols) { ++ int i; ++ memcpy(tmpbuf, inptr, num_cols * PIXELSIZE); ++ rgb0 = VEC_LD(tmpbuf); ++ rgb1 = VEC_LD(tmpbuf + 16); ++ rgb2 = VEC_LD(tmpbuf + 32); ++#if PIXELSIZE == 4 ++ rgb3 = VEC_LD(tmpbuf + 48); ++#endif ++ RGB_SHUFFLE ++ CALC_Y(tmpbuf) ++ ++ for (i = 0; i < num_cols; i++) { ++ outptr[i] = tmpbuf[i]; ++ } ++ } ++ } ++} ++ ++#undef RGB_SHUFFLE ++#undef PIXELSIZE ++#undef rgbn_ycc_convert ++#undef rgbn_gray_convert ++ +diff --git a/simd/e2k/jccolor-e2k.c b/simd/e2k/jccolor-e2k.c +new file mode 100644 +index 0000000..0af2626 +--- /dev/null ++++ b/simd/e2k/jccolor-e2k.c +@@ -0,0 +1,163 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2014, D. R. Commander. All Rights Reserved. ++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. ++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. 
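++ *
++ * NOTE: The F_x_xxx constants below are Q16 fixed point,
++ * FIX(x) = x * 65536 rounded to the nearest integer
++ * (e.g. FIX(0.29900) = 19595).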
++ */ ++ ++/* RGB --> YCC CONVERSION */ ++ ++#include "jsimd_e2k.h" ++ ++#define F_0_081 5329 /* FIX(0.08131) */ ++#define F_0_114 7471 /* FIX(0.11400) */ ++#define F_0_168 11059 /* FIX(0.16874) */ ++#define F_0_250 16384 /* FIX(0.25000) */ ++#define F_0_299 19595 /* FIX(0.29900) */ ++#define F_0_331 21709 /* FIX(0.33126) */ ++#define F_0_418 27439 /* FIX(0.41869) */ ++#define F_0_500 32768 /* FIX(0.50000) */ ++#define F_0_587 38470 /* FIX(0.58700) */ ++#define F_0_337 (F_0_587 - F_0_250) /* FIX(0.58700) - FIX(0.25000) */ ++#define F_0_413 (65536 - F_0_587) /* FIX(1.00000) - FIX(0.58700) */ ++ ++#define SCALEBITS 16 ++#define ONE_HALF (1 << (SCALEBITS - 1)) ++ ++#define RGBG_INDEX_(name, color, i, x) \ ++ name##_##color + i * name##_PIXELSIZE + x, \ ++ name##_GREEN + i * name##_PIXELSIZE + x ++#define RGBG_INDEX(name, x) \ ++ RGBG_INDEX_(name, RED, 0, x), RGBG_INDEX_(name, RED, 1, x), \ ++ RGBG_INDEX_(name, RED, 2, x), RGBG_INDEX_(name, RED, 3, x), \ ++ RGBG_INDEX_(name, BLUE, 0, x), RGBG_INDEX_(name, BLUE, 1, x), \ ++ RGBG_INDEX_(name, BLUE, 2, x), RGBG_INDEX_(name, BLUE, 3, x) ++ ++static const uint8_t __attribute__((aligned(16))) ++rgb_ycc_shuf_const[7][32] = { ++ { RGBG_INDEX(RGB, 0), RGBG_INDEX(RGB, 4) }, ++ { RGBG_INDEX(EXT_RGB, 0), RGBG_INDEX(EXT_RGB, 4) }, ++ { RGBG_INDEX(EXT_RGBX, 0), RGBG_INDEX(EXT_RGBX, 4) }, ++ { RGBG_INDEX(EXT_BGR, 0), RGBG_INDEX(EXT_BGR, 4) }, ++ { RGBG_INDEX(EXT_BGRX, 0), RGBG_INDEX(EXT_BGRX, 4) }, ++ { RGBG_INDEX(EXT_XBGR, 0), RGBG_INDEX(EXT_XBGR, 4) }, ++ { RGBG_INDEX(EXT_XRGB, 0), RGBG_INDEX(EXT_XRGB, 4) } ++}; ++ ++ /* rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3 ++ * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7 ++ * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb ++ * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf ++ * ++ * rg0 = R0 G0 R1 G1 R2 G2 R3 G3 ++ * bg0 = B0 G0 B1 G1 B2 G2 B3 G3 ++ * ... 
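++ *
++ * Pairing each R (or B) sample with the G sample of the same pixel lets
++ * _mm_madd_epi16 form R*c0 + G*c1 and B*c0 + G*c1 in one multiply-add;
++ * that is why the green weight is split below as
++ * 0.58700 = 0.33700 + 0.25000.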
++ */ ++ ++ /* (Original) ++ * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B ++ * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE ++ * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE ++ * ++ * (This implementation) ++ * Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G ++ * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE ++ * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE ++ */ ++ ++#define CALC_Y(outptr0) \ ++ rg0 = _mm_unpacklo_epi8(rgbg0, pb_zero); \ ++ bg0 = _mm_unpackhi_epi8(rgbg0, pb_zero); \ ++ rg1 = _mm_unpacklo_epi8(rgbg1, pb_zero); \ ++ bg1 = _mm_unpackhi_epi8(rgbg1, pb_zero); \ ++ rg2 = _mm_unpacklo_epi8(rgbg2, pb_zero); \ ++ bg2 = _mm_unpackhi_epi8(rgbg2, pb_zero); \ ++ rg3 = _mm_unpacklo_epi8(rgbg3, pb_zero); \ ++ bg3 = _mm_unpackhi_epi8(rgbg3, pb_zero); \ ++ \ ++ /* Calculate Y values */ \ ++ y0 = _mm_add_epi32(_mm_madd_epi16(rg0, pw_f0299_f0337), pd_onehalf); \ ++ y1 = _mm_add_epi32(_mm_madd_epi16(rg1, pw_f0299_f0337), pd_onehalf); \ ++ y2 = _mm_add_epi32(_mm_madd_epi16(rg2, pw_f0299_f0337), pd_onehalf); \ ++ y3 = _mm_add_epi32(_mm_madd_epi16(rg3, pw_f0299_f0337), pd_onehalf); \ ++ y0 = _mm_add_epi32(_mm_madd_epi16(bg0, pw_f0114_f0250), y0); \ ++ y1 = _mm_add_epi32(_mm_madd_epi16(bg1, pw_f0114_f0250), y1); \ ++ y2 = _mm_add_epi32(_mm_madd_epi16(bg2, pw_f0114_f0250), y2); \ ++ y3 = _mm_add_epi32(_mm_madd_epi16(bg3, pw_f0114_f0250), y3); \ ++ \ ++ yl = _mm_packhi_epi32(y0, y1); \ ++ yh = _mm_packhi_epi32(y2, y3); \ ++ y = _mm_packus_epi16(yl, yh); \ ++ VEC_ST(outptr0, y); ++ ++#define CALC_CC(outptr1, outptr2) \ ++ /* Calculate Cb values */ \ ++ cb0 = _mm_add_epi32(_mm_madd_epi16(rg0, pw_mf016_mf033), pd_onehalfm1_cj); \ ++ cb1 = _mm_add_epi32(_mm_madd_epi16(rg1, pw_mf016_mf033), pd_onehalfm1_cj); \ ++ cb2 = _mm_add_epi32(_mm_madd_epi16(rg2, pw_mf016_mf033), pd_onehalfm1_cj); \ ++ cb3 = _mm_add_epi32(_mm_madd_epi16(rg3, pw_mf016_mf033), pd_onehalfm1_cj); \ ++ cb0 = _mm_sub_epi32(cb0, _mm_madd_epi16(bg0, pw_mf050_f000)); \ ++ cb1 = _mm_sub_epi32(cb1, _mm_madd_epi16(bg1, pw_mf050_f000)); \ ++ cb2 = _mm_sub_epi32(cb2, _mm_madd_epi16(bg2, pw_mf050_f000)); \ ++ cb3 = _mm_sub_epi32(cb3, _mm_madd_epi16(bg3, pw_mf050_f000)); \ ++ \ ++ cbl = _mm_packhi_epi32(cb0, cb1); \ ++ cbh = _mm_packhi_epi32(cb2, cb3); \ ++ cb = _mm_packus_epi16(cbl, cbh); \ ++ VEC_ST(outptr1, cb); \ ++ \ ++ /* Calculate Cr values */ \ ++ cr0 = _mm_add_epi32(_mm_madd_epi16(bg0, pw_mf008_mf041), pd_onehalfm1_cj); \ ++ cr1 = _mm_add_epi32(_mm_madd_epi16(bg1, pw_mf008_mf041), pd_onehalfm1_cj); \ ++ cr2 = _mm_add_epi32(_mm_madd_epi16(bg2, pw_mf008_mf041), pd_onehalfm1_cj); \ ++ cr3 = _mm_add_epi32(_mm_madd_epi16(bg3, pw_mf008_mf041), pd_onehalfm1_cj); \ ++ cr0 = _mm_sub_epi32(cr0, _mm_madd_epi16(rg0, pw_mf050_f000)); \ ++ cr1 = _mm_sub_epi32(cr1, _mm_madd_epi16(rg1, pw_mf050_f000)); \ ++ cr2 = _mm_sub_epi32(cr2, _mm_madd_epi16(rg2, pw_mf050_f000)); \ ++ cr3 = _mm_sub_epi32(cr3, _mm_madd_epi16(rg3, pw_mf050_f000)); \ ++ \ ++ crl = _mm_packhi_epi32(cr0, cr1); \ ++ crh = _mm_packhi_epi32(cr2, cr3); \ ++ cr = _mm_packus_epi16(crl, crh); \ ++ VEC_ST(outptr2, cr); ++ ++ ++#define PIXELSIZE 3 ++#define RGB_SHUFFLE \ ++ rgbg0 = _mm_shuffle_epi8(rgb0, pb_shuf0); \ ++ rgbg1 = _mm_shuffle_epi8(VEC_ALIGNR8(rgb1, rgb0), pb_shuf4); \ ++ rgbg2 = _mm_shuffle_epi8(VEC_ALIGNR8(rgb2, rgb1), pb_shuf0); \ ++ rgbg3 = _mm_shuffle_epi8(rgb2, pb_shuf4); ++ ++#define rgbn_ycc_convert jsimd_rgb3_ycc_convert_e2k ++#define rgbn_gray_convert jsimd_rgb3_gray_convert_e2k ++#include 
"jccolext-e2k.c" ++ ++ ++#define PIXELSIZE 4 ++#define RGB_SHUFFLE \ ++ rgbg0 = _mm_shuffle_epi8(rgb0, pb_shuf0); \ ++ rgbg1 = _mm_shuffle_epi8(rgb1, pb_shuf0); \ ++ rgbg2 = _mm_shuffle_epi8(rgb2, pb_shuf0); \ ++ rgbg3 = _mm_shuffle_epi8(rgb3, pb_shuf0); ++ ++#define rgbn_ycc_convert jsimd_rgb4_ycc_convert_e2k ++#define rgbn_gray_convert jsimd_rgb4_gray_convert_e2k ++#include "jccolext-e2k.c" ++ +diff --git a/simd/e2k/jchuff-e2k.c b/simd/e2k/jchuff-e2k.c +new file mode 100644 +index 0000000..ec4329e +--- /dev/null ++++ b/simd/e2k/jchuff-e2k.c +@@ -0,0 +1,307 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2022, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. ++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. ++ * ++ * NOTE: All referenced figures are from ++ * Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994. ++ */ ++ ++/* Encode a single block's worth of coefficients */ ++ ++#include "jsimd_e2k.h" ++ ++#if __SIZEOF_SIZE_T__ != 8 ++#error ++#endif ++ ++typedef unsigned long long bit_buf_type; ++#define BIT_BUF_SIZE 64 ++ ++typedef struct { ++ bit_buf_type put_buffer; /* current bit accumulation buffer */ ++ int free_bits; /* # of bits available in it */ ++ int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */ ++} savable_state; ++ ++typedef struct { ++ JOCTET *next_output_byte; /* => next byte to write in buffer */ ++ size_t free_in_buffer; /* # of byte spaces remaining in buffer */ ++ savable_state cur; /* Current bit buffer & DC state */ ++ j_compress_ptr cinfo; /* dump_buffer needs access to this */ ++ int simd; ++} working_state; ++ ++#define EMIT_BYTE(b) { \ ++ buffer[0] = (JOCTET)(b); \ ++ buffer[1] = 0; \ ++ buffer -= -2 + ((JOCTET)(b) < 0xFF); \ ++} ++ ++#define FLUSH() { \ ++ if (put_buffer & 0x8080808080808080 & ~(put_buffer + 0x0101010101010101)) { \ ++ EMIT_BYTE(put_buffer >> 56) \ ++ EMIT_BYTE(put_buffer >> 48) \ ++ EMIT_BYTE(put_buffer >> 40) \ ++ EMIT_BYTE(put_buffer >> 32) \ ++ EMIT_BYTE(put_buffer >> 24) \ ++ EMIT_BYTE(put_buffer >> 16) \ ++ EMIT_BYTE(put_buffer >> 8) \ ++ EMIT_BYTE(put_buffer ) \ ++ } else { \ ++ *(uint64_t*)buffer = __builtin_bswap64(put_buffer); \ ++ buffer += 8; \ ++ } \ ++} ++ ++#define PUT_AND_FLUSH(code, size) { \ ++ put_buffer = (put_buffer << (size + free_bits)) | (code >> -free_bits); \ ++ FLUSH() \ ++ free_bits += BIT_BUF_SIZE; \ ++ put_buffer = code; \ ++} ++ ++#define PUT_BITS(code, size) { \ ++ free_bits -= size; \ ++ if (free_bits < 0) \ ++ PUT_AND_FLUSH(code, size) \ ++ else \ ++ put_buffer = (put_buffer << size) | code; \ ++} ++ ++#define PUT_CODE(code, size) { \ ++ /* temp &= (((JLONG)1) << nbits) - 1; */ \ ++ /* temp |= code << nbits; */ \ ++ temp = __builtin_e2k_insfd(code, 
__builtin_e2k_insfd(nbits, 6 * 63 + 64, -nbits), temp); \ ++ nbits += size; \ ++ PUT_BITS(temp, nbits) \ ++} ++ ++#define KLOOP_PREPARE(mask, i) \ ++ t0 = _mm_cmpeq_epi8(_mm_packs_epi16(v0, v1), zero); \ ++ t1 = _mm_cmpeq_epi8(_mm_packs_epi16(v2, v3), zero); \ ++ mask = (uint32_t)(_mm_movemask_epi8(t0) | _mm_movemask_epi8(t1) << 16); \ ++ t0 = _mm_add_epi16(_mm_srai_epi16(v0, 15), v0); \ ++ t1 = _mm_add_epi16(_mm_srai_epi16(v1, 15), v1); \ ++ t2 = _mm_add_epi16(_mm_srai_epi16(v2, 15), v2); \ ++ t3 = _mm_add_epi16(_mm_srai_epi16(v3, 15), v3); \ ++ v0 = _mm_abs_epi16(v0); \ ++ v1 = _mm_abs_epi16(v1); \ ++ v2 = _mm_abs_epi16(v2); \ ++ v3 = _mm_abs_epi16(v3); \ ++ VEC_ST(block_nbits + i, v0); \ ++ VEC_ST(block_nbits + i + 8, v1); \ ++ VEC_ST(block_nbits + i + 16, v2); \ ++ VEC_ST(block_nbits + i + 24, v3); \ ++ VEC_ST(block_diff + i, t0); \ ++ VEC_ST(block_diff + i + 8, t1); \ ++ VEC_ST(block_diff + i + 16, t2); \ ++ VEC_ST(block_diff + i + 24, t3); ++ ++#define SHUF16X4(a, b, c, d) _mm_setr_pi8( \ ++ a * 2, a * 2 + 1, b * 2, b * 2 + 1, c * 2, c * 2 + 1, d * 2, d * 2 + 1) ++#define VEC_COMBINE(h0, h1) _mm_unpacklo_epi64( \ ++ _mm_movpi64_epi64(h0), _mm_movpi64_epi64(h1)) ++#define INSFI_M64(a, b, c, d) _mm_cvtsi64_m64(__builtin_e2k_insfd( \ ++ _mm_cvtm64_si64(a), (b & 63) | (d & 63) << 6, _mm_cvtm64_si64(c))) ++ ++GLOBAL(JOCTET *) ++jsimd_huff_encode_one_block_e2k(void *state, JOCTET *buffer, ++ JCOEFPTR block, int last_dc_val, ++ c_derived_tbl *dctbl, c_derived_tbl *actbl) { ++ uint64_t temp, nbits; ++ uint64_t i, r, code, size; ++ uint64_t code_0xf0 = actbl->ehufco[0xf0]; ++ uint64_t size_0xf0 = actbl->ehufsi[0xf0]; ++ ++ working_state *state_ptr = (working_state*)state; ++ bit_buf_type put_buffer = state_ptr->cur.put_buffer; ++ int64_t free_bits = state_ptr->cur.free_bits; ++ ++ __m128i zero = _mm_setzero_si128(); ++ __m128i v0, v1, v2, v3, t0, t1, t2, t3; ++ int64_t mask, mask1; ++ uint16_t __attribute__((aligned(16))) block_nbits[DCTSIZE2]; ++ int16_t __attribute__((aligned(16))) block_diff[DCTSIZE2]; ++ ++#if 1 /* faster this way */ ++ { ++ __m64 d0l, d0h, d1l, d1h, d2l, d2h, d3l, d3h; ++ __m64 d4l, d4h, d5l, d5h, d6l, d6h, d7l, d7h; ++ __m64 h0, h1, h2, h3, r0, r1, c1256 = SHUF16X4(1, 2, 5, 6); ++ ++ d0l = *(__m64*)(block + 8 * 0); d0h = *(__m64*)(block + 8 * 0 + 4); // 0 4 ++ d1l = *(__m64*)(block + 8 * 1); d1h = *(__m64*)(block + 8 * 1 + 4); // 8 12 ++ d2l = *(__m64*)(block + 8 * 2); d2h = *(__m64*)(block + 8 * 2 + 4); // 16 20 ++ d3l = *(__m64*)(block + 8 * 3); d3h = *(__m64*)(block + 8 * 3 + 4); // 24 28 ++ d4l = *(__m64*)(block + 8 * 4); d4h = *(__m64*)(block + 8 * 4 + 4); // 32 36 ++ d5l = *(__m64*)(block + 8 * 5); d5h = *(__m64*)(block + 8 * 5 + 4); // 40 44 ++ d6l = *(__m64*)(block + 8 * 6); d6h = *(__m64*)(block + 8 * 6 + 4); // 48 52 ++ d7l = *(__m64*)(block + 8 * 7); d7h = *(__m64*)(block + 8 * 7 + 4); // 56 60 ++ ++ // d0l[0] d0l[1] d1l[0] d2l[0] ++ // d1l[1] d0l[2] d0l[3] d1l[2] ++ h0 = _mm_unpacklo_pi16(d1l, d2l); ++ r0 = _mm_unpacklo_pi32(d0l, h0); ++ r1 = _mm_shuffle2_pi8(d1l, d0l, SHUF16X4(1, 6, 7, 2)); ++ r0 = _mm_sub_pi16(r0, _mm_cvtsi64_m64((uint16_t)last_dc_val)); ++ v0 = VEC_COMBINE(r0, r1); ++ ++ // d2l[1] d3l[0] d4l[0] d3l[1] ++ // d2l[2] d1l[3] d0h[0] d0h[1] ++ h0 = _mm_srli_si64(_mm_unpacklo_pi32(d2l, d4l), 16); ++ h2 = INSFI_M64(d1l, 0, d2l, 48); ++ r0 = _mm_unpacklo_pi16(h0, d3l); ++ r1 = _mm_alignr_pi8(d0h, h2, 4); ++ v1 = VEC_COMBINE(r0, r1); ++ ++ // d1h[0] d2l[3] d3l[2] d4l[1] ++ // d5l[0] d6l[0] d5l[1] d4l[2] ++ h0 = INSFI_M64(d2l, 32, d1h, 16); ++ h1 = 
INSFI_M64(d4l, -32, d3l, 48); ++ h2 = INSFI_M64(d4l, 16, d6l, 16); ++ r0 = INSFI_M64(h1, 0, h0, 32); ++ r1 = _mm_unpacklo_pi16(d5l, h2); ++ v2 = VEC_COMBINE(r0, r1); ++ ++ // d3l[3] d2h[0] d1h[1] d0h[2] ++ // d0h[3] d1h[2] d2h[1] d3h[0] ++ h0 = _mm_alignr_pi8(d2h, d3l, 6); ++ h1 = INSFI_M64(d0h, 0, d1h, 32); ++ h2 = _mm_unpackhi_pi32(d0h, d1h); ++ h3 = _mm_unpacklo_pi32(d2h, d3h); ++ r0 = INSFI_M64(h1, -16, h0, 32); ++ r1 = _mm_shuffle2_pi8(h2, h3, c1256); ++ v3 = VEC_COMBINE(r0, r1); ++ ++ KLOOP_PREPARE(mask, 0) ++ ++ // d4l[3] d5l[2] d6l[1] d7l[0] ++ // d7l[1] d6l[2] d5l[3] d4h[0] ++ h0 = _mm_unpackhi_pi32(d4l, d5l); ++ h1 = _mm_unpacklo_pi32(d6l, d7l); ++ h2 = INSFI_M64(d6l, 0, d7l, 32); ++ h2 = INSFI_M64(d5l, 0, h2, 48); ++ r0 = _mm_shuffle2_pi8(h0, h1, c1256); ++ r1 = _mm_alignr_pi8(d4h, h2, 2); ++ v0 = VEC_COMBINE(r0, r1); ++ ++ // d3h[1] d2h[2] d1h[3] d2h[3] ++ // d3h[2] d4h[1] d5h[0] d6l[3] ++ h0 = _mm_slli_si64(INSFI_M64(d1h, 16, d3h, 32), 16); ++ h2 = INSFI_M64(d4h, -32, d3h, 48); ++ h3 = INSFI_M64(d6l, 32, d5h, 16); ++ r0 = _mm_unpackhi_pi16(h0, d2h); ++ r1 = _mm_alignr_pi8(h3, h2, 4); ++ v1 = VEC_COMBINE(r0, r1); ++ ++ // d7l[2] d7l[3] d6h[0] d5h[1] ++ // d4h[2] d3h[3] d4h[3] d5h[2] ++ h0 = INSFI_M64(d5h, 0, d6h, 16); ++ h2 = _mm_slli_si64(_mm_unpackhi_pi32(d3h, d5h), 16); ++ r0 = _mm_alignr_pi8(h0, d7l, 4); ++ r1 = _mm_unpackhi_pi16(d4h, h2); ++ v2 = VEC_COMBINE(r0, r1); ++ ++ // d6h[1] d7h[0] d7h[1] d6h[2] ++ // d5h[3] d6h[3] d7h[2] d7h[3] ++ h0 = INSFI_M64(d6h, -16, d7h, 32); ++ h2 = _mm_unpackhi_pi16(d5h, d6h); ++ r0 = _mm_shuffle_pi16(h0, 0xd2); ++ r1 = _mm_unpackhi_pi32(h2, d7h); ++ v3 = VEC_COMBINE(r0, r1); ++ } ++#else ++ v0 = _mm_setr_epi16( ++ block[0] - last_dc_val, block[1], block[8], block[16], ++ block[9], block[2], block[3], block[10]); ++ v1 = _mm_setr_epi16( ++ block[17], block[24], block[32], block[25], ++ block[18], block[11], block[4], block[5]); ++ v2 = _mm_setr_epi16( ++ block[12], block[19], block[26], block[33], ++ block[40], block[48], block[41], block[34]); ++ v3 = _mm_setr_epi16( ++ block[27], block[20], block[13], block[6], ++ block[7], block[14], block[21], block[28]); ++ ++ KLOOP_PREPARE(mask, 0) ++ ++ v0 = _mm_setr_epi16( ++ block[35], block[42], block[49], block[56], ++ block[57], block[50], block[43], block[36]); ++ v1 = _mm_setr_epi16( ++ block[29], block[22], block[15], block[23], ++ block[30], block[37], block[44], block[51]); ++ v2 = _mm_setr_epi16( ++ block[58], block[59], block[52], block[45], ++ block[38], block[31], block[39], block[46]); ++ v3 = _mm_setr_epi16( ++ block[53], block[60], block[61], block[54], ++ block[47], block[55], block[62], block[63]); ++#endif ++ ++ KLOOP_PREPARE(mask1, 32) ++ mask |= mask1 << 32; ++ mask = ~mask; ++ ++ /* Encode the DC coefficient difference per section F.1.2.1 */ ++ ++ nbits = block_nbits[0]; ++ temp = block_diff[0]; ++ nbits = nbits ? 
32 - __builtin_clz(nbits) : 0; ++ ++ /* Emit the Huffman-coded symbol for the number of bits */ ++ code = dctbl->ehufco[nbits]; ++ size = dctbl->ehufsi[nbits]; ++ PUT_CODE(code, size) ++ ++ /* Encode the AC coefficients per section F.1.2.2 */ ++ ++ /* e2k doesn't have a tzcnt instruction */ ++ mask = __builtin_e2k_bitrevd(mask) << 1; ++ ++ for (i = 1; mask; i++, mask <<= 1) { ++ r = __builtin_clzll(mask); ++ mask <<= r; ++ i += r; ++ /* if run length > 15, must emit special run-length-16 codes (0xF0) */ ++ while (r > 15) { ++ PUT_BITS(code_0xf0, size_0xf0) ++ r -= 16; ++ } ++ nbits = block_nbits[i]; ++ temp = block_diff[i]; ++ nbits = 32 - __builtin_clz(nbits); ++ /* Emit Huffman symbol for run length / number of bits */ ++ /* r = r << 4 | nbits; */ ++ r = __builtin_e2k_insfd(r, 4 * 63 + 64, nbits); ++ code = actbl->ehufco[r]; ++ size = actbl->ehufsi[r]; ++ PUT_CODE(code, size) ++ } ++ ++ if (i != 64) { ++ PUT_BITS(actbl->ehufco[0], actbl->ehufsi[0]) ++ } ++ ++ state_ptr->cur.put_buffer = put_buffer; ++ state_ptr->cur.free_bits = free_bits; ++ return buffer; ++} +diff --git a/simd/e2k/jcphuff-e2k.c b/simd/e2k/jcphuff-e2k.c +new file mode 100644 +index 0000000..f69afeb +--- /dev/null ++++ b/simd/e2k/jcphuff-e2k.c +@@ -0,0 +1,145 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2022, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. ++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. ++ */ ++ ++#include "jsimd_e2k.h" ++ ++#define X(i) coefs[i] = block[jpeg_natural_order_start[i]]; ++#define Y(i) coefs[i] = i < rem ? 
block[jpeg_natural_order_start[i]] : 0; ++ ++#define LOOP \ ++ for (i = 0; i < Sl >> 4; i++) { \ ++ X(0) X(1) X(2) X(3) X(4) X(5) X(6) X(7) \ ++ X(8) X(9) X(10) X(11) X(12) X(13) X(14) X(15) \ ++ BLOCK16 \ ++ jpeg_natural_order_start += 16; \ ++ } \ ++ rem = Sl & 15; \ ++ if (Sl & 8) { \ ++ X(0) X(1) X(2) X(3) X(4) X(5) X(6) X(7) \ ++ Y(8) Y(9) Y(10) Y(11) Y(12) Y(13) Y(14) \ ++ coefs[15] = 0; \ ++ BLOCK16 \ ++ } else if (rem > 0) { \ ++ Y(0) Y(1) Y(2) Y(3) Y(4) Y(5) Y(6) Y(7) \ ++ BLOCK8 \ ++ } ++ ++void jsimd_encode_mcu_AC_first_prepare_e2k ++ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al, ++ JCOEF *values, size_t *zerobits) ++{ ++ JCOEF *diff = values + DCTSIZE2; ++ int16_t __attribute__((aligned(16))) coefs[16]; ++ __m128i v0, v1, v2, v3; ++ __m128i c0 = _mm_setzero_si128(), shr = _mm_cvtsi32_si128(Al); ++ int i, rem; ++ ++#define BLOCK16 \ ++ v0 = _mm_load_si128((__m128i*)coefs); \ ++ v1 = _mm_load_si128((__m128i*)coefs + 1); \ ++ v2 = _mm_srai_epi16(v0, 15); \ ++ v3 = _mm_srai_epi16(v1, 15); \ ++ v0 = _mm_sra_epi16(_mm_abs_epi16(v0), shr); \ ++ v1 = _mm_sra_epi16(_mm_abs_epi16(v1), shr); \ ++ v2 = _mm_xor_si128(v0, v2); \ ++ v3 = _mm_xor_si128(v1, v3); \ ++ _mm_store_si128((__m128i*)values, v0); \ ++ _mm_store_si128((__m128i*)values + 1, v1); \ ++ _mm_store_si128((__m128i*)diff, v2); \ ++ _mm_store_si128((__m128i*)diff + 1, v3); \ ++ v2 = _mm_cmpeq_epi8(_mm_packus_epi16(v0, v1), c0); \ ++ ((uint16_t*)zerobits)[i] = ~_mm_movemask_epi8(v2); \ ++ values += 16; diff += 16; ++ ++#define BLOCK8 \ ++ v0 = _mm_load_si128((__m128i*)coefs); \ ++ v2 = _mm_srai_epi16(v0, 15); \ ++ v0 = _mm_sra_epi16(_mm_abs_epi16(v0), shr); \ ++ v2 = _mm_xor_si128(v0, v2); \ ++ _mm_store_si128((__m128i*)values, v0); \ ++ _mm_store_si128((__m128i*)diff, v2); \ ++ v2 = _mm_cmpeq_epi8(_mm_packus_epi16(v0, c0), c0); \ ++ ((uint16_t*)zerobits)[i] = ~_mm_movemask_epi8(v2); \ ++ values += 8; diff += 8; ++ ++ ((uint64_t*)zerobits)[0] = 0; ++ LOOP ++#undef BLOCK16 ++#undef BLOCK8 ++ ++ for (i = (64 - Sl) >> 3; i; i--) { ++ _mm_store_si128((__m128i*)values, c0); ++ _mm_store_si128((__m128i*)diff, c0); ++ values += 8; diff += 8; ++ } ++} ++ ++int jsimd_encode_mcu_AC_refine_prepare_e2k ++ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al, ++ JCOEF *absvalues, size_t *bits) ++{ ++ union { uint64_t q; uint16_t w[4]; } mask1 = { 0 }; ++ int16_t __attribute__((aligned(16))) coefs[16]; ++ __m128i v0, v1, v2, c1 = _mm_set1_epi8(1); ++ __m128i c0 = _mm_setzero_si128(), shr = _mm_cvtsi32_si128(Al); ++ int i, rem; ++ ++#define BLOCK16 \ ++ v0 = _mm_load_si128((__m128i*)coefs); \ ++ v1 = _mm_load_si128((__m128i*)coefs + 1); \ ++ v2 = _mm_packs_epi16(v0, v1); \ ++ ((uint16_t*)bits)[4 + i] = ~_mm_movemask_epi8(v2); \ ++ v0 = _mm_sra_epi16(_mm_abs_epi16(v0), shr); \ ++ v1 = _mm_sra_epi16(_mm_abs_epi16(v1), shr); \ ++ v2 = _mm_cmpeq_epi8(_mm_packus_epi16(v0, v1), c0); \ ++ ((uint16_t*)bits)[i] = ~_mm_movemask_epi8(v2); \ ++ _mm_store_si128((__m128i*)absvalues, v0); \ ++ _mm_store_si128((__m128i*)absvalues + 1, v1); \ ++ v2 = _mm_cmpeq_epi8(_mm_packus_epi16(v0, v1), c1); \ ++ mask1.w[i] = _mm_movemask_epi8(v2); \ ++ absvalues += 16; ++ ++#define BLOCK8 \ ++ v0 = _mm_load_si128((__m128i*)coefs); \ ++ v2 = _mm_packs_epi16(v0, c0); \ ++ ((uint16_t*)bits)[4 + i] = ~_mm_movemask_epi8(v2); \ ++ v0 = _mm_sra_epi16(_mm_abs_epi16(v0), shr); \ ++ v2 = _mm_cmpeq_epi8(_mm_packus_epi16(v0, c0), c0); \ ++ ((uint16_t*)bits)[i] = ~_mm_movemask_epi8(v2); \ ++ _mm_store_si128((__m128i*)absvalues, v0); \ ++ 
v2 = _mm_cmpeq_epi8(_mm_packus_epi16(v0, c0), c1); \ ++ mask1.w[i] = _mm_movemask_epi8(v2); \ ++ absvalues += 8; ++ ++ ((uint64_t*)bits)[0] = 0; /* zero */ ++ ((uint64_t*)bits)[1] = 0; /* sign */ ++ LOOP ++#undef BLOCK16 ++#undef BLOCK8 ++ ++ for (i = (64 - Sl) >> 3; i; i--) { ++ _mm_store_si128((__m128i*)absvalues, c0); ++ absvalues += 8; ++ } ++ ++ return 63 - __builtin_clzll(mask1.q | 1); ++} +diff --git a/simd/e2k/jcsample-e2k.c b/simd/e2k/jcsample-e2k.c +new file mode 100644 +index 0000000..cac8897 +--- /dev/null ++++ b/simd/e2k/jcsample-e2k.c +@@ -0,0 +1,203 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2015, D. R. Commander. All Rights Reserved. ++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. ++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. ++ */ ++ ++/* CHROMA DOWNSAMPLING */ ++ ++#include "jsimd_e2k.h" ++#include "jcsample.h" ++ ++void jsimd_h2v1_downsample_e2k(JDIMENSION image_width, ++ int max_v_samp_factor, ++ JDIMENSION v_samp_factor, ++ JDIMENSION width_in_blocks, ++ JSAMPARRAY input_data, ++ JSAMPARRAY output_data) ++{ ++ int outcol; ++ JDIMENSION output_cols = width_in_blocks * DCTSIZE, outrow; ++ JSAMPROW inptr, outptr; ++ ++ __m128i this0, next0, out; ++ __m128i this0e, this0o, next0e, next0o, outl, outh; ++ ++ /* Constants */ ++ __m128i pw_bias = _mm_set1_epi32(1 << 16), ++ even_mask = _mm_set1_epi16(255); ++#ifdef NEED_ALIGN8 ++ ALIGN8_COMMON ++ ALIGN8_VARS(src) ++#endif ++ ++ expand_right_edge(input_data, max_v_samp_factor, ++ image_width, output_cols * 2); ++ ++ if (output_cols > 0) ++ for (outrow = 0; outrow < v_samp_factor; outrow++) { ++ outptr = output_data[outrow]; ++ inptr = input_data[outrow]; ++ ++#ifdef NEED_ALIGN8 ++ ALIGN8_START(inptr, src) ++#endif ++ PRAGMA_E2K("ivdep") ++ for (outcol = output_cols; outcol > 8; ++ outcol -= 16, outptr += 16) { ++ ++#ifdef NEED_ALIGN8 ++ ALIGN8_READ16(this0, src, 0) ++ ALIGN8_READ16(next0, src, 1) ++ src_ptr += 4; ++#else ++ this0 = VEC_LD(inptr); ++ next0 = VEC_LD(inptr + 16); ++ inptr += 32; ++#endif ++ this0e = _mm_and_si128(this0, even_mask); ++ this0o = _mm_srli_epi16(this0, 8); ++ outl = _mm_add_epi16(this0e, this0o); ++ outl = _mm_srli_epi16(_mm_add_epi16(outl, pw_bias), 1); ++ next0e = _mm_and_si128(next0, even_mask); ++ next0o = _mm_srli_epi16(next0, 8); ++ outh = _mm_add_epi16(next0e, next0o); ++ outh = _mm_srli_epi16(_mm_add_epi16(outh, pw_bias), 1); ++ ++ out = _mm_packus_epi16(outl, outh); ++ VEC_ST(outptr, out); ++ } ++ if (outcol > 0) { ++#ifdef NEED_ALIGN8 ++ ALIGN8_READ16(this0, src, 0) ++#else ++ this0 = VEC_LD(inptr); ++#endif ++ this0e = _mm_and_si128(this0, even_mask); ++ this0o = _mm_srli_epi16(this0, 8); ++ outl = _mm_add_epi16(this0e, 
this0o); ++ outl = _mm_srli_epi16(_mm_add_epi16(outl, pw_bias), 1); ++ ++ out = _mm_packus_epi16(outl, outl); ++ VEC_STL(outptr, out); ++ } ++ } ++} ++ ++ ++void jsimd_h2v2_downsample_e2k(JDIMENSION image_width, int max_v_samp_factor, ++ JDIMENSION v_samp_factor, ++ JDIMENSION width_in_blocks, ++ JSAMPARRAY input_data, JSAMPARRAY output_data) ++{ ++ int outcol; ++ JDIMENSION output_cols = width_in_blocks * DCTSIZE, outrow; ++ JSAMPROW inptr0, inptr1, outptr; ++ ++ __m128i this0, next0, this1, next1, out; ++ __m128i this0e, this0o, next0e, next0o, this1e, this1o, ++ next1e, next1o, out0l, out0h, out1l, out1h, outl, outh; ++ ++ /* Constants */ ++ __m128i pw_bias = _mm_set1_epi32(1 | 2 << 16), ++ even_mask = _mm_set1_epi16(255); ++#ifdef NEED_ALIGN8 ++ ALIGN8_COMMON ++ ALIGN8_VARS(src0) ++ ALIGN8_VARS(src1) ++#endif ++ ++ expand_right_edge(input_data, max_v_samp_factor, ++ image_width, output_cols * 2); ++ ++ if (output_cols > 0) ++ for (outrow = 0; outrow < v_samp_factor; outrow++) { ++ inptr0 = input_data[outrow * 2]; ++ inptr1 = input_data[outrow * 2 + 1]; ++ outptr = output_data[outrow]; ++ ++#ifdef NEED_ALIGN8 ++ ALIGN8_START(inptr0, src0) ++ ALIGN8_START(inptr1, src1) ++#endif ++ PRAGMA_E2K("ivdep") ++ for (outcol = output_cols; outcol > 8; ++ outcol -= 16, outptr += 16) { ++ ++#ifdef NEED_ALIGN8 ++ ALIGN8_READ16(this0, src0, 0) src0_ptr += 2; ++ ALIGN8_READ16(this1, src1, 0) src1_ptr += 2; ++#else ++ this0 = VEC_LD(inptr0); inptr0 += 16; ++ this1 = VEC_LD(inptr1); inptr1 += 16; ++#endif ++ this0e = _mm_and_si128(this0, even_mask); ++ this1e = _mm_and_si128(this1, even_mask); ++ this0o = _mm_srli_epi16(this0, 8); ++ this1o = _mm_srli_epi16(this1, 8); ++ out0l = _mm_add_epi16(this0e, this0o); ++ out1l = _mm_add_epi16(this1e, this1o); ++ ++ outl = _mm_add_epi16(out0l, out1l); ++ outl = _mm_srli_epi16(_mm_add_epi16(outl, pw_bias), 2); ++ ++#ifdef NEED_ALIGN8 ++ ALIGN8_READ16(next0, src0, 0) src0_ptr += 2; ++ ALIGN8_READ16(next1, src1, 0) src1_ptr += 2; ++#else ++ next0 = VEC_LD(inptr0); inptr0 += 16; ++ next1 = VEC_LD(inptr1); inptr1 += 16; ++#endif ++ next0e = _mm_and_si128(next0, even_mask); ++ next1e = _mm_and_si128(next1, even_mask); ++ next0o = _mm_srli_epi16(next0, 8); ++ next1o = _mm_srli_epi16(next1, 8); ++ out0h = _mm_add_epi16(next0e, next0o); ++ out1h = _mm_add_epi16(next1e, next1o); ++ ++ outh = _mm_add_epi16(out0h, out1h); ++ outh = _mm_srli_epi16(_mm_add_epi16(outh, pw_bias), 2); ++ ++ out = _mm_packus_epi16(outl, outh); ++ VEC_ST(outptr, out); ++ } ++ if (outcol > 0) { ++#ifdef NEED_ALIGN8 ++ ALIGN8_READ16(this0, src0, 0) ++ ALIGN8_READ16(this1, src1, 0) ++#else ++ this0 = VEC_LD(inptr0); ++ this1 = VEC_LD(inptr1); ++#endif ++ this0e = _mm_and_si128(this0, even_mask); ++ this1e = _mm_and_si128(this1, even_mask); ++ this0o = _mm_srli_epi16(this0, 8); ++ this1o = _mm_srli_epi16(this1, 8); ++ out0l = _mm_add_epi16(this0e, this0o); ++ out1l = _mm_add_epi16(this1e, this1o); ++ ++ outl = _mm_add_epi16(out0l, out1l); ++ outl = _mm_srli_epi16(_mm_add_epi16(outl, pw_bias), 2); ++ ++ out = _mm_packus_epi16(outl, outl); ++ VEC_STL(outptr, out); ++ } ++ } ++} +diff --git a/simd/e2k/jcsample.h b/simd/e2k/jcsample.h +new file mode 100644 +index 0000000..2ac4816 +--- /dev/null ++++ b/simd/e2k/jcsample.h +@@ -0,0 +1,28 @@ ++/* ++ * jcsample.h ++ * ++ * This file was part of the Independent JPEG Group's software: ++ * Copyright (C) 1991-1996, Thomas G. Lane. ++ * For conditions of distribution and use, see the accompanying README.ijg ++ * file. 
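++ *
++ * expand_right_edge() below replicates the last sample of each row out
++ * to output_cols, so the 16-byte vector loads in jcsample-e2k.c never
++ * average in bytes from beyond the image edge.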
++ */ ++ ++LOCAL(void) ++expand_right_edge(JSAMPARRAY image_data, int num_rows, JDIMENSION input_cols, ++ JDIMENSION output_cols) ++{ ++ register JSAMPROW ptr; ++ register JSAMPLE pixval; ++ register int count; ++ int row; ++ int numcols = (int)(output_cols - input_cols); ++ ++ if (numcols > 0) { ++ for (row = 0; row < num_rows; row++) { ++ ptr = image_data[row] + input_cols; ++ pixval = ptr[-1]; /* don't need GETJSAMPLE() here */ ++ for (count = numcols; count > 0; count--) ++ *ptr++ = pixval; ++ } ++ } ++} +diff --git a/simd/e2k/jdcolext-e2k.c b/simd/e2k/jdcolext-e2k.c +new file mode 100644 +index 0000000..4f12aef +--- /dev/null ++++ b/simd/e2k/jdcolext-e2k.c +@@ -0,0 +1,258 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2015, D. R. Commander. All Rights Reserved. ++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. ++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. ++ */ ++ ++/* This file is included by jdcolor-e2k.c */ ++ ++void ycc_rgbn_convert(JDIMENSION out_width, JSAMPIMAGE input_buf, ++ JDIMENSION input_row, JSAMPARRAY output_buf, ++ int num_rows, int shuf_idx) ++{ ++ JSAMPROW outptr, inptr0, inptr1, inptr2; ++ uint8_t __attribute__((aligned(16))) tmpbuf[PIXELSIZE * 16]; ++ ++ __m128i rgb0, rgb1, rgb2, rgb3, y, cb, cr; ++ __m128i rg0, rg1, bx0, bx1, yl, yh, cbl, cbh, ++ crl, crh, rl, rh, gl, gh, bl, bh, g0w, g1w, g2w, g3w; ++ __m128i g0, g1, g2, g3; ++ ++ /* Constants ++ * NOTE: The >> 1 is to compensate for the fact that vec_madds() returns 17 ++ * high-order bits, not 16. 
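++ * With _mm_mulhrs_epi16 the product is (a * b + 0x4000) >> 15, so the
++ * same compensation applies: halving the Q16 constants preserves the
++ * intended coefficient/65536 scaling.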
++ */ ++ __m128i pw_f0402 = _mm_set1_epi16(F_0_402 >> 1), ++ pw_mf0228 = _mm_set1_epi16(-F_0_228 >> 1), ++ pw_mf0344_f0285 = _mm_setr_epi16(__4X2(-F_0_344, F_0_285)), ++ pb_255 = _mm_set1_epi8(-1), ++ pw_cj = _mm_set1_epi16(CENTERJSAMPLE), ++ pd_onehalf = _mm_set1_epi32(ONE_HALF), ++ pb_zero = _mm_setzero_si128(); ++ RGB_SHUFFLE_INIT ++#ifdef NEED_ALIGN8 ++ ALIGN8_COMMON ++ ALIGN8_VARS(src0) ++ ALIGN8_VARS(src1) ++ ALIGN8_VARS(src2) ++#endif ++ ++ if (out_width > 0) ++ while (--num_rows >= 0) { ++ int num_cols; ++ inptr0 = input_buf[0][input_row]; ++ inptr1 = input_buf[1][input_row]; ++ inptr2 = input_buf[2][input_row]; ++ input_row++; ++ outptr = *output_buf++; ++ ++ if (out_width >= 16) { ++#ifdef NEED_ALIGN8 ++ ALIGN8_START(inptr0, src0) ++ ALIGN8_START(inptr1, src1) ++ ALIGN8_START(inptr2, src2) ++ inptr0 += out_width & -16; ++ inptr1 += out_width & -16; ++ inptr2 += out_width & -16; ++#endif ++ PRAGMA_E2K("ivdep") ++ for (num_cols = out_width; num_cols >= 16; ++ num_cols -= 16, outptr += PIXELSIZE * 16) { ++#ifdef NEED_ALIGN8 ++ ALIGN8_READ16(y, src0, 0) src0_ptr += 2; ++ ALIGN8_READ16(cb, src1, 0) src1_ptr += 2; ++ ALIGN8_READ16(cr, src2, 0) src2_ptr += 2; ++#else ++ y = VEC_LD(inptr0); inptr0 += 16; ++ cb = VEC_LD(inptr1); inptr1 += 16; ++ cr = VEC_LD(inptr2); inptr2 += 16; ++#endif ++ CALC_RGB ++ RGB_SHUFFLE ++ VEC_ST(outptr, rgb0); ++ VEC_ST(outptr + 16, rgb1); ++ VEC_ST(outptr + 32, rgb2); ++#if PIXELSIZE == 4 ++ VEC_ST(outptr + 48, rgb3); ++#endif ++ } ++ } ++ ++ num_cols = out_width & 15; ++ if (num_cols) { ++ int i; ++ for (i = 0; i < num_cols; i++) { ++ tmpbuf[i] = inptr0[i]; ++ tmpbuf[i + 16] = inptr1[i]; ++ tmpbuf[i + 32] = inptr2[i]; ++ } ++ y = VEC_LD(tmpbuf); ++ cb = VEC_LD(tmpbuf + 16); ++ cr = VEC_LD(tmpbuf + 32); ++ CALC_RGB ++ RGB_SHUFFLE ++ VEC_ST(tmpbuf, rgb0); ++ VEC_ST(tmpbuf + 16, rgb1); ++ VEC_ST(tmpbuf + 32, rgb2); ++#if PIXELSIZE == 4 ++ VEC_ST(tmpbuf + 48, rgb3); ++#endif ++ memcpy(outptr, tmpbuf, num_cols * PIXELSIZE); ++ } ++ } ++} ++ ++void ycc_rgbn_merged(JDIMENSION out_width, JSAMPIMAGE input_buf, ++ JDIMENSION in_row_group_ctr, ++ JDIMENSION in_row_group_ctr_y, ++ JSAMPARRAY output_buf, int shuf_idx) ++{ ++ JSAMPROW outptr, inptr0, inptr1, inptr2; ++ int num_cols; ++ uint8_t __attribute__((aligned(16))) tmpbuf[4 * 16]; ++ ++ __m128i rgb0, rgb1, rgb2, rgb3, y, cb, cr; ++ __m128i rg0, rg1, bx0, bx1, yl, yh, cbl, cbh, ++ crl, crh, r_yl, r_yh, g_yl, g_yh, b_yl, b_yh, g_y0w, g_y1w, g_y2w, g_y3w, ++ rl, rh, gl, gh, bl, bh; ++ __m128i g_y0, g_y1, g_y2, g_y3; ++ ++ /* Constants ++ * NOTE: The >> 1 is to compensate for the fact that vec_madds() returns 17 ++ * high-order bits, not 16. 
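++ *
++ * CALC_MERGED1 evaluates the Cb/Cr contributions once per 16 chroma
++ * samples; CALC_MERGED2 then duplicates each 16-bit term with
++ * _mm_unpacklo/hi_epi16(x, x) so it feeds both horizontally adjacent
++ * Y samples.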
++ */ ++ __m128i pw_f0402 = _mm_set1_epi16(F_0_402 >> 1), ++ pw_mf0228 = _mm_set1_epi16(-F_0_228 >> 1), ++ pw_mf0344_f0285 = _mm_setr_epi16(__4X2(-F_0_344, F_0_285)), ++ pb_255 = _mm_set1_epi8(-1), ++ pw_cj = _mm_set1_epi16(CENTERJSAMPLE), ++ pd_onehalf = _mm_set1_epi32(ONE_HALF), ++ pb_zero = _mm_setzero_si128(); ++ RGB_SHUFFLE_INIT ++#ifdef NEED_ALIGN8 ++ ALIGN8_COMMON ++ ALIGN8_VARS(src0) ++ ALIGN8_VARS(src1) ++ ALIGN8_VARS(src2) ++#endif ++ ++ inptr0 = input_buf[0][in_row_group_ctr_y]; ++ inptr1 = input_buf[1][in_row_group_ctr]; ++ inptr2 = input_buf[2][in_row_group_ctr]; ++ outptr = output_buf[0]; ++ ++ if (out_width >= 32) { ++#ifdef NEED_ALIGN8 ++ ALIGN8_START(inptr0, src0) ++ ALIGN8_START(inptr1, src1) ++ ALIGN8_START(inptr2, src2) ++ inptr0 += out_width & -32; ++ inptr1 += (out_width & -32) >> 1; ++ inptr2 += (out_width & -32) >> 1; ++#endif ++ PRAGMA_E2K("ivdep") ++ for (num_cols = out_width; num_cols >= 32; num_cols -= 32) { ++ ++#ifdef NEED_ALIGN8 ++ ALIGN8_READ16(cb, src1, 0) src1_ptr += 2; ++ ALIGN8_READ16(cr, src2, 0) src2_ptr += 2; ++#else ++ cb = VEC_LD(inptr1); inptr1 += 16; ++ cr = VEC_LD(inptr2); inptr2 += 16; ++#endif ++ CALC_MERGED1 ++ ++#ifdef NEED_ALIGN8 ++ ALIGN8_READ16(y, src0, 0) src0_ptr += 2; ++#else ++ y = VEC_LD(inptr0); inptr0 += 16; ++#endif ++ CALC_MERGED2(r_yl, g_yl, b_yl) ++ RGB_SHUFFLE ++ VEC_ST(outptr, rgb0); ++ VEC_ST(outptr + 16, rgb1); ++ VEC_ST(outptr + 32, rgb2); ++#if PIXELSIZE == 4 ++ VEC_ST(outptr + 48, rgb3); ++#endif ++ outptr += PIXELSIZE * 16; ++ ++#ifdef NEED_ALIGN8 ++ ALIGN8_READ16(y, src0, 0) src0_ptr += 2; ++#else ++ y = VEC_LD(inptr0); inptr0 += 16; ++#endif ++ CALC_MERGED2(r_yh, g_yh, b_yh) ++ RGB_SHUFFLE ++ VEC_ST(outptr, rgb0); ++ VEC_ST(outptr + 16, rgb1); ++ VEC_ST(outptr + 32, rgb2); ++#if PIXELSIZE == 4 ++ VEC_ST(outptr + 48, rgb3); ++#endif ++ outptr += PIXELSIZE * 16; ++ } ++ } ++ ++ num_cols = out_width & 31; ++ if (num_cols) { ++ int i; ++ for (i = 0; i < (num_cols + 1) >> 1; i++) { ++ tmpbuf[i] = inptr1[i]; ++ tmpbuf[i + 16] = inptr2[i]; ++ tmpbuf[i * 2 + 32] = inptr0[i * 2]; ++ tmpbuf[i * 2 + 32 + 1] = inptr0[i * 2 + 1]; ++ } ++ cb = VEC_LD(tmpbuf); ++ cr = VEC_LD(tmpbuf + 16); ++ CALC_MERGED1 ++ ++ y = VEC_LD(tmpbuf + 32); ++ CALC_MERGED2(r_yl, g_yl, b_yl) ++ RGB_SHUFFLE ++ if (num_cols >= 16) { ++ VEC_ST(outptr, rgb0); ++ VEC_ST(outptr + 16, rgb1); ++ VEC_ST(outptr + 32, rgb2); ++#if PIXELSIZE == 4 ++ VEC_ST(outptr + 48, rgb3); ++#endif ++ outptr += PIXELSIZE * 16; ++ ++ y = VEC_LD(tmpbuf + 48); ++ CALC_MERGED2(r_yh, g_yh, b_yh) ++ RGB_SHUFFLE ++ } ++ VEC_ST(tmpbuf, rgb0); ++ VEC_ST(tmpbuf + 16, rgb1); ++ VEC_ST(tmpbuf + 32, rgb2); ++#if PIXELSIZE == 4 ++ VEC_ST(tmpbuf + 48, rgb3); ++#endif ++ memcpy(outptr, tmpbuf, (out_width & 15) * PIXELSIZE); ++ } ++} ++ ++#undef RGB_SHUFFLE_INIT ++#undef RGB_SHUFFLE ++#undef PIXELSIZE ++#undef ycc_rgbn_convert ++#undef ycc_rgbn_merged ++ +diff --git a/simd/e2k/jdcolor-e2k.c b/simd/e2k/jdcolor-e2k.c +new file mode 100644 +index 0000000..94c80e9 +--- /dev/null ++++ b/simd/e2k/jdcolor-e2k.c +@@ -0,0 +1,289 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2015, D. R. Commander. All Rights Reserved. ++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. 
++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. ++ */ ++ ++/* YCC --> RGB CONVERSION */ ++ ++#include "jsimd_e2k.h" ++ ++#define F_0_344 22554 /* FIX(0.34414) */ ++#define F_0_714 46802 /* FIX(0.71414) */ ++#define F_1_402 91881 /* FIX(1.40200) */ ++#define F_1_772 116130 /* FIX(1.77200) */ ++#define F_0_402 (F_1_402 - 65536) /* FIX(1.40200) - FIX(1) */ ++#define F_0_285 (65536 - F_0_714) /* FIX(1) - FIX(0.71414) */ ++#define F_0_228 (131072 - F_1_772) /* FIX(2) - FIX(1.77200) */ ++ ++#define SCALEBITS 16 ++#define ONE_HALF (1 << (SCALEBITS - 1)) ++ ++static const uint8_t __attribute__((aligned(16))) ++#if defined(__iset__) && __iset__ >= 5 ++ycc_rgb_shuf_const[7][48] = { ++#define SHUF_CONST3 \ ++ C0, C1, C2, \ ++ C0 + 4, C1 + 4, C2 + 4, \ ++ C0 + 8, C1 + 8, C2 + 8, \ ++ C0 + 12, C1 + 12, C2 + 12, \ ++ C0 + 16, C1 + 16, C2 + 16, \ ++ C0 + 20, C1 + 4, C2 + 4, \ ++ C0 + 8, C1 + 8, C2 + 8, \ ++ C0 + 12, C1 + 12, C2 + 12, \ ++ C0 + 16, C1 + 16, C2 + 16, \ ++ C0 + 20, C1 + 20, C2 + 20, \ ++ C0 + 24, C1 + 24, C2 + 8, \ ++ C0 + 12, C1 + 12, C2 + 12, \ ++ C0 + 16, C1 + 16, C2 + 16, \ ++ C0 + 20, C1 + 20, C2 + 20, \ ++ C0 + 24, C1 + 24, C2 + 24, \ ++ C0 + 28, C1 + 28, C2 + 28 ++#else ++ycc_rgb_shuf_const[7][24] = { ++#define SHUF_CONST3 \ ++ C0, C1, C2, \ ++ C0 + 4, C1 + 4, C2 + 4, \ ++ C0 + 8, C1 + 8, C2, \ ++ C0 + 4, C1 + 4, C2 + 4, \ ++ C0 + 8, C1 + 8, C2 + 8, \ ++ C0 + 12, C1 + 4, C2 + 4, \ ++ C0 + 8, C1 + 8, C2 + 8, \ ++ C0 + 12, C1 + 12, C2 + 12 ++#endif ++ ++#define SHUF_CONST4 C0, C1, C2, C3, C0 + 4, C1 + 4, C2 + 4, C3 + 4, \ ++ C0 + 8, C1 + 8, C2 + 8, C3 + 8, C0 + 12, C1 + 12, C2 + 12, C3 + 12 ++ ++#define TMP_RED RGB_RED ++#define TMP_GREEN RGB_GREEN ++#define TMP_BLUE RGB_BLUE ++#define PIXELSIZE RGB_PIXELSIZE ++#include "jdcoltab-e2k.c" ++ , ++#define TMP_RED EXT_RGB_RED ++#define TMP_GREEN EXT_RGB_GREEN ++#define TMP_BLUE EXT_RGB_BLUE ++#define PIXELSIZE EXT_RGB_PIXELSIZE ++#include "jdcoltab-e2k.c" ++ , ++#define TMP_RED EXT_RGBX_RED ++#define TMP_GREEN EXT_RGBX_GREEN ++#define TMP_BLUE EXT_RGBX_BLUE ++#define PIXELSIZE EXT_RGBX_PIXELSIZE ++#include "jdcoltab-e2k.c" ++ , ++#define TMP_RED EXT_BGR_RED ++#define TMP_GREEN EXT_BGR_GREEN ++#define TMP_BLUE EXT_BGR_BLUE ++#define PIXELSIZE EXT_BGR_PIXELSIZE ++#include "jdcoltab-e2k.c" ++ , ++#define TMP_RED EXT_BGRX_RED ++#define TMP_GREEN EXT_BGRX_GREEN ++#define TMP_BLUE EXT_BGRX_BLUE ++#define PIXELSIZE EXT_BGRX_PIXELSIZE ++#include "jdcoltab-e2k.c" ++ , ++#define TMP_RED EXT_XBGR_RED ++#define TMP_GREEN EXT_XBGR_GREEN ++#define TMP_BLUE EXT_XBGR_BLUE ++#define PIXELSIZE EXT_XBGR_PIXELSIZE ++#include "jdcoltab-e2k.c" ++ , ++#define TMP_RED EXT_XRGB_RED ++#define TMP_GREEN EXT_XRGB_GREEN ++#define TMP_BLUE EXT_XRGB_BLUE ++#define PIXELSIZE EXT_XRGB_PIXELSIZE ++#include "jdcoltab-e2k.c" ++}; ++ ++ /* (Original) ++ * R = Y + 1.40200 * Cr ++ * G = Y - 0.34414 * Cb - 0.71414 * Cr ++ * B = Y + 1.77200 * Cb ++ * 
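++ * The 1.40200 and 1.77200 coefficients do not fit a signed Q15
++ * multiplier for _mm_mulhrs_epi16, so each is split into a fractional
++ * part plus whole additions of Cr or Cb:
++ *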
++ * (This implementation) ++ * R = Y + 0.40200 * Cr + Cr ++ * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr ++ * B = Y - 0.22800 * Cb + Cb + Cb ++ */ ++ ++#define CALC_RGB \ ++ yl = _mm_unpacklo_epi8(y, pb_zero); \ ++ yh = _mm_unpackhi_epi8(y, pb_zero); \ ++ \ ++ cbl = _mm_unpacklo_epi8(cb, pb_zero); \ ++ cbh = _mm_unpackhi_epi8(cb, pb_zero); \ ++ cbl = _mm_sub_epi16(cbl, pw_cj); \ ++ cbh = _mm_sub_epi16(cbh, pw_cj); \ ++ \ ++ crl = _mm_unpacklo_epi8(cr, pb_zero); \ ++ crh = _mm_unpackhi_epi8(cr, pb_zero); \ ++ crl = _mm_sub_epi16(crl, pw_cj); \ ++ crh = _mm_sub_epi16(crh, pw_cj); \ ++ \ ++ bl = _mm_mulhrs_epi16(cbl, pw_mf0228); \ ++ bh = _mm_mulhrs_epi16(cbh, pw_mf0228); \ ++ bl = _mm_add_epi16(bl, _mm_add_epi16(cbl, cbl)); \ ++ bh = _mm_add_epi16(bh, _mm_add_epi16(cbh, cbh)); \ ++ bl = _mm_add_epi16(bl, yl); \ ++ bh = _mm_add_epi16(bh, yh); \ ++ \ ++ rl = _mm_mulhrs_epi16(crl, pw_f0402); \ ++ rh = _mm_mulhrs_epi16(crh, pw_f0402); \ ++ rl = _mm_add_epi16(rl, crl); \ ++ rh = _mm_add_epi16(rh, crh); \ ++ rl = _mm_add_epi16(rl, yl); \ ++ rh = _mm_add_epi16(rh, yh); \ ++ \ ++ g0w = _mm_unpacklo_epi16(cbl, crl); \ ++ g1w = _mm_unpackhi_epi16(cbl, crl); \ ++ g0 = _mm_add_epi32(_mm_madd_epi16(g0w, pw_mf0344_f0285), pd_onehalf); \ ++ g1 = _mm_add_epi32(_mm_madd_epi16(g1w, pw_mf0344_f0285), pd_onehalf); \ ++ g2w = _mm_unpacklo_epi16(cbh, crh); \ ++ g3w = _mm_unpackhi_epi16(cbh, crh); \ ++ g2 = _mm_add_epi32(_mm_madd_epi16(g2w, pw_mf0344_f0285), pd_onehalf); \ ++ g3 = _mm_add_epi32(_mm_madd_epi16(g3w, pw_mf0344_f0285), pd_onehalf); \ ++ \ ++ gl = _mm_packhi_epi32(g0, g1); \ ++ gh = _mm_packhi_epi32(g2, g3); \ ++ gl = _mm_sub_epi16(gl, crl); \ ++ gh = _mm_sub_epi16(gh, crh); \ ++ gl = _mm_add_epi16(gl, yl); \ ++ gh = _mm_add_epi16(gh, yh); \ ++ \ ++ rl = _mm_packus_epi16(rl, rh); \ ++ gl = _mm_packus_epi16(gl, gh); \ ++ bl = _mm_packus_epi16(bl, bh); \ ++ \ ++ rg0 = _mm_unpacklo_epi8(rl, gl); \ ++ rg1 = _mm_unpackhi_epi8(rl, gl); \ ++ bx0 = _mm_unpacklo_epi8(bl, pb_255); \ ++ bx1 = _mm_unpackhi_epi8(bl, pb_255); \ ++ \ ++ rgb0 = _mm_unpacklo_epi16(rg0, bx0); \ ++ rgb1 = _mm_unpackhi_epi16(rg0, bx0); \ ++ rgb2 = _mm_unpacklo_epi16(rg1, bx1); \ ++ rgb3 = _mm_unpackhi_epi16(rg1, bx1); ++ ++#define CALC_MERGED1 \ ++ cbl = _mm_unpacklo_epi8(cb, pb_zero); \ ++ cbh = _mm_unpackhi_epi8(cb, pb_zero); \ ++ cbl = _mm_sub_epi16(cbl, pw_cj); \ ++ cbh = _mm_sub_epi16(cbh, pw_cj); \ ++ \ ++ crl = _mm_unpacklo_epi8(cr, pb_zero); \ ++ crh = _mm_unpackhi_epi8(cr, pb_zero); \ ++ crl = _mm_sub_epi16(crl, pw_cj); \ ++ crh = _mm_sub_epi16(crh, pw_cj); \ ++ \ ++ b_yl = _mm_mulhrs_epi16(cbl, pw_mf0228); \ ++ b_yh = _mm_mulhrs_epi16(cbh, pw_mf0228); \ ++ b_yl = _mm_add_epi16(b_yl, _mm_add_epi16(cbl, cbl)); \ ++ b_yh = _mm_add_epi16(b_yh, _mm_add_epi16(cbh, cbh)); \ ++ \ ++ r_yl = _mm_mulhrs_epi16(crl, pw_f0402); \ ++ r_yh = _mm_mulhrs_epi16(crh, pw_f0402); \ ++ r_yl = _mm_add_epi16(r_yl, crl); \ ++ r_yh = _mm_add_epi16(r_yh, crh); \ ++ \ ++ g_y0w = _mm_unpacklo_epi16(cbl, crl); \ ++ g_y1w = _mm_unpackhi_epi16(cbl, crl); \ ++ g_y0 = _mm_add_epi32(_mm_madd_epi16(g_y0w, pw_mf0344_f0285), pd_onehalf); \ ++ g_y1 = _mm_add_epi32(_mm_madd_epi16(g_y1w, pw_mf0344_f0285), pd_onehalf); \ ++ g_y2w = _mm_unpacklo_epi16(cbh, crh); \ ++ g_y3w = _mm_unpackhi_epi16(cbh, crh); \ ++ g_y2 = _mm_add_epi32(_mm_madd_epi16(g_y2w, pw_mf0344_f0285), pd_onehalf); \ ++ g_y3 = _mm_add_epi32(_mm_madd_epi16(g_y3w, pw_mf0344_f0285), pd_onehalf); \ ++ \ ++ g_yl = _mm_packhi_epi32(g_y0, g_y1); \ ++ g_yh = _mm_packhi_epi32(g_y2, g_y3); \ ++ g_yl = 
_mm_sub_epi16(g_yl, crl); \ ++ g_yh = _mm_sub_epi16(g_yh, crh); ++ ++#define CALC_MERGED2(r_yl, g_yl, b_yl) \ ++ yl = _mm_unpacklo_epi8(y, pb_zero); \ ++ yh = _mm_unpackhi_epi8(y, pb_zero); \ ++ bl = _mm_add_epi16(_mm_unpacklo_epi16(b_yl, b_yl), yl); \ ++ bh = _mm_add_epi16(_mm_unpackhi_epi16(b_yl, b_yl), yh); \ ++ rl = _mm_add_epi16(_mm_unpacklo_epi16(r_yl, r_yl), yl); \ ++ rh = _mm_add_epi16(_mm_unpackhi_epi16(r_yl, r_yl), yh); \ ++ gl = _mm_add_epi16(_mm_unpacklo_epi16(g_yl, g_yl), yl); \ ++ gh = _mm_add_epi16(_mm_unpackhi_epi16(g_yl, g_yl), yh); \ ++ rl = _mm_packus_epi16(rl, rh); \ ++ gl = _mm_packus_epi16(gl, gh); \ ++ bl = _mm_packus_epi16(bl, bh); \ ++ \ ++ rg0 = _mm_unpacklo_epi8(rl, gl); \ ++ rg1 = _mm_unpackhi_epi8(rl, gl); \ ++ bx0 = _mm_unpacklo_epi8(bl, pb_255); \ ++ bx1 = _mm_unpackhi_epi8(bl, pb_255); \ ++ \ ++ rgb0 = _mm_unpacklo_epi16(rg0, bx0); \ ++ rgb1 = _mm_unpackhi_epi16(rg0, bx0); \ ++ rgb2 = _mm_unpacklo_epi16(rg1, bx1); \ ++ rgb3 = _mm_unpackhi_epi16(rg1, bx1); ++ ++#define PIXELSIZE 3 ++#if defined(__iset__) && __iset__ >= 5 ++#define RGB_SHUFFLE_INIT __m128i \ ++ rgb_index0 = VEC_LD(ycc_rgb_shuf_const[shuf_idx]), \ ++ rgb_index1 = VEC_LD(ycc_rgb_shuf_const[shuf_idx] + 16), \ ++ rgb_index2 = VEC_LD(ycc_rgb_shuf_const[shuf_idx] + 32); ++#define RGB_SHUFFLE \ ++ rgb0 = _mm_shuffle2_epi8(rgb0, rgb1, rgb_index0); \ ++ rgb1 = _mm_shuffle2_epi8(rgb1, rgb2, rgb_index1); \ ++ rgb2 = _mm_shuffle2_epi8(rgb2, rgb3, rgb_index2); ++#else ++#define RGB_SHUFFLE_INIT __m64 \ ++ rgb_index0 = *(__m64*)ycc_rgb_shuf_const[shuf_idx], \ ++ rgb_index1 = *(__m64*)(ycc_rgb_shuf_const[shuf_idx] + 8), \ ++ rgb_index2 = *(__m64*)(ycc_rgb_shuf_const[shuf_idx] + 16); ++#define RGB_SHUFFLE { \ ++ union { __m128i v; __m64 d[2]; } a = { rgb0 }, \ ++ b = { rgb1 }, c = { rgb2 }, d = { rgb3 }; \ ++ a.d[0] = _mm_shuffle2_pi8(a.d[0], a.d[1], rgb_index0); \ ++ a.d[1] = _mm_shuffle2_pi8(a.d[1], b.d[0], rgb_index1); \ ++ b.d[0] = _mm_shuffle2_pi8(b.d[0], b.d[1], rgb_index2); \ ++ b.d[1] = _mm_shuffle2_pi8(c.d[0], c.d[1], rgb_index0); \ ++ c.d[0] = _mm_shuffle2_pi8(c.d[1], d.d[0], rgb_index1); \ ++ c.d[1] = _mm_shuffle2_pi8(d.d[0], d.d[1], rgb_index2); \ ++ rgb0 = a.v; rgb1 = b.v; rgb2 = c.v; \ ++} ++#endif ++ ++#define ycc_rgbn_convert jsimd_ycc_rgb3_convert_e2k ++#define ycc_rgbn_merged jsimd_ycc_rgb3_merged_upsample_e2k ++#include "jdcolext-e2k.c" ++ ++#define PIXELSIZE 4 ++#define RGB_SHUFFLE_INIT __m128i \ ++ rgb_index0 = VEC_LD(ycc_rgb_shuf_const[shuf_idx]); ++#define RGB_SHUFFLE \ ++ rgb0 = _mm_shuffle_epi8(rgb0, rgb_index0); \ ++ rgb1 = _mm_shuffle_epi8(rgb1, rgb_index0); \ ++ rgb2 = _mm_shuffle_epi8(rgb2, rgb_index0); \ ++ rgb3 = _mm_shuffle_epi8(rgb3, rgb_index0); ++ ++#define ycc_rgbn_convert jsimd_ycc_rgb4_convert_e2k ++#define ycc_rgbn_merged jsimd_ycc_rgb4_merged_upsample_e2k ++#include "jdcolext-e2k.c" ++ +diff --git a/simd/e2k/jdcoltab-e2k.c b/simd/e2k/jdcoltab-e2k.c +new file mode 100644 +index 0000000..e19666d +--- /dev/null ++++ b/simd/e2k/jdcoltab-e2k.c +@@ -0,0 +1,80 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. 
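[For the PIXELSIZE == 3 case above, CALC_RGB/CALC_MERGED2 leave 16 pixels as 64 bytes of R,G,B,255 quads in rgb0..rgb3, and RGB_SHUFFLE compacts them to 48 packed RGB bytes, so only three stores are needed instead of four. A standalone scalar model of that compaction; the plain loop stands in for the three _mm_shuffle2_epi8 (or six _mm_shuffle2_pi8) operations:

#include <assert.h>

int main(void)
{
  unsigned char rgbx[64], rgb[48];
  int p;
  for (p = 0; p < 16; p++) {        /* 16 pixels of R, G, B, filler */
    rgbx[4 * p + 0] = (unsigned char)(3 * p + 0);
    rgbx[4 * p + 1] = (unsigned char)(3 * p + 1);
    rgbx[4 * p + 2] = (unsigned char)(3 * p + 2);
    rgbx[4 * p + 3] = 255;
  }
  for (p = 0; p < 16; p++) {        /* drop every fourth byte */
    rgb[3 * p + 0] = rgbx[4 * p + 0];
    rgb[3 * p + 1] = rgbx[4 * p + 1];
    rgb[3 * p + 2] = rgbx[4 * p + 2];
  }
  for (p = 0; p < 48; p++)
    assert(rgb[p] == p);            /* 48 contiguous RGB bytes */
  return 0;
}]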
++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. ++ */ ++ ++/* This file is included by jdcolor-e2k.c */ ++ ++#if TMP_RED == 0 ++#define C0 0 ++#elif TMP_GREEN == 0 ++#define C0 1 ++#elif TMP_BLUE == 0 ++#define C0 2 ++#else ++#define C0 3 ++#endif ++ ++#if TMP_RED == 1 ++#define C1 0 ++#elif TMP_GREEN == 1 ++#define C1 1 ++#elif TMP_BLUE == 1 ++#define C1 2 ++#else ++#define C1 3 ++#endif ++ ++#if TMP_RED == 2 ++#define C2 0 ++#elif TMP_GREEN == 2 ++#define C2 1 ++#elif TMP_BLUE == 2 ++#define C2 2 ++#else ++#define C2 3 ++#endif ++ ++#if TMP_RED == 3 ++#define C3 0 ++#elif TMP_GREEN == 3 ++#define C3 1 ++#elif TMP_BLUE == 3 ++#define C3 2 ++#else ++#define C3 3 ++#endif ++ ++#if PIXELSIZE == 3 ++{ SHUF_CONST3 } ++#else ++{ SHUF_CONST4 } ++#endif ++ ++#undef C0 ++#undef C1 ++#undef C2 ++#undef C3 ++ ++#undef TMP_RED ++#undef TMP_GREEN ++#undef TMP_BLUE ++#undef PIXELSIZE ++ +diff --git a/simd/e2k/jdsample-e2k.c b/simd/e2k/jdsample-e2k.c +new file mode 100644 +index 0000000..572b3af +--- /dev/null ++++ b/simd/e2k/jdsample-e2k.c +@@ -0,0 +1,389 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2015, D. R. Commander. All Rights Reserved. ++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. ++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. 
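[The #if ladders in jdcoltab-e2k.c above compute, for each output byte position 0..3 of a pixel, which lane of the internal {R, G, B, 255} quad to fetch. Worked through for one concrete format, EXT_BGRX (RED=2, GREEN=1, BLUE=0), they resolve to C0=2, C1=1, C2=0, C3=3. A standalone scalar equivalent of that lookup:

#include <assert.h>

int main(void)
{
  const unsigned char quad[4] = { 10, 20, 30, 255 };  /* R, G, B, filler */
  const int C[4] = { 2, 1, 0, 3 };  /* C0..C3 as resolved for EXT_BGRX */
  unsigned char out[4];
  int i;
  for (i = 0; i < 4; i++)
    out[i] = quad[C[i]];            /* output byte i <- quad lane C[i] */
  assert(out[0] == 30 && out[1] == 20 && out[2] == 10 && out[3] == 255);
  return 0;
}]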
++ */ ++ ++/* CHROMA UPSAMPLING */ ++ ++#include "jsimd_e2k.h" ++ ++ ++void jsimd_h2v1_fancy_upsample_e2k(int max_v_samp_factor, ++ JDIMENSION downsampled_width, ++ JSAMPARRAY input_data, ++ JSAMPARRAY *output_data_ptr) ++{ ++ JSAMPARRAY output_data = *output_data_ptr; ++ JSAMPROW inptr, outptr; ++ int inrow, incol; ++ ++ __m128i pb_zero = _mm_setzero_si128(); ++ __m128i this0, last0, p_last0, next0 = pb_zero, p_next0, out; ++ __m128i this0l, this0h, last0l, last0h, ++ next0l, next0h, outle, outhe, outlo, outho; ++#ifdef NEED_ALIGN8 ++ ALIGN8_COMMON ++ ALIGN8_VARS(src) ++#endif ++ ++ /* Constants */ ++ __m128i pw_three = _mm_set1_epi16(3), ++ next_index_lastcol = _mm_setr_epi8( ++ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15), ++ pw_one = _mm_set1_epi16(1), pw_two = _mm_set1_epi16(2); ++ ++ if (downsampled_width > 0) ++ for (inrow = 0; inrow < max_v_samp_factor; inrow++) { ++ inptr = input_data[inrow]; ++ outptr = output_data[inrow]; ++ ++ if (downsampled_width & 15) ++ inptr[downsampled_width] = inptr[downsampled_width - 1]; ++ ++#ifdef NEED_ALIGN8 ++ ALIGN8_START(inptr, src) ++ ALIGN8_READ16(this0, src, 0) ++#else ++ this0 = VEC_LD(inptr); ++#endif ++ last0 = _mm_bslli_si128(this0, 15); ++ ++ PRAGMA_E2K("ivdep") ++ for (incol = downsampled_width; incol > 0; ++ incol -= 16, outptr += 32) { ++ ++ p_last0 = _mm_alignr_epi8(this0, last0, 15); ++ last0 = this0; ++ ++ if (__builtin_expect(incol <= 16, 0)) ++ p_next0 = _mm_shuffle_epi8(this0, next_index_lastcol); ++ else { ++#ifdef NEED_ALIGN8 ++ ALIGN8_READ16(next0, src, 1) src_ptr += 2; ++#else ++ next0 = VEC_LD(inptr + 16); inptr += 16; ++#endif ++ p_next0 = _mm_alignr_epi8(next0, this0, 1); ++ } ++ ++ this0l = _mm_mullo_epi16(_mm_unpacklo_epi8(this0, pb_zero), pw_three); ++ last0l = _mm_unpacklo_epi8(p_last0, pb_zero); ++ next0l = _mm_unpacklo_epi8(p_next0, pb_zero); ++ last0l = _mm_add_epi16(last0l, pw_one); ++ next0l = _mm_add_epi16(next0l, pw_two); ++ ++ outle = _mm_add_epi16(this0l, last0l); ++ outlo = _mm_add_epi16(this0l, next0l); ++ outle = _mm_srli_epi16(outle, 2); ++ outlo = _mm_srli_epi16(outlo, 2); ++ ++ out = _mm_or_si128(outle, _mm_slli_epi16(outlo, 8)); ++ VEC_ST(outptr, out); ++ ++ if (__builtin_expect(incol <= 8, 0)) break; ++ ++ this0h = _mm_mullo_epi16(_mm_unpackhi_epi8(this0, pb_zero), pw_three); ++ last0h = _mm_unpackhi_epi8(p_last0, pb_zero); ++ next0h = _mm_unpackhi_epi8(p_next0, pb_zero); ++ last0h = _mm_add_epi16(last0h, pw_one); ++ next0h = _mm_add_epi16(next0h, pw_two); ++ ++ outhe = _mm_add_epi16(this0h, last0h); ++ outho = _mm_add_epi16(this0h, next0h); ++ outhe = _mm_srli_epi16(outhe, 2); ++ outho = _mm_srli_epi16(outho, 2); ++ ++ out = _mm_or_si128(outhe, _mm_slli_epi16(outho, 8)); ++ VEC_ST(outptr + 16, out); ++ ++ this0 = next0; ++ } ++ } ++} ++ ++ ++void jsimd_h2v2_fancy_upsample_e2k(int max_v_samp_factor, ++ JDIMENSION downsampled_width, ++ JSAMPARRAY input_data, ++ JSAMPARRAY *output_data_ptr) ++{ ++ JSAMPARRAY output_data = *output_data_ptr; ++ JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1; ++ int inrow, outrow, incol; ++#ifdef NEED_ALIGN8 ++ ALIGN8_COMMON ++ ALIGN8_VARS(src_1) ++ ALIGN8_VARS(src0) ++ ALIGN8_VARS(src1) ++#endif ++ ++ __m128i pb_zero = _mm_setzero_si128(); ++ __m128i this_1, this0, this1, out; ++ __m128i this_1l, this_1h, this0l, this0h, this1l, this1h, ++ lastcolsum_1h, lastcolsum1h, ++ p_lastcolsum_1l, p_lastcolsum_1h, p_lastcolsum1l, p_lastcolsum1h, ++ thiscolsum_1l, thiscolsum_1h, thiscolsum1l, thiscolsum1h, ++ nextcolsum_1l = pb_zero, nextcolsum_1h = pb_zero, ++ 
nextcolsum1l = pb_zero, nextcolsum1h = pb_zero,
++ p_nextcolsum_1l, p_nextcolsum_1h, p_nextcolsum1l, p_nextcolsum1h,
++ tmpl, tmph, outle, outhe, outlo, outho;
++
++ /* Constants */
++ __m128i pw_three = _mm_set1_epi16(3),
++ pw_seven = _mm_set1_epi16(7), pw_eight = _mm_set1_epi16(8),
++ next_index_lastcol = _mm_setr_epi8(
++ 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14, 15);
++
++ if (downsampled_width > 0)
++ for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
++
++ inptr_1 = input_data[inrow - 1];
++ inptr0 = input_data[inrow];
++ inptr1 = input_data[inrow + 1];
++ outptr0 = output_data[outrow++];
++ outptr1 = output_data[outrow++];
++
++ if (downsampled_width & 15) {
++ inptr_1[downsampled_width] = inptr_1[downsampled_width - 1];
++ inptr0[downsampled_width] = inptr0[downsampled_width - 1];
++ inptr1[downsampled_width] = inptr1[downsampled_width - 1];
++ }
++
++#ifdef NEED_ALIGN8
++ ALIGN8_START(inptr0, src0)
++ ALIGN8_START(inptr_1, src_1)
++ ALIGN8_START(inptr1, src1)
++ ALIGN8_READ16(this0, src0, 0)
++ ALIGN8_READ16(this_1, src_1, 0)
++ ALIGN8_READ16(this1, src1, 0)
++#else
++ this0 = VEC_LD(inptr0);
++ this_1 = VEC_LD(inptr_1);
++ this1 = VEC_LD(inptr1);
++#endif
++
++ this0l = _mm_unpacklo_epi8(this0, pb_zero);
++ this0h = _mm_unpackhi_epi8(this0, pb_zero);
++ this0l = _mm_mullo_epi16(this0l, pw_three);
++ this0h = _mm_mullo_epi16(this0h, pw_three);
++
++ this_1l = _mm_unpacklo_epi8(this_1, pb_zero);
++ this_1h = _mm_unpackhi_epi8(this_1, pb_zero);
++ thiscolsum_1l = _mm_add_epi16(this0l, this_1l);
++ thiscolsum_1h = _mm_add_epi16(this0h, this_1h);
++ lastcolsum_1h = _mm_bslli_si128(thiscolsum_1l, 14);
++
++ this1l = _mm_unpacklo_epi8(this1, pb_zero);
++ this1h = _mm_unpackhi_epi8(this1, pb_zero);
++ thiscolsum1l = _mm_add_epi16(this0l, this1l);
++ thiscolsum1h = _mm_add_epi16(this0h, this1h);
++ lastcolsum1h = _mm_bslli_si128(thiscolsum1l, 14);
++
++ PRAGMA_E2K("ivdep")
++ for (incol = downsampled_width; incol > 0;
++ incol -= 16, outptr0 += 32, outptr1 += 32) {
++
++ p_lastcolsum_1l = _mm_alignr_epi8(thiscolsum_1l, lastcolsum_1h, 14);
++ p_lastcolsum_1h = _mm_alignr_epi8(thiscolsum_1h, thiscolsum_1l, 14);
++ p_lastcolsum1l = _mm_alignr_epi8(thiscolsum1l, lastcolsum1h, 14);
++ p_lastcolsum1h = _mm_alignr_epi8(thiscolsum1h, thiscolsum1l, 14);
++ lastcolsum_1h = thiscolsum_1h;
++ lastcolsum1h = thiscolsum1h;
++
++ if (__builtin_expect(incol <= 16, 0)) {
++ p_nextcolsum_1l = _mm_alignr_epi8(thiscolsum_1h, thiscolsum_1l, 2);
++ p_nextcolsum_1h = _mm_shuffle_epi8(thiscolsum_1h, next_index_lastcol);
++ p_nextcolsum1l = _mm_alignr_epi8(thiscolsum1h, thiscolsum1l, 2);
++ p_nextcolsum1h = _mm_shuffle_epi8(thiscolsum1h, next_index_lastcol);
++ } else {
++#ifdef NEED_ALIGN8
++ ALIGN8_READ16(this0, src0, 1) src0_ptr += 2;
++ ALIGN8_READ16(this_1, src_1, 1) src_1_ptr += 2;
++ ALIGN8_READ16(this1, src1, 1) src1_ptr += 2;
++#else
++ this0 = VEC_LD(inptr0 + 16); inptr0 += 16;
++ this_1 = VEC_LD(inptr_1 + 16); inptr_1 += 16;
++ this1 = VEC_LD(inptr1 + 16); inptr1 += 16;
++#endif
++ this0l = _mm_unpacklo_epi8(this0, pb_zero);
++ this0h = _mm_unpackhi_epi8(this0, pb_zero);
++ this0l = _mm_mullo_epi16(this0l, pw_three);
++ this0h = _mm_mullo_epi16(this0h, pw_three);
++
++ this_1l = _mm_unpacklo_epi8(this_1, pb_zero);
++ this_1h = _mm_unpackhi_epi8(this_1, pb_zero);
++ nextcolsum_1l = _mm_add_epi16(this0l, this_1l);
++ nextcolsum_1h = _mm_add_epi16(this0h, this_1h);
++ p_nextcolsum_1l = _mm_alignr_epi8(thiscolsum_1h, thiscolsum_1l, 2);
++ p_nextcolsum_1h = 
_mm_alignr_epi8(nextcolsum_1l, thiscolsum_1h, 2); ++ ++ this1l = _mm_unpacklo_epi8(this1, pb_zero); ++ this1h = _mm_unpackhi_epi8(this1, pb_zero); ++ nextcolsum1l = _mm_add_epi16(this0l, this1l); ++ nextcolsum1h = _mm_add_epi16(this0h, this1h); ++ p_nextcolsum1l = _mm_alignr_epi8(thiscolsum1h, thiscolsum1l, 2); ++ p_nextcolsum1h = _mm_alignr_epi8(nextcolsum1l, thiscolsum1h, 2); ++ } ++ ++ /* Process the upper row */ ++ ++ tmpl = _mm_mullo_epi16(thiscolsum_1l, pw_three); ++ outle = _mm_add_epi16(tmpl, p_lastcolsum_1l); ++ outle = _mm_add_epi16(outle, pw_eight); ++ outle = _mm_srli_epi16(outle, 4); ++ ++ outlo = _mm_add_epi16(tmpl, p_nextcolsum_1l); ++ outlo = _mm_add_epi16(outlo, pw_seven); ++ outlo = _mm_srli_epi16(outlo, 4); ++ ++ out = _mm_or_si128(outle, _mm_slli_epi16(outlo, 8)); ++ VEC_ST(outptr0, out); ++ ++ /* Process the lower row */ ++ ++ tmpl = _mm_mullo_epi16(thiscolsum1l, pw_three); ++ outle = _mm_add_epi16(tmpl, p_lastcolsum1l); ++ outle = _mm_add_epi16(outle, pw_eight); ++ outle = _mm_srli_epi16(outle, 4); ++ ++ outlo = _mm_add_epi16(tmpl, p_nextcolsum1l); ++ outlo = _mm_add_epi16(outlo, pw_seven); ++ outlo = _mm_srli_epi16(outlo, 4); ++ ++ out = _mm_or_si128(outle, _mm_slli_epi16(outlo, 8)); ++ VEC_ST(outptr1, out); ++ ++ if (__builtin_expect(incol <= 8, 0)) break; ++ ++ tmph = _mm_mullo_epi16(thiscolsum_1h, pw_three); ++ outhe = _mm_add_epi16(tmph, p_lastcolsum_1h); ++ outhe = _mm_add_epi16(outhe, pw_eight); ++ outhe = _mm_srli_epi16(outhe, 4); ++ ++ outho = _mm_add_epi16(tmph, p_nextcolsum_1h); ++ outho = _mm_add_epi16(outho, pw_seven); ++ outho = _mm_srli_epi16(outho, 4); ++ ++ out = _mm_or_si128(outhe, _mm_slli_epi16(outho, 8)); ++ VEC_ST(outptr0 + 16, out); ++ ++ tmph = _mm_mullo_epi16(thiscolsum1h, pw_three); ++ outhe = _mm_add_epi16(tmph, p_lastcolsum1h); ++ outhe = _mm_add_epi16(outhe, pw_eight); ++ outhe = _mm_srli_epi16(outhe, 4); ++ ++ outho = _mm_add_epi16(tmph, p_nextcolsum1h); ++ outho = _mm_add_epi16(outho, pw_seven); ++ outho = _mm_srli_epi16(outho, 4); ++ ++ out = _mm_or_si128(outhe, _mm_slli_epi16(outho, 8)); ++ VEC_ST(outptr1 + 16, out); ++ ++ thiscolsum_1l = nextcolsum_1l; thiscolsum_1h = nextcolsum_1h; ++ thiscolsum1l = nextcolsum1l; thiscolsum1h = nextcolsum1h; ++ } ++ } ++} ++ ++ ++/* These are rarely used (mainly just for decompressing YCCK images) */ ++ ++void jsimd_h2v1_upsample_e2k(int max_v_samp_factor, ++ JDIMENSION out_width, ++ JSAMPARRAY input_data, ++ JSAMPARRAY *output_data_ptr) ++{ ++ JSAMPARRAY output_data = *output_data_ptr; ++ JSAMPROW inptr, outptr; ++ int inrow, incol; ++ ++ __m128i in, inl, inh; ++#ifdef NEED_ALIGN8 ++ ALIGN8_COMMON ++ ALIGN8_VARS(src) ++#endif ++ ++ if (out_width > 0) ++ for (inrow = 0; inrow < max_v_samp_factor; inrow++) { ++ inptr = input_data[inrow]; ++ outptr = output_data[inrow]; ++#ifdef NEED_ALIGN8 ++ ALIGN8_START(inptr, src) ++#endif ++ PRAGMA_E2K("ivdep") ++ for (incol = out_width; incol > 0; ++ incol -= 32, outptr += 32) { ++#ifdef NEED_ALIGN8 ++ ALIGN8_READ16(in, src, 0) src_ptr += 2; ++#else ++ in = VEC_LD(inptr); inptr += 16; ++#endif ++ inl = _mm_unpacklo_epi8(in, in); ++ inh = _mm_unpackhi_epi8(in, in); ++ ++ VEC_ST(outptr, inl); ++ VEC_ST(outptr + 16, inh); ++ } ++ } ++} ++ ++ ++void jsimd_h2v2_upsample_e2k(int max_v_samp_factor, ++ JDIMENSION out_width, ++ JSAMPARRAY input_data, ++ JSAMPARRAY *output_data_ptr) ++{ ++ JSAMPARRAY output_data = *output_data_ptr; ++ JSAMPROW inptr, outptr0, outptr1; ++ int inrow, outrow, incol; ++ ++ __m128i in, inl, inh; ++#ifdef NEED_ALIGN8 ++ ALIGN8_COMMON ++ 
ALIGN8_VARS(src) ++#endif ++ ++ if (out_width > 0) ++ for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) { ++ inptr = input_data[inrow]; ++ outptr0 = output_data[outrow++]; ++ outptr1 = output_data[outrow++]; ++#ifdef NEED_ALIGN8 ++ ALIGN8_START(inptr, src) ++#endif ++ PRAGMA_E2K("ivdep") ++ for (incol = out_width; incol > 0; ++ incol -= 32, outptr0 += 32, outptr1 += 32) { ++#ifdef NEED_ALIGN8 ++ ALIGN8_READ16(in, src, 0) src_ptr += 2; ++#else ++ in = VEC_LD(inptr); inptr += 16; ++#endif ++ inl = _mm_unpacklo_epi8(in, in); ++ inh = _mm_unpackhi_epi8(in, in); ++ ++ VEC_ST(outptr0, inl); ++ VEC_ST(outptr1, inl); ++ VEC_ST(outptr0 + 16, inh); ++ VEC_ST(outptr1 + 16, inh); ++ } ++ } ++} +diff --git a/simd/e2k/jfdctflt-e2k.c b/simd/e2k/jfdctflt-e2k.c +new file mode 100644 +index 0000000..e3c4d94 +--- /dev/null ++++ b/simd/e2k/jfdctflt-e2k.c +@@ -0,0 +1,127 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2014, D. R. Commander. All Rights Reserved. ++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. ++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. 
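[The fancy upsamplers above implement the library's triangle filter: each output sample is 3/4 of the nearest input plus 1/4 of its other neighbour, with +1/+2 biases that alternate the rounding direction. Duplicating the edge samples (the inptr[downsampled_width] stores and the next_index_lastcol shuffle) makes the first and last columns degenerate to pass-through, so no scalar special cases are needed. A standalone scalar model of the h2v1 variant; h2v1_fancy() is illustrative, not a patch function:

#include <assert.h>

static void h2v1_fancy(const unsigned char *in, unsigned char *out, int w)
{
  int i;
  for (i = 0; i < w; i++) {
    int last = in[i > 0 ? i - 1 : 0];         /* replicated left edge */
    int next = in[i < w - 1 ? i + 1 : w - 1]; /* replicated right edge */
    out[2 * i]     = (unsigned char)((3 * in[i] + last + 1) >> 2);
    out[2 * i + 1] = (unsigned char)((3 * in[i] + next + 2) >> 2);
  }
}

int main(void)
{
  unsigned char in[4] = { 0, 64, 128, 255 }, out[8];
  h2v1_fancy(in, out, 4);
  /* With replicated edges the outermost outputs equal the inputs:
   * (4 * v + 1) >> 2 == v and (4 * v + 2) >> 2 == v. */
  assert(out[0] == in[0] && out[7] == in[3]);
  return 0;
}]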
++ */ ++ ++/* FLOAT FORWARD DCT */ ++ ++#include "jsimd_e2k.h" ++ ++#define DO_FDCT(in, out) { \ ++ tmp0 = _mm_add_ps(in##0, in##7); \ ++ tmp7 = _mm_sub_ps(in##0, in##7); \ ++ tmp1 = _mm_add_ps(in##1, in##6); \ ++ tmp6 = _mm_sub_ps(in##1, in##6); \ ++ tmp2 = _mm_add_ps(in##2, in##5); \ ++ tmp5 = _mm_sub_ps(in##2, in##5); \ ++ tmp3 = _mm_add_ps(in##3, in##4); \ ++ tmp4 = _mm_sub_ps(in##3, in##4); \ ++ \ ++ /* Even part */ \ ++ \ ++ tmp10 = _mm_add_ps(tmp0, tmp3); \ ++ tmp13 = _mm_sub_ps(tmp0, tmp3); \ ++ tmp11 = _mm_add_ps(tmp1, tmp2); \ ++ tmp12 = _mm_sub_ps(tmp1, tmp2); \ ++ \ ++ out##0 = _mm_add_ps(tmp10, tmp11); \ ++ out##4 = _mm_sub_ps(tmp10, tmp11); \ ++ \ ++ z1 = _mm_mul_ps(_mm_add_ps(tmp12, tmp13), pd_f0707); \ ++ out##2 = _mm_add_ps(tmp13, z1); \ ++ out##6 = _mm_sub_ps(tmp13, z1); \ ++ \ ++ /* Odd part */ \ ++ \ ++ tmp10 = _mm_add_ps(tmp4, tmp5); \ ++ tmp11 = _mm_add_ps(tmp5, tmp6); \ ++ tmp12 = _mm_add_ps(tmp6, tmp7); \ ++ \ ++ z5 = _mm_mul_ps(_mm_sub_ps(tmp10, tmp12), pd_f0382); \ ++ z2 = _mm_add_ps(_mm_mul_ps(tmp10, pd_f0541), z5); \ ++ z4 = _mm_add_ps(_mm_mul_ps(tmp12, pd_f1306), z5); \ ++ z3 = _mm_mul_ps(tmp11, pd_f0707); \ ++ \ ++ z11 = _mm_add_ps(tmp7, z3); \ ++ z13 = _mm_sub_ps(tmp7, z3); \ ++ \ ++ out##5 = _mm_add_ps(z13, z2); \ ++ out##3 = _mm_sub_ps(z13, z2); \ ++ out##1 = _mm_add_ps(z11, z4); \ ++ out##7 = _mm_sub_ps(z11, z4); \ ++} ++ ++#define LOAD_DATA(a, b, c, d, l, i) \ ++ l##a = _mm_loadu_ps(data + a * 8 + i); \ ++ l##b = _mm_loadu_ps(data + b * 8 + i); \ ++ l##c = _mm_loadu_ps(data + c * 8 + i); \ ++ l##d = _mm_loadu_ps(data + d * 8 + i); ++ ++#define STORE_DATA(a, b, c, d, l, i) \ ++ _mm_storeu_ps(data + a * 8 + i, l##a); \ ++ _mm_storeu_ps(data + b * 8 + i, l##b); \ ++ _mm_storeu_ps(data + c * 8 + i, l##c); \ ++ _mm_storeu_ps(data + d * 8 + i, l##d); ++ ++ ++void jsimd_fdct_float_e2k(FAST_FLOAT *data) ++{ ++ __m128 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, ++ tmp10, tmp11, tmp12, tmp13, z1, z2, z3, z4, z5, z11, z13; ++ __m128 l0, l1, l2, l3, l4, l5, l6, l7; ++ __m128 h0, h1, h2, h3, h4, h5, h6, h7; ++ __m128 x0, x1, x2, x3, x4, x5, x6, x7; ++ __m128 y0, y1, y2, y3, y4, y5, y6, y7; ++ ++ /* Constants */ ++ __m128 pd_f0382 = _mm_set1_ps(0.382683433f), ++ pd_f0541 = _mm_set1_ps(0.541196100f), ++ pd_f0707 = _mm_set1_ps(0.707106781f), ++ pd_f1306 = _mm_set1_ps(1.306562965f); ++ ++ /* Pass 1: process columns */ ++ ++ LOAD_DATA(0, 1, 2, 3, x, 0) ++ LOAD_DATA(0, 1, 2, 3, y, 4) ++ TRANSPOSE_FLOAT(x0, x1, x2, x3, l0, l1, l2, l3) ++ TRANSPOSE_FLOAT(y0, y1, y2, y3, l4, l5, l6, l7) ++ DO_FDCT(l, l); ++ ++ LOAD_DATA(4, 5, 6, 7, x, 0) ++ LOAD_DATA(4, 5, 6, 7, y, 4) ++ TRANSPOSE_FLOAT(x4, x5, x6, x7, h0, h1, h2, h3) ++ TRANSPOSE_FLOAT(y4, y5, y6, y7, h4, h5, h6, h7) ++ DO_FDCT(h, h); ++ ++ /* Pass 2: process rows */ ++ ++ TRANSPOSE_FLOAT(l0, l1, l2, l3, x0, x1, x2, x3) ++ TRANSPOSE_FLOAT(h0, h1, h2, h3, x4, x5, x6, x7) ++ DO_FDCT(x, x); ++ STORE_DATA(0, 1, 2, 3, x, 0) ++ STORE_DATA(4, 5, 6, 7, x, 0) ++ ++ TRANSPOSE_FLOAT(l4, l5, l6, l7, y0, y1, y2, y3) ++ TRANSPOSE_FLOAT(h4, h5, h6, h7, y4, y5, y6, y7) ++ DO_FDCT(y, y); ++ STORE_DATA(0, 1, 2, 3, y, 4) ++ STORE_DATA(4, 5, 6, 7, y, 4) ++} +diff --git a/simd/e2k/jfdctfst-e2k.c b/simd/e2k/jfdctfst-e2k.c +new file mode 100644 +index 0000000..9e58f05 +--- /dev/null ++++ b/simd/e2k/jfdctfst-e2k.c +@@ -0,0 +1,145 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2014, D. R. Commander. All Rights Reserved. 
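[jsimd_fdct_float_e2k above never transposes the 8x8 block in one step: TRANSPOSE_FLOAT handles 4x4 tiles of __m128 lanes, and the full transpose is four tile transposes with the two off-diagonal tiles swapped. A standalone scalar model of that tiling; tile_transpose() is illustrative only:

#include <assert.h>

/* Move one 4x4 tile of s, transposed, into d.  (dr, dc) is the
 * destination tile origin, (sr, sc) the source tile origin. */
static void tile_transpose(float d[8][8], float s[8][8],
                           int dr, int dc, int sr, int sc)
{
  int i, j;
  for (i = 0; i < 4; i++)
    for (j = 0; j < 4; j++)
      d[dr + i][dc + j] = s[sr + j][sc + i];
}

int main(void)
{
  float a[8][8], t[8][8];
  int i, j;
  for (i = 0; i < 8; i++)
    for (j = 0; j < 8; j++)
      a[i][j] = (float)(i * 8 + j);
  tile_transpose(t, a, 0, 0, 0, 0);  /* diagonal tile, in place */
  tile_transpose(t, a, 0, 4, 4, 0);  /* off-diagonal tiles swap... */
  tile_transpose(t, a, 4, 0, 0, 4);  /* ...with each other */
  tile_transpose(t, a, 4, 4, 4, 4);  /* diagonal tile, in place */
  for (i = 0; i < 8; i++)
    for (j = 0; j < 8; j++)
      assert(t[i][j] == a[j][i]);    /* full 8x8 transpose achieved */
  return 0;
}]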
++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. ++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. ++ */ ++ ++/* FAST INTEGER FORWARD DCT */ ++ ++#include "jsimd_e2k.h" ++ ++ ++#define F_0_382 98 /* FIX(0.382683433) */ ++#define F_0_541 139 /* FIX(0.541196100) */ ++#define F_0_707 181 /* FIX(0.707106781) */ ++#define F_1_306 334 /* FIX(1.306562965) */ ++ ++#define CONST_BITS 8 ++#define PRE_MULTIPLY_SCALE_BITS 2 ++#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) ++ ++ ++#define DO_FDCT() { \ ++ /* Even part */ \ ++ \ ++ tmp10 = _mm_add_epi16(tmp0, tmp3); \ ++ tmp13 = _mm_sub_epi16(tmp0, tmp3); \ ++ tmp11 = _mm_add_epi16(tmp1, tmp2); \ ++ tmp12 = _mm_sub_epi16(tmp1, tmp2); \ ++ \ ++ out0 = _mm_add_epi16(tmp10, tmp11); \ ++ out4 = _mm_sub_epi16(tmp10, tmp11); \ ++ \ ++ z1 = _mm_add_epi16(tmp12, tmp13); \ ++ z1 = _mm_slli_epi16(z1, PRE_MULTIPLY_SCALE_BITS); \ ++ z1 = _mm_mulhi_epi16(z1, pw_0707); \ ++ \ ++ out2 = _mm_add_epi16(tmp13, z1); \ ++ out6 = _mm_sub_epi16(tmp13, z1); \ ++ \ ++ /* Odd part */ \ ++ \ ++ tmp10 = _mm_add_epi16(tmp4, tmp5); \ ++ tmp11 = _mm_add_epi16(tmp5, tmp6); \ ++ tmp12 = _mm_add_epi16(tmp6, tmp7); \ ++ \ ++ tmp10 = _mm_slli_epi16(tmp10, PRE_MULTIPLY_SCALE_BITS); \ ++ tmp12 = _mm_slli_epi16(tmp12, PRE_MULTIPLY_SCALE_BITS); \ ++ z5 = _mm_sub_epi16(tmp10, tmp12); \ ++ z5 = _mm_mulhi_epi16(z5, pw_0382); \ ++ \ ++ z2 = _mm_add_epi16(_mm_mulhi_epi16(tmp10, pw_0541), z5); \ ++ z4 = _mm_add_epi16(_mm_mulhi_epi16(tmp12, pw_1306), z5); \ ++ \ ++ tmp11 = _mm_slli_epi16(tmp11, PRE_MULTIPLY_SCALE_BITS); \ ++ z3 = _mm_mulhi_epi16(tmp11, pw_0707); \ ++ \ ++ z11 = _mm_add_epi16(tmp7, z3); \ ++ z13 = _mm_sub_epi16(tmp7, z3); \ ++ \ ++ out5 = _mm_add_epi16(z13, z2); \ ++ out3 = _mm_sub_epi16(z13, z2); \ ++ out1 = _mm_add_epi16(z11, z4); \ ++ out7 = _mm_sub_epi16(z11, z4); \ ++} ++ ++ ++void jsimd_fdct_ifast_e2k(DCTELEM *data) ++{ ++ __m128i row0, row1, row2, row3, row4, row5, row6, row7, ++ col0, col1, col2, col3, col4, col5, col6, col7, ++ tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13, ++ z1, z2, z3, z4, z5, z11, z13, ++ out0, out1, out2, out3, out4, out5, out6, out7; ++ ++ /* Constants */ ++ __m128i pw_0382 = _mm_set1_epi16(F_0_382 << CONST_SHIFT), ++ pw_0541 = _mm_set1_epi16(F_0_541 << CONST_SHIFT), ++ pw_0707 = _mm_set1_epi16(F_0_707 << CONST_SHIFT), ++ pw_1306 = _mm_set1_epi16(F_1_306 << CONST_SHIFT); ++ ++ /* Pass 1: process rows */ ++ ++ row0 = VEC_LD(data + 0 * 8); ++ row1 = VEC_LD(data + 1 * 8); ++ row2 = VEC_LD(data + 2 * 8); ++ row3 = VEC_LD(data + 3 * 8); ++ row4 = VEC_LD(data + 4 * 8); ++ row5 = VEC_LD(data + 5 * 8); ++ row6 = VEC_LD(data + 6 * 8); ++ row7 = VEC_LD(data + 7 * 8); ++ ++ TRANSPOSE(row, 
col); ++ ++ tmp0 = _mm_add_epi16(col0, col7); ++ tmp7 = _mm_sub_epi16(col0, col7); ++ tmp1 = _mm_add_epi16(col1, col6); ++ tmp6 = _mm_sub_epi16(col1, col6); ++ tmp2 = _mm_add_epi16(col2, col5); ++ tmp5 = _mm_sub_epi16(col2, col5); ++ tmp3 = _mm_add_epi16(col3, col4); ++ tmp4 = _mm_sub_epi16(col3, col4); ++ ++ DO_FDCT(); ++ ++ /* Pass 2: process columns */ ++ ++ TRANSPOSE(out, row); ++ ++ tmp0 = _mm_add_epi16(row0, row7); ++ tmp7 = _mm_sub_epi16(row0, row7); ++ tmp1 = _mm_add_epi16(row1, row6); ++ tmp6 = _mm_sub_epi16(row1, row6); ++ tmp2 = _mm_add_epi16(row2, row5); ++ tmp5 = _mm_sub_epi16(row2, row5); ++ tmp3 = _mm_add_epi16(row3, row4); ++ tmp4 = _mm_sub_epi16(row3, row4); ++ ++ DO_FDCT(); ++ ++ VEC_ST(data + 0 * 8, out0); ++ VEC_ST(data + 1 * 8, out1); ++ VEC_ST(data + 2 * 8, out2); ++ VEC_ST(data + 3 * 8, out3); ++ VEC_ST(data + 4 * 8, out4); ++ VEC_ST(data + 5 * 8, out5); ++ VEC_ST(data + 6 * 8, out6); ++ VEC_ST(data + 7 * 8, out7); ++} +diff --git a/simd/e2k/jfdctint-e2k.c b/simd/e2k/jfdctint-e2k.c +new file mode 100644 +index 0000000..2200852 +--- /dev/null ++++ b/simd/e2k/jfdctint-e2k.c +@@ -0,0 +1,255 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2014, 2020, D. R. Commander. All Rights Reserved. ++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. ++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. 
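[How the ifast FDCT above multiplies by fractional constants: samples are pre-scaled by PRE_MULTIPLY_SCALE_BITS = 2, the 8-bit fixed-point constants are stored shifted left by CONST_SHIFT = 6, and _mm_mulhi_epi16 keeps the top 16 bits of the 32-bit product, so the net result is (v * F) >> 8. A standalone scalar check; it assumes an arithmetic right shift on negative values, as the hardware performs:

#include <assert.h>

int main(void)
{
  const int F_0_707 = 181;        /* FIX(0.707106781) at CONST_BITS = 8 */
  int v;
  for (v = -1024; v <= 1024; v++) {
    int a = v * 4;                /* << PRE_MULTIPLY_SCALE_BITS */
    int b = F_0_707 << 6;         /* << CONST_SHIFT */
    int mulhi = (a * b) >> 16;    /* one lane of _mm_mulhi_epi16 */
    assert(mulhi == (v * F_0_707) >> 8);
  }
  return 0;
}]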
++ */ ++ ++/* ACCURATE INTEGER FORWARD DCT */ ++ ++#include "jsimd_e2k.h" ++ ++ ++#define F_0_298 2446 /* FIX(0.298631336) */ ++#define F_0_390 3196 /* FIX(0.390180644) */ ++#define F_0_541 4433 /* FIX(0.541196100) */ ++#define F_0_765 6270 /* FIX(0.765366865) */ ++#define F_0_899 7373 /* FIX(0.899976223) */ ++#define F_1_175 9633 /* FIX(1.175875602) */ ++#define F_1_501 12299 /* FIX(1.501321110) */ ++#define F_1_847 15137 /* FIX(1.847759065) */ ++#define F_1_961 16069 /* FIX(1.961570560) */ ++#define F_2_053 16819 /* FIX(2.053119869) */ ++#define F_2_562 20995 /* FIX(2.562915447) */ ++#define F_3_072 25172 /* FIX(3.072711026) */ ++ ++#define CONST_BITS 13 ++#define PASS1_BITS 2 ++#define DESCALE_P1 (CONST_BITS - PASS1_BITS) ++#define DESCALE_P2 (CONST_BITS + PASS1_BITS) ++ ++ ++#define DO_FDCT_COMMON(PASS) { \ ++ /* (Original) \ ++ * z1 = (tmp12 + tmp13) * 0.541196100; \ ++ * data2 = z1 + tmp13 * 0.765366865; \ ++ * data6 = z1 + tmp12 * -1.847759065; \ ++ * \ ++ * (This implementation) \ ++ * data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; \ ++ * data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); \ ++ */ \ ++ \ ++ tmp1312l = _mm_unpacklo_epi16(tmp13, tmp12); \ ++ tmp1312h = _mm_unpackhi_epi16(tmp13, tmp12); \ ++ \ ++ out2l = _mm_add_epi32(_mm_madd_epi16(tmp1312l, pw_f130_f054), pd_descale_p##PASS); \ ++ out2h = _mm_add_epi32(_mm_madd_epi16(tmp1312h, pw_f130_f054), pd_descale_p##PASS); \ ++ out6l = _mm_add_epi32(_mm_madd_epi16(tmp1312l, pw_f054_mf130), pd_descale_p##PASS); \ ++ out6h = _mm_add_epi32(_mm_madd_epi16(tmp1312h, pw_f054_mf130), pd_descale_p##PASS); \ ++ \ ++ out2l = _mm_srai_epi32(out2l, DESCALE_P##PASS); \ ++ out2h = _mm_srai_epi32(out2h, DESCALE_P##PASS); \ ++ out6l = _mm_srai_epi32(out6l, DESCALE_P##PASS); \ ++ out6h = _mm_srai_epi32(out6h, DESCALE_P##PASS); \ ++ \ ++ out2 = _mm_packs_epi32(out2l, out2h); \ ++ out6 = _mm_packs_epi32(out6l, out6h); \ ++ \ ++ /* Odd part */ \ ++ \ ++ z3 = _mm_add_epi16(tmp4, tmp6); \ ++ z4 = _mm_add_epi16(tmp5, tmp7); \ ++ \ ++ /* (Original) \ ++ * z5 = (z3 + z4) * 1.175875602; \ ++ * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; \ ++ * z3 += z5; z4 += z5; \ ++ * \ ++ * (This implementation) \ ++ * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \ ++ * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \ ++ */ \ ++ \ ++ z34l = _mm_unpacklo_epi16(z3, z4); \ ++ z34h = _mm_unpackhi_epi16(z3, z4); \ ++ \ ++ z3l = _mm_add_epi32(_mm_madd_epi16(z34l, pw_mf078_f117), pd_descale_p##PASS); \ ++ z3h = _mm_add_epi32(_mm_madd_epi16(z34h, pw_mf078_f117), pd_descale_p##PASS); \ ++ z4l = _mm_add_epi32(_mm_madd_epi16(z34l, pw_f117_f078), pd_descale_p##PASS); \ ++ z4h = _mm_add_epi32(_mm_madd_epi16(z34h, pw_f117_f078), pd_descale_p##PASS); \ ++ \ ++ /* (Original) \ ++ * z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; \ ++ * tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; \ ++ * tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; \ ++ * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; \ ++ * data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; \ ++ * data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; \ ++ * \ ++ * (This implementation) \ ++ * tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; \ ++ * tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; \ ++ * tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); \ ++ * tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); \ ++ * data7 = tmp4 + z3; data5 = tmp5 + z4; \ ++ * data3 = tmp6 + z3; data1 = tmp7 + z4; \ ++ */ \ ++ \ 
++ tmp47l = _mm_unpacklo_epi16(tmp4, tmp7); \ ++ tmp47h = _mm_unpackhi_epi16(tmp4, tmp7); \ ++ \ ++ out7l = _mm_add_epi32(_mm_madd_epi16(tmp47l, pw_mf060_mf089), z3l); \ ++ out7h = _mm_add_epi32(_mm_madd_epi16(tmp47h, pw_mf060_mf089), z3h); \ ++ out1l = _mm_add_epi32(_mm_madd_epi16(tmp47l, pw_mf089_f060), z4l); \ ++ out1h = _mm_add_epi32(_mm_madd_epi16(tmp47h, pw_mf089_f060), z4h); \ ++ \ ++ out7l = _mm_srai_epi32(out7l, DESCALE_P##PASS); \ ++ out7h = _mm_srai_epi32(out7h, DESCALE_P##PASS); \ ++ out1l = _mm_srai_epi32(out1l, DESCALE_P##PASS); \ ++ out1h = _mm_srai_epi32(out1h, DESCALE_P##PASS); \ ++ \ ++ out7 = _mm_packs_epi32(out7l, out7h); \ ++ out1 = _mm_packs_epi32(out1l, out1h); \ ++ \ ++ tmp56l = _mm_unpacklo_epi16(tmp5, tmp6); \ ++ tmp56h = _mm_unpackhi_epi16(tmp5, tmp6); \ ++ \ ++ out5l = _mm_add_epi32(_mm_madd_epi16(tmp56l, pw_mf050_mf256), z4l); \ ++ out5h = _mm_add_epi32(_mm_madd_epi16(tmp56h, pw_mf050_mf256), z4h); \ ++ out3l = _mm_add_epi32(_mm_madd_epi16(tmp56l, pw_mf256_f050), z3l); \ ++ out3h = _mm_add_epi32(_mm_madd_epi16(tmp56h, pw_mf256_f050), z3h); \ ++ \ ++ out5l = _mm_srai_epi32(out5l, DESCALE_P##PASS); \ ++ out5h = _mm_srai_epi32(out5h, DESCALE_P##PASS); \ ++ out3l = _mm_srai_epi32(out3l, DESCALE_P##PASS); \ ++ out3h = _mm_srai_epi32(out3h, DESCALE_P##PASS); \ ++ \ ++ out5 = _mm_packs_epi32(out5l, out5h); \ ++ out3 = _mm_packs_epi32(out3l, out3h); \ ++} ++ ++#define DO_FDCT_PASS1() { \ ++ /* Even part */ \ ++ \ ++ tmp10 = _mm_add_epi16(tmp0, tmp3); \ ++ tmp13 = _mm_sub_epi16(tmp0, tmp3); \ ++ tmp11 = _mm_add_epi16(tmp1, tmp2); \ ++ tmp12 = _mm_sub_epi16(tmp1, tmp2); \ ++ \ ++ out0 = _mm_add_epi16(tmp10, tmp11); \ ++ out0 = _mm_slli_epi16(out0, PASS1_BITS); \ ++ out4 = _mm_sub_epi16(tmp10, tmp11); \ ++ out4 = _mm_slli_epi16(out4, PASS1_BITS); \ ++ \ ++ DO_FDCT_COMMON(1); \ ++} ++ ++#define DO_FDCT_PASS2() { \ ++ /* Even part */ \ ++ \ ++ tmp10 = _mm_add_epi16(tmp0, tmp3); \ ++ tmp13 = _mm_sub_epi16(tmp0, tmp3); \ ++ tmp11 = _mm_add_epi16(tmp1, tmp2); \ ++ tmp12 = _mm_sub_epi16(tmp1, tmp2); \ ++ \ ++ out0 = _mm_add_epi16(tmp10, tmp11); \ ++ out0 = _mm_add_epi16(out0, pw_descale_p2x); \ ++ out0 = _mm_srai_epi16(out0, PASS1_BITS); \ ++ out4 = _mm_sub_epi16(tmp10, tmp11); \ ++ out4 = _mm_add_epi16(out4, pw_descale_p2x); \ ++ out4 = _mm_srai_epi16(out4, PASS1_BITS); \ ++ \ ++ DO_FDCT_COMMON(2); \ ++} ++ ++ ++void jsimd_fdct_islow_e2k(DCTELEM *data) ++{ ++ __m128i row0, row1, row2, row3, row4, row5, row6, row7, ++ col0, col1, col2, col3, col4, col5, col6, col7, ++ tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13, ++ tmp47l, tmp47h, tmp56l, tmp56h, tmp1312l, tmp1312h, ++ z3, z4, z34l, z34h, ++ out0, out1, out2, out3, out4, out5, out6, out7; ++ __m128i z3l, z3h, z4l, z4h, ++ out1l, out1h, out2l, out2h, out3l, out3h, out5l, out5h, out6l, out6h, ++ out7l, out7h; ++ ++ /* Constants */ ++ __m128i pw_f130_f054 = _mm_setr_epi16(__4X2(F_0_541 + F_0_765, F_0_541)), ++ pw_f054_mf130 = _mm_setr_epi16(__4X2(F_0_541, F_0_541 - F_1_847)), ++ pw_mf078_f117 = _mm_setr_epi16(__4X2(F_1_175 - F_1_961, F_1_175)), ++ pw_f117_f078 = _mm_setr_epi16(__4X2(F_1_175, F_1_175 - F_0_390)), ++ pw_mf060_mf089 = _mm_setr_epi16(__4X2(F_0_298 - F_0_899, -F_0_899)), ++ pw_mf089_f060 = _mm_setr_epi16(__4X2(-F_0_899, F_1_501 - F_0_899)), ++ pw_mf050_mf256 = _mm_setr_epi16(__4X2(F_2_053 - F_2_562, -F_2_562)), ++ pw_mf256_f050 = _mm_setr_epi16(__4X2(-F_2_562, F_3_072 - F_2_562)), ++ pw_descale_p2x = _mm_set1_epi16(1 << (PASS1_BITS - 1)), ++ pd_descale_p1 = _mm_set1_epi32(1 << (DESCALE_P1 
- 1)), ++ pd_descale_p2 = _mm_set1_epi32(1 << (DESCALE_P2 - 1)); ++ ++ /* Pass 1: process rows */ ++ ++ row0 = VEC_LD(data + 0 * 8); ++ row1 = VEC_LD(data + 1 * 8); ++ row2 = VEC_LD(data + 2 * 8); ++ row3 = VEC_LD(data + 3 * 8); ++ row4 = VEC_LD(data + 4 * 8); ++ row5 = VEC_LD(data + 5 * 8); ++ row6 = VEC_LD(data + 6 * 8); ++ row7 = VEC_LD(data + 7 * 8); ++ ++ TRANSPOSE(row, col); ++ ++ tmp0 = _mm_add_epi16(col0, col7); ++ tmp7 = _mm_sub_epi16(col0, col7); ++ tmp1 = _mm_add_epi16(col1, col6); ++ tmp6 = _mm_sub_epi16(col1, col6); ++ tmp2 = _mm_add_epi16(col2, col5); ++ tmp5 = _mm_sub_epi16(col2, col5); ++ tmp3 = _mm_add_epi16(col3, col4); ++ tmp4 = _mm_sub_epi16(col3, col4); ++ ++ DO_FDCT_PASS1(); ++ ++ /* Pass 2: process columns */ ++ ++ TRANSPOSE(out, row); ++ ++ tmp0 = _mm_add_epi16(row0, row7); ++ tmp7 = _mm_sub_epi16(row0, row7); ++ tmp1 = _mm_add_epi16(row1, row6); ++ tmp6 = _mm_sub_epi16(row1, row6); ++ tmp2 = _mm_add_epi16(row2, row5); ++ tmp5 = _mm_sub_epi16(row2, row5); ++ tmp3 = _mm_add_epi16(row3, row4); ++ tmp4 = _mm_sub_epi16(row3, row4); ++ ++ DO_FDCT_PASS2(); ++ ++ VEC_ST(data + 0 * 8, out0); ++ VEC_ST(data + 1 * 8, out1); ++ VEC_ST(data + 2 * 8, out2); ++ VEC_ST(data + 3 * 8, out3); ++ VEC_ST(data + 4 * 8, out4); ++ VEC_ST(data + 5 * 8, out5); ++ VEC_ST(data + 6 * 8, out6); ++ VEC_ST(data + 7 * 8, out7); ++} +diff --git a/simd/e2k/jidctflt-e2k.c b/simd/e2k/jidctflt-e2k.c +new file mode 100644 +index 0000000..7682965 +--- /dev/null ++++ b/simd/e2k/jidctflt-e2k.c +@@ -0,0 +1,215 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved. ++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. ++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. 
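[The __4X2 constant pairs above (pw_f130_f054 and friends) exist so that one _mm_madd_epi16 over interleaved 16-bit inputs yields a*c0 + b*c1 per 32-bit lane, which is why the comments rewrite each pair of products into that form. A standalone scalar check that the rearranged data2/data6 expressions match the original factorization, using the FIX() constants defined above:

#include <assert.h>

int main(void)
{
  /* FIX() constants exactly as defined in jfdctint-e2k.c above */
  const int F_0_541 = 4433, F_0_765 = 6270, F_1_847 = 15137;
  int tmp12, tmp13 = 1234;
  for (tmp12 = -2048; tmp12 <= 2048; tmp12++) {
    /* Original factorization */
    int z1 = (tmp12 + tmp13) * F_0_541;
    int data2_ref = z1 + tmp13 * F_0_765;
    int data6_ref = z1 - tmp12 * F_1_847;
    /* madd form: each line is one a*c0 + b*c1 pair per lane */
    int data2 = tmp13 * (F_0_541 + F_0_765) + tmp12 * F_0_541;
    int data6 = tmp13 * F_0_541 + tmp12 * (F_0_541 - F_1_847);
    assert(data2 == data2_ref && data6 == data6_ref);
  }
  return 0;
}]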
++ */ ++ ++/* FLOAT INVERSE DCT */ ++ ++#include "jsimd_e2k.h" ++ ++#define DO_IDCT(in, out) { \ ++ /* Even part */ \ ++ \ ++ tmp10 = _mm_add_ps(in##0, in##4); \ ++ tmp11 = _mm_sub_ps(in##0, in##4); \ ++ \ ++ tmp13 = _mm_add_ps(in##2, in##6); \ ++ tmp12 = _mm_sub_ps(in##2, in##6); \ ++ tmp12 = _mm_sub_ps(_mm_mul_ps(tmp12, pd_f1414), tmp13); \ ++ \ ++ tmp0 = _mm_add_ps(tmp10, tmp13); \ ++ tmp3 = _mm_sub_ps(tmp10, tmp13); \ ++ tmp1 = _mm_add_ps(tmp11, tmp12); \ ++ tmp2 = _mm_sub_ps(tmp11, tmp12); \ ++ \ ++ /* Odd part */ \ ++ \ ++ z13 = _mm_add_ps(in##5, in##3); \ ++ z10 = _mm_sub_ps(in##5, in##3); \ ++ z11 = _mm_add_ps(in##1, in##7); \ ++ z12 = _mm_sub_ps(in##1, in##7); \ ++ \ ++ tmp7 = _mm_add_ps(z11, z13); \ ++ tmp11 = _mm_sub_ps(z11, z13); \ ++ tmp11 = _mm_mul_ps(tmp11, pd_f1414); \ ++ \ ++ z5 = _mm_mul_ps(_mm_add_ps(z10, z12), pd_f1847); \ ++ tmp10 = _mm_sub_ps(z5, _mm_mul_ps(z12, pd_f1082)); \ ++ tmp12 = _mm_sub_ps(z5, _mm_mul_ps(z10, pd_f2613)); \ ++ \ ++ tmp6 = _mm_sub_ps(tmp12, tmp7); \ ++ tmp5 = _mm_sub_ps(tmp11, tmp6); \ ++ tmp4 = _mm_sub_ps(tmp10, tmp5); \ ++ \ ++ out##0 = _mm_add_ps(tmp0, tmp7); \ ++ out##7 = _mm_sub_ps(tmp0, tmp7); \ ++ out##1 = _mm_add_ps(tmp1, tmp6); \ ++ out##6 = _mm_sub_ps(tmp1, tmp6); \ ++ out##2 = _mm_add_ps(tmp2, tmp5); \ ++ out##5 = _mm_sub_ps(tmp2, tmp5); \ ++ out##3 = _mm_add_ps(tmp3, tmp4); \ ++ out##4 = _mm_sub_ps(tmp3, tmp4); \ ++} ++ ++#define QUANT_MUL(a, b, c, d, l, lo, i) \ ++ out0 = _mm_srai_epi32(_mm_unpack##lo##_epi16(col##a, col##a), 16); \ ++ out1 = _mm_srai_epi32(_mm_unpack##lo##_epi16(col##b, col##b), 16); \ ++ out2 = _mm_srai_epi32(_mm_unpack##lo##_epi16(col##c, col##c), 16); \ ++ out3 = _mm_srai_epi32(_mm_unpack##lo##_epi16(col##d, col##d), 16); \ ++ l##a = _mm_cvtepi32_ps(out0); \ ++ l##b = _mm_cvtepi32_ps(out1); \ ++ l##c = _mm_cvtepi32_ps(out2); \ ++ l##d = _mm_cvtepi32_ps(out3); \ ++ l##a = _mm_mul_ps(l##a, _mm_load_ps(dct_table + a * 8 + i)); \ ++ l##b = _mm_mul_ps(l##b, _mm_load_ps(dct_table + b * 8 + i)); \ ++ l##c = _mm_mul_ps(l##c, _mm_load_ps(dct_table + c * 8 + i)); \ ++ l##d = _mm_mul_ps(l##d, _mm_load_ps(dct_table + d * 8 + i)); ++ ++ ++void jsimd_idct_float_e2k(void *dct_table_, JCOEFPTR coef_block, ++ JSAMPARRAY output_buf, JDIMENSION output_col) ++{ ++ float *dct_table = (float *)dct_table_; ++ ++ __m128i col0, col1, col2, col3, col4, col5, col6, col7, ++ out0, out1, out2, out3, out4, out5, out6, out7, row0, row1, row2, row3; ++ __m128 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, ++ tmp10, tmp11, tmp12, tmp13, z5, z10, z11, z12, z13; ++ __m128 l0, l1, l2, l3, l4, l5, l6, l7; ++ __m128 h0, h1, h2, h3, h4, h5, h6, h7; ++ __m128 x0, x1, x2, x3, x4, x5, x6, x7; ++ __m128 y0, y1, y2, y3, y4, y5, y6, y7; ++ ++ /* Constants */ ++ __m128 pd_f1414 = _mm_set1_ps(1.414213562f), ++ pd_f1847 = _mm_set1_ps(1.847759065f), ++ pd_f1082 = _mm_set1_ps(1.082392200f), ++ pd_f2613 = _mm_set1_ps(2.613125930f); ++ ++ /* Pass 1: process columns */ ++ ++ col0 = VEC_LD(coef_block + 0 * 8); ++ col1 = VEC_LD(coef_block + 1 * 8); ++ col2 = VEC_LD(coef_block + 2 * 8); ++ col3 = VEC_LD(coef_block + 3 * 8); ++ col4 = VEC_LD(coef_block + 4 * 8); ++ col5 = VEC_LD(coef_block + 5 * 8); ++ col6 = VEC_LD(coef_block + 6 * 8); ++ col7 = VEC_LD(coef_block + 7 * 8); ++ ++ out1 = _mm_or_si128(col1, col2); ++ out2 = _mm_or_si128(col3, col4); ++ out1 = _mm_or_si128(out1, out2); ++ out3 = _mm_or_si128(col5, col6); ++ out3 = _mm_or_si128(out3, col7); ++ out1 = _mm_or_si128(out1, out3); ++ ++ if (VEC_ISZERO(out1)) { ++ /* AC terms all zero */ ++ ++ out0 = 
_mm_srai_epi32(_mm_unpacklo_epi16(col0, col0), 16);
++ out1 = _mm_srai_epi32(_mm_unpackhi_epi16(col0, col0), 16);
++ tmp0 = _mm_cvtepi32_ps(out0);
++ tmp1 = _mm_cvtepi32_ps(out1);
++ tmp0 = _mm_mul_ps(tmp0, _mm_load_ps(dct_table));
++ tmp1 = _mm_mul_ps(tmp1, _mm_load_ps(dct_table + 4));
++
++ l0 = h0 = _mm_shuffle_ps(tmp0, tmp0, 0x00);
++ l1 = h1 = _mm_shuffle_ps(tmp0, tmp0, 0x55);
++ l2 = h2 = _mm_shuffle_ps(tmp0, tmp0, 0xaa);
++ l3 = h3 = _mm_shuffle_ps(tmp0, tmp0, 0xff);
++ l4 = h4 = _mm_shuffle_ps(tmp1, tmp1, 0x00);
++ l5 = h5 = _mm_shuffle_ps(tmp1, tmp1, 0x55);
++ l6 = h6 = _mm_shuffle_ps(tmp1, tmp1, 0xaa);
++ l7 = h7 = _mm_shuffle_ps(tmp1, tmp1, 0xff);
++
++ } else {
++
++ QUANT_MUL(0, 2, 4, 6, l, lo, 0)
++ QUANT_MUL(1, 3, 5, 7, l, lo, 0)
++ DO_IDCT(l, x);
++
++ QUANT_MUL(0, 2, 4, 6, h, hi, 4)
++ QUANT_MUL(1, 3, 5, 7, h, hi, 4)
++ DO_IDCT(h, y);
++
++ TRANSPOSE_FLOAT(x0, x1, x2, x3, l0, l1, l2, l3)
++ TRANSPOSE_FLOAT(x4, x5, x6, x7, h0, h1, h2, h3)
++ TRANSPOSE_FLOAT(y0, y1, y2, y3, l4, l5, l6, l7)
++ TRANSPOSE_FLOAT(y4, y5, y6, y7, h4, h5, h6, h7)
++ }
++
++ /* Pass 2: process rows */
++
++ DO_IDCT(l, x);
++ DO_IDCT(h, y);
++
++#ifdef JSIMD_SAME_ROUNDING
++#define OUT_ROUND(i) \
++ tmp0 = _mm_add_ps(_mm_mul_ps(x##i, pd_f0125), pd_cj_rnd); \
++ tmp1 = _mm_add_ps(_mm_mul_ps(y##i, pd_f0125), pd_cj_rnd); \
++ out##i = _mm_packs_epi32(_mm_cvttps_epi32(tmp0), _mm_cvttps_epi32(tmp1));
++
++ {
++ __m128 pd_cj_rnd = _mm_set1_ps(0.5f + CENTERJSAMPLE),
++ pd_f0125 = _mm_set1_ps(0.125f);
++
++ OUT_ROUND(0) OUT_ROUND(1)
++ OUT_ROUND(2) OUT_ROUND(3)
++ OUT_ROUND(4) OUT_ROUND(5)
++ OUT_ROUND(6) OUT_ROUND(7)
++ }
++ row0 = _mm_packus_epi16(out0, out1);
++ row1 = _mm_packus_epi16(out2, out3);
++ row2 = _mm_packus_epi16(out4, out5);
++ row3 = _mm_packus_epi16(out6, out7);
++
++ TRANSPOSE8(row, col) TRANSPOSE8(col, row) TRANSPOSE8(row, col)
++#else /* faster, differs slightly in rounding */
++#define OUT_ROUND(i, a, b) out##i = _mm_blendv_epi8( \
++ _mm_slli_epi32(_mm_castps_si128(_mm_add_ps(b, pd_round)), 16), \
++ _mm_castps_si128(_mm_add_ps(a, pd_round)), pd_mask);
++
++ {
++ __m128i pd_mask = _mm_set1_epi32(0xffff);
++ __m128 pd_round = _mm_set1_ps((3 << 22 | CENTERJSAMPLE) * 8);
++
++ OUT_ROUND(0, x0, x4) OUT_ROUND(1, y0, y4)
++ OUT_ROUND(2, x1, x5) OUT_ROUND(3, y1, y5)
++ OUT_ROUND(4, x2, x6) OUT_ROUND(5, y2, y6)
++ OUT_ROUND(6, x3, x7) OUT_ROUND(7, y3, y7)
++ }
++ row0 = _mm_packus_epi16(out0, out1);
++ row1 = _mm_packus_epi16(out2, out3);
++ row2 = _mm_packus_epi16(out4, out5);
++ row3 = _mm_packus_epi16(out6, out7);
++
++ TRANSPOSE8(row, out) TRANSPOSE8(out, col)
++#endif
++ VEC_STL(output_buf[0] + output_col, col0);
++ VEC_STH(output_buf[1] + output_col, col0);
++ VEC_STL(output_buf[2] + output_col, col1);
++ VEC_STH(output_buf[3] + output_col, col1);
++ VEC_STL(output_buf[4] + output_col, col2);
++ VEC_STH(output_buf[5] + output_col, col2);
++ VEC_STL(output_buf[6] + output_col, col3);
++ VEC_STH(output_buf[7] + output_col, col3);
++}
+diff --git a/simd/e2k/jidctfst-e2k.c b/simd/e2k/jidctfst-e2k.c
+new file mode 100644
+index 0000000..18bc425
+--- /dev/null
++++ b/simd/e2k/jidctfst-e2k.c
+@@ -0,0 +1,187 @@
++/*
++ * Elbrus optimizations for libjpeg-turbo
++ *
++ * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved.
++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd
++ *
++ * This software is provided 'as-is', without any express or implied
++ * warranty. In no event will the authors be held liable for any damages
++ * arising from the use of this software. 
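[The non-JSIMD_SAME_ROUNDING path above uses the classic magic-number float-to-int trick: adding a large power-of-two constant forces the rounded integer into the low mantissa bits, which _mm_castps_si128 then exposes without a convert instruction. The patch's pd_round additionally folds in the 8x AAN output scaling and the CENTERJSAMPLE bias; shown here is only the core trick, standalone, assuming IEEE-754 float, round-to-nearest, and two's-complement narrowing as in the SIMD lanes:

#include <assert.h>
#include <stdint.h>
#include <string.h>

int main(void)
{
  float magic = (float)(3 << 22);       /* 1.5 * 2^23, ulp is exactly 1.0 */
  int v;
  for (v = -1000; v <= 1000; v++) {
    float f = magic + (float)v;         /* rounds v into the mantissa */
    uint32_t bits;
    memcpy(&bits, &f, sizeof(bits));    /* scalar _mm_castps_si128 */
    assert((int16_t)bits == v);         /* low 16 mantissa bits hold v */
  }
  return 0;
}]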
++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. ++ */ ++ ++/* FAST INTEGER INVERSE DCT */ ++ ++#include "jsimd_e2k.h" ++ ++ ++#define F_1_082 277 /* FIX(1.082392200) */ ++#define F_1_414 362 /* FIX(1.414213562) */ ++#define F_1_847 473 /* FIX(1.847759065) */ ++#define F_2_613 669 /* FIX(2.613125930) */ ++#define F_1_613 (F_2_613 - 256) /* FIX(2.613125930) - FIX(1) */ ++ ++#define CONST_BITS 8 ++#define PASS1_BITS 2 ++#define PRE_MULTIPLY_SCALE_BITS 2 ++#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) ++ ++ ++#define DO_IDCT(in) { \ ++ /* Even part */ \ ++ \ ++ tmp10 = _mm_add_epi16(in##0, in##4); \ ++ tmp11 = _mm_sub_epi16(in##0, in##4); \ ++ tmp13 = _mm_add_epi16(in##2, in##6); \ ++ \ ++ tmp12 = _mm_sub_epi16(in##2, in##6); \ ++ tmp12 = _mm_slli_epi16(tmp12, PRE_MULTIPLY_SCALE_BITS); \ ++ tmp12 = _mm_mulhi_epi16(tmp12, pw_F1414); \ ++ tmp12 = _mm_sub_epi16(tmp12, tmp13); \ ++ \ ++ tmp0 = _mm_add_epi16(tmp10, tmp13); \ ++ tmp3 = _mm_sub_epi16(tmp10, tmp13); \ ++ tmp1 = _mm_add_epi16(tmp11, tmp12); \ ++ tmp2 = _mm_sub_epi16(tmp11, tmp12); \ ++ \ ++ /* Odd part */ \ ++ \ ++ z13 = _mm_add_epi16(in##5, in##3); \ ++ z10 = _mm_sub_epi16(in##5, in##3); \ ++ z10s = _mm_slli_epi16(z10, PRE_MULTIPLY_SCALE_BITS); \ ++ z11 = _mm_add_epi16(in##1, in##7); \ ++ z12s = _mm_sub_epi16(in##1, in##7); \ ++ z12s = _mm_slli_epi16(z12s, PRE_MULTIPLY_SCALE_BITS); \ ++ \ ++ tmp11 = _mm_sub_epi16(z11, z13); \ ++ tmp11 = _mm_slli_epi16(tmp11, PRE_MULTIPLY_SCALE_BITS); \ ++ tmp11 = _mm_mulhi_epi16(tmp11, pw_F1414); \ ++ \ ++ tmp7 = _mm_add_epi16(z11, z13); \ ++ \ ++ /* To avoid overflow... 
\ ++ * \ ++ * (Original) \ ++ * tmp12 = -2.613125930 * z10 + z5; \ ++ * \ ++ * (This implementation) \ ++ * tmp12 = (-1.613125930 - 1) * z10 + z5; \ ++ * = -1.613125930 * z10 - z10 + z5; \ ++ */ \ ++ \ ++ z5 = _mm_add_epi16(z10s, z12s); \ ++ z5 = _mm_mulhi_epi16(z5, pw_F1847); \ ++ \ ++ tmp10 = _mm_mulhi_epi16(z12s, pw_F1082); \ ++ tmp10 = _mm_sub_epi16(tmp10, z5); \ ++ tmp12 = _mm_add_epi16(_mm_mulhi_epi16(z10s, pw_MF1613), z5); \ ++ tmp12 = _mm_sub_epi16(tmp12, z10); \ ++ \ ++ tmp6 = _mm_sub_epi16(tmp12, tmp7); \ ++ tmp5 = _mm_sub_epi16(tmp11, tmp6); \ ++ tmp4 = _mm_add_epi16(tmp10, tmp5); \ ++ \ ++ out0 = _mm_add_epi16(tmp0, tmp7); \ ++ out1 = _mm_add_epi16(tmp1, tmp6); \ ++ out2 = _mm_add_epi16(tmp2, tmp5); \ ++ out3 = _mm_sub_epi16(tmp3, tmp4); \ ++ out4 = _mm_add_epi16(tmp3, tmp4); \ ++ out5 = _mm_sub_epi16(tmp2, tmp5); \ ++ out6 = _mm_sub_epi16(tmp1, tmp6); \ ++ out7 = _mm_sub_epi16(tmp0, tmp7); \ ++} ++ ++ ++void jsimd_idct_ifast_e2k(void *dct_table_, JCOEFPTR coef_block, ++ JSAMPARRAY output_buf, JDIMENSION output_col) ++{ ++ short *dct_table = (short *)dct_table_; ++ ++ __m128i row0, row1, row2, row3, row4, row5, row6, row7, ++ col0, col1, col2, col3, col4, col5, col6, col7, ++ quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7, ++ tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13, ++ z5, z10, z10s, z11, z12s, z13, ++ out0, out1, out2, out3, out4, out5, out6, out7; ++ ++ /* Constants */ ++ __m128i pw_F1414 = _mm_set1_epi16(F_1_414 << CONST_SHIFT), ++ pw_F1847 = _mm_set1_epi16(F_1_847 << CONST_SHIFT), ++ pw_MF1613 = _mm_set1_epi16(-F_1_613 << CONST_SHIFT), ++ pw_F1082 = _mm_set1_epi16(F_1_082 << CONST_SHIFT); ++ ++ /* Pass 1: process columns */ ++ ++ col0 = VEC_LD(coef_block + 0 * 8); ++ col1 = VEC_LD(coef_block + 1 * 8); ++ col2 = VEC_LD(coef_block + 2 * 8); ++ col3 = VEC_LD(coef_block + 3 * 8); ++ col4 = VEC_LD(coef_block + 4 * 8); ++ col5 = VEC_LD(coef_block + 5 * 8); ++ col6 = VEC_LD(coef_block + 6 * 8); ++ col7 = VEC_LD(coef_block + 7 * 8); ++ ++ tmp1 = _mm_or_si128(col1, col2); ++ tmp2 = _mm_or_si128(col3, col4); ++ tmp1 = _mm_or_si128(tmp1, tmp2); ++ tmp3 = _mm_or_si128(col5, col6); ++ tmp3 = _mm_or_si128(tmp3, col7); ++ tmp1 = _mm_or_si128(tmp1, tmp3); ++ ++ quant0 = VEC_LD(dct_table); ++ col0 = _mm_mullo_epi16(col0, quant0); ++ ++ if (VEC_ISZERO(tmp1)) { ++ /* AC terms all zero */ ++ ++ IDCT_SPLAT8(col0); ++ ++ } else { ++ ++ quant1 = VEC_LD(dct_table + 1 * 8); ++ quant2 = VEC_LD(dct_table + 2 * 8); ++ quant3 = VEC_LD(dct_table + 3 * 8); ++ quant4 = VEC_LD(dct_table + 4 * 8); ++ quant5 = VEC_LD(dct_table + 5 * 8); ++ quant6 = VEC_LD(dct_table + 6 * 8); ++ quant7 = VEC_LD(dct_table + 7 * 8); ++ ++ col1 = _mm_mullo_epi16(col1, quant1); ++ col2 = _mm_mullo_epi16(col2, quant2); ++ col3 = _mm_mullo_epi16(col3, quant3); ++ col4 = _mm_mullo_epi16(col4, quant4); ++ col5 = _mm_mullo_epi16(col5, quant5); ++ col6 = _mm_mullo_epi16(col6, quant6); ++ col7 = _mm_mullo_epi16(col7, quant7); ++ ++ DO_IDCT(col); ++ ++ TRANSPOSE(out, row); ++ } ++ ++ /* Pass 2: process rows */ ++ ++ DO_IDCT(row); ++ ++ out0 = _mm_srai_epi16(out0, PASS1_BITS + 3); ++ out1 = _mm_srai_epi16(out1, PASS1_BITS + 3); ++ out2 = _mm_srai_epi16(out2, PASS1_BITS + 3); ++ out3 = _mm_srai_epi16(out3, PASS1_BITS + 3); ++ out4 = _mm_srai_epi16(out4, PASS1_BITS + 3); ++ out5 = _mm_srai_epi16(out5, PASS1_BITS + 3); ++ out6 = _mm_srai_epi16(out6, PASS1_BITS + 3); ++ out7 = _mm_srai_epi16(out7, PASS1_BITS + 3); ++ ++ IDCT_SAVE(); ++} +diff --git a/simd/e2k/jidctint-e2k.c 
b/simd/e2k/jidctint-e2k.c +new file mode 100644 +index 0000000..7bb79c0 +--- /dev/null ++++ b/simd/e2k/jidctint-e2k.c +@@ -0,0 +1,294 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2014-2015, 2020, D. R. Commander. All Rights Reserved. ++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. ++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. ++ */ ++ ++/* ACCURATE INTEGER INVERSE DCT */ ++ ++#include "jsimd_e2k.h" ++ ++ ++#define F_0_298 2446 /* FIX(0.298631336) */ ++#define F_0_390 3196 /* FIX(0.390180644) */ ++#define F_0_541 4433 /* FIX(0.541196100) */ ++#define F_0_765 6270 /* FIX(0.765366865) */ ++#define F_0_899 7373 /* FIX(0.899976223) */ ++#define F_1_175 9633 /* FIX(1.175875602) */ ++#define F_1_501 12299 /* FIX(1.501321110) */ ++#define F_1_847 15137 /* FIX(1.847759065) */ ++#define F_1_961 16069 /* FIX(1.961570560) */ ++#define F_2_053 16819 /* FIX(2.053119869) */ ++#define F_2_562 20995 /* FIX(2.562915447) */ ++#define F_3_072 25172 /* FIX(3.072711026) */ ++ ++#define CONST_BITS 13 ++#define PASS1_BITS 2 ++#define DESCALE_P1 (CONST_BITS - PASS1_BITS) ++#define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3) ++ ++ ++#define DO_IDCT(in, PASS) { \ ++ /* Even part \ ++ * \ ++ * (Original) \ ++ * z1 = (z2 + z3) * 0.541196100; \ ++ * tmp2 = z1 + z3 * -1.847759065; \ ++ * tmp3 = z1 + z2 * 0.765366865; \ ++ * \ ++ * (This implementation) \ ++ * tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); \ ++ * tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; \ ++ */ \ ++ \ ++ in##26l = _mm_unpacklo_epi16(in##2, in##6); \ ++ in##26h = _mm_unpackhi_epi16(in##2, in##6); \ ++ \ ++ tmp3l = _mm_madd_epi16(in##26l, pw_f130_f054); \ ++ tmp3h = _mm_madd_epi16(in##26h, pw_f130_f054); \ ++ tmp2l = _mm_madd_epi16(in##26l, pw_f054_mf130); \ ++ tmp2h = _mm_madd_epi16(in##26h, pw_f054_mf130); \ ++ \ ++ tmp0 = _mm_add_epi16(in##0, in##4); \ ++ tmp1 = _mm_sub_epi16(in##0, in##4); \ ++ \ ++ tmp0l = _mm_unpacklo_epi16(pw_zero, tmp0); \ ++ tmp0h = _mm_unpackhi_epi16(pw_zero, tmp0); \ ++ tmp0l = _mm_srai_epi32(tmp0l, 16 - CONST_BITS); \ ++ tmp0h = _mm_srai_epi32(tmp0h, 16 - CONST_BITS); \ ++ tmp0l = _mm_add_epi32(tmp0l, pd_descale_p##PASS); \ ++ tmp0h = _mm_add_epi32(tmp0h, pd_descale_p##PASS); \ ++ \ ++ tmp10l = _mm_add_epi32(tmp0l, tmp3l); \ ++ tmp10h = _mm_add_epi32(tmp0h, tmp3h); \ ++ tmp13l = _mm_sub_epi32(tmp0l, tmp3l); \ ++ tmp13h = _mm_sub_epi32(tmp0h, tmp3h); \ ++ \ ++ tmp1l = _mm_unpacklo_epi16(pw_zero, tmp1); \ ++ tmp1h = _mm_unpackhi_epi16(pw_zero, tmp1); \ ++ tmp1l = _mm_srai_epi32(tmp1l, 16 - CONST_BITS); \ ++ tmp1h = _mm_srai_epi32(tmp1h, 16 - CONST_BITS); \ ++ tmp1l = _mm_add_epi32(tmp1l, pd_descale_p##PASS); \ ++ tmp1h = 
_mm_add_epi32(tmp1h, pd_descale_p##PASS); \ ++ \ ++ tmp11l = _mm_add_epi32(tmp1l, tmp2l); \ ++ tmp11h = _mm_add_epi32(tmp1h, tmp2h); \ ++ tmp12l = _mm_sub_epi32(tmp1l, tmp2l); \ ++ tmp12h = _mm_sub_epi32(tmp1h, tmp2h); \ ++ \ ++ /* Odd part */ \ ++ \ ++ z3 = _mm_add_epi16(in##3, in##7); \ ++ z4 = _mm_add_epi16(in##1, in##5); \ ++ \ ++ /* (Original) \ ++ * z5 = (z3 + z4) * 1.175875602; \ ++ * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; \ ++ * z3 += z5; z4 += z5; \ ++ * \ ++ * (This implementation) \ ++ * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \ ++ * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \ ++ */ \ ++ \ ++ z34l = _mm_unpacklo_epi16(z3, z4); \ ++ z34h = _mm_unpackhi_epi16(z3, z4); \ ++ \ ++ z3l = _mm_madd_epi16(z34l, pw_mf078_f117); \ ++ z3h = _mm_madd_epi16(z34h, pw_mf078_f117); \ ++ z4l = _mm_madd_epi16(z34l, pw_f117_f078); \ ++ z4h = _mm_madd_epi16(z34h, pw_f117_f078); \ ++ \ ++ /* (Original) \ ++ * z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; \ ++ * tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; \ ++ * tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; \ ++ * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; \ ++ * tmp0 += z1 + z3; tmp1 += z2 + z4; \ ++ * tmp2 += z2 + z3; tmp3 += z1 + z4; \ ++ * \ ++ * (This implementation) \ ++ * tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; \ ++ * tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; \ ++ * tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); \ ++ * tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); \ ++ * tmp0 += z3; tmp1 += z4; \ ++ * tmp2 += z3; tmp3 += z4; \ ++ */ \ ++ \ ++ in##71l = _mm_unpacklo_epi16(in##7, in##1); \ ++ in##71h = _mm_unpackhi_epi16(in##7, in##1); \ ++ \ ++ tmp0l = _mm_add_epi32(_mm_madd_epi16(in##71l, pw_mf060_mf089), z3l); \ ++ tmp0h = _mm_add_epi32(_mm_madd_epi16(in##71h, pw_mf060_mf089), z3h); \ ++ tmp3l = _mm_add_epi32(_mm_madd_epi16(in##71l, pw_mf089_f060), z4l); \ ++ tmp3h = _mm_add_epi32(_mm_madd_epi16(in##71h, pw_mf089_f060), z4h); \ ++ \ ++ in##53l = _mm_unpacklo_epi16(in##5, in##3); \ ++ in##53h = _mm_unpackhi_epi16(in##5, in##3); \ ++ \ ++ tmp1l = _mm_add_epi32(_mm_madd_epi16(in##53l, pw_mf050_mf256), z4l); \ ++ tmp1h = _mm_add_epi32(_mm_madd_epi16(in##53h, pw_mf050_mf256), z4h); \ ++ tmp2l = _mm_add_epi32(_mm_madd_epi16(in##53l, pw_mf256_f050), z3l); \ ++ tmp2h = _mm_add_epi32(_mm_madd_epi16(in##53h, pw_mf256_f050), z3h); \ ++ \ ++ /* Final output stage */ \ ++ \ ++ out0l = _mm_add_epi32(tmp10l, tmp3l); \ ++ out0h = _mm_add_epi32(tmp10h, tmp3h); \ ++ out7l = _mm_sub_epi32(tmp10l, tmp3l); \ ++ out7h = _mm_sub_epi32(tmp10h, tmp3h); \ ++ \ ++ out0l = _mm_srai_epi32(out0l, DESCALE_P##PASS); \ ++ out0h = _mm_srai_epi32(out0h, DESCALE_P##PASS); \ ++ out7l = _mm_srai_epi32(out7l, DESCALE_P##PASS); \ ++ out7h = _mm_srai_epi32(out7h, DESCALE_P##PASS); \ ++ \ ++ out0 = _mm_packs_epi32(out0l, out0h); \ ++ out7 = _mm_packs_epi32(out7l, out7h); \ ++ \ ++ out1l = _mm_add_epi32(tmp11l, tmp2l); \ ++ out1h = _mm_add_epi32(tmp11h, tmp2h); \ ++ out6l = _mm_sub_epi32(tmp11l, tmp2l); \ ++ out6h = _mm_sub_epi32(tmp11h, tmp2h); \ ++ \ ++ out1l = _mm_srai_epi32(out1l, DESCALE_P##PASS); \ ++ out1h = _mm_srai_epi32(out1h, DESCALE_P##PASS); \ ++ out6l = _mm_srai_epi32(out6l, DESCALE_P##PASS); \ ++ out6h = _mm_srai_epi32(out6h, DESCALE_P##PASS); \ ++ \ ++ out1 = _mm_packs_epi32(out1l, out1h); \ ++ out6 = _mm_packs_epi32(out6l, out6h); \ ++ \ ++ out2l = _mm_add_epi32(tmp12l, tmp1l); \ ++ out2h = _mm_add_epi32(tmp12h, tmp1h); \ ++ out5l 
= _mm_sub_epi32(tmp12l, tmp1l); \ ++ out5h = _mm_sub_epi32(tmp12h, tmp1h); \ ++ \ ++ out2l = _mm_srai_epi32(out2l, DESCALE_P##PASS); \ ++ out2h = _mm_srai_epi32(out2h, DESCALE_P##PASS); \ ++ out5l = _mm_srai_epi32(out5l, DESCALE_P##PASS); \ ++ out5h = _mm_srai_epi32(out5h, DESCALE_P##PASS); \ ++ \ ++ out2 = _mm_packs_epi32(out2l, out2h); \ ++ out5 = _mm_packs_epi32(out5l, out5h); \ ++ \ ++ out3l = _mm_add_epi32(tmp13l, tmp0l); \ ++ out3h = _mm_add_epi32(tmp13h, tmp0h); \ ++ out4l = _mm_sub_epi32(tmp13l, tmp0l); \ ++ out4h = _mm_sub_epi32(tmp13h, tmp0h); \ ++ \ ++ out3l = _mm_srai_epi32(out3l, DESCALE_P##PASS); \ ++ out3h = _mm_srai_epi32(out3h, DESCALE_P##PASS); \ ++ out4l = _mm_srai_epi32(out4l, DESCALE_P##PASS); \ ++ out4h = _mm_srai_epi32(out4h, DESCALE_P##PASS); \ ++ \ ++ out3 = _mm_packs_epi32(out3l, out3h); \ ++ out4 = _mm_packs_epi32(out4l, out4h); \ ++} ++ ++ ++void jsimd_idct_islow_e2k(void *dct_table_, JCOEFPTR coef_block, ++ JSAMPARRAY output_buf, JDIMENSION output_col) ++{ ++ short *dct_table = (short *)dct_table_; ++ ++ __m128i row0, row1, row2, row3, row4, row5, row6, row7, ++ col0, col1, col2, col3, col4, col5, col6, col7, ++ quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7, ++ tmp0, tmp1, tmp2, tmp3, z3, z4, ++ z34l, z34h, col71l, col71h, col26l, col26h, col53l, col53h, ++ row71l, row71h, row26l, row26h, row53l, row53h, ++ out0, out1, out2, out3, out4, out5, out6, out7; ++ __m128i tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h, ++ tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h, ++ z3l, z3h, z4l, z4h, ++ out0l, out0h, out1l, out1h, out2l, out2h, out3l, out3h, out4l, out4h, ++ out5l, out5h, out6l, out6h, out7l, out7h; ++ ++ /* Constants */ ++ __m128i pw_zero = _mm_setzero_si128(), ++ pw_f130_f054 = _mm_setr_epi16(__4X2(F_0_541 + F_0_765, F_0_541)), ++ pw_f054_mf130 = _mm_setr_epi16(__4X2(F_0_541, F_0_541 - F_1_847)), ++ pw_mf078_f117 = _mm_setr_epi16(__4X2(F_1_175 - F_1_961, F_1_175)), ++ pw_f117_f078 = _mm_setr_epi16(__4X2(F_1_175, F_1_175 - F_0_390)), ++ pw_mf060_mf089 = _mm_setr_epi16(__4X2(F_0_298 - F_0_899, -F_0_899)), ++ pw_mf089_f060 = _mm_setr_epi16(__4X2(-F_0_899, F_1_501 - F_0_899)), ++ pw_mf050_mf256 = _mm_setr_epi16(__4X2(F_2_053 - F_2_562, -F_2_562)), ++ pw_mf256_f050 = _mm_setr_epi16(__4X2(-F_2_562, F_3_072 - F_2_562)), ++ pd_descale_p1 = _mm_set1_epi32(1 << (DESCALE_P1 - 1)), ++ pd_descale_p2 = _mm_set1_epi32(1 << (DESCALE_P2 - 1)); ++ ++ /* Pass 1: process columns */ ++ ++ col0 = VEC_LD(coef_block + 0 * 8); ++ col1 = VEC_LD(coef_block + 1 * 8); ++ col2 = VEC_LD(coef_block + 2 * 8); ++ col3 = VEC_LD(coef_block + 3 * 8); ++ col4 = VEC_LD(coef_block + 4 * 8); ++ col5 = VEC_LD(coef_block + 5 * 8); ++ col6 = VEC_LD(coef_block + 6 * 8); ++ col7 = VEC_LD(coef_block + 7 * 8); ++ ++ tmp1 = _mm_or_si128(col1, col2); ++ tmp2 = _mm_or_si128(col3, col4); ++ tmp1 = _mm_or_si128(tmp1, tmp2); ++ tmp3 = _mm_or_si128(col5, col6); ++ tmp3 = _mm_or_si128(tmp3, col7); ++ tmp1 = _mm_or_si128(tmp1, tmp3); ++ ++ quant0 = VEC_LD(dct_table); ++ col0 = _mm_mullo_epi16(col0, quant0); ++ ++ if (VEC_ISZERO(tmp1)) { ++ /* AC terms all zero */ ++ ++ col0 = _mm_slli_epi16(col0, PASS1_BITS); ++ IDCT_SPLAT8(col0); ++ ++ } else { ++ ++ quant1 = VEC_LD(dct_table + 1 * 8); ++ quant2 = VEC_LD(dct_table + 2 * 8); ++ quant3 = VEC_LD(dct_table + 3 * 8); ++ quant4 = VEC_LD(dct_table + 4 * 8); ++ quant5 = VEC_LD(dct_table + 5 * 8); ++ quant6 = VEC_LD(dct_table + 6 * 8); ++ quant7 = VEC_LD(dct_table + 7 * 8); ++ ++ col1 = _mm_mullo_epi16(col1, quant1); ++ col2 = 
_mm_mullo_epi16(col2, quant2); ++ col3 = _mm_mullo_epi16(col3, quant3); ++ col4 = _mm_mullo_epi16(col4, quant4); ++ col5 = _mm_mullo_epi16(col5, quant5); ++ col6 = _mm_mullo_epi16(col6, quant6); ++ col7 = _mm_mullo_epi16(col7, quant7); ++ ++ DO_IDCT(col, 1); ++ ++ TRANSPOSE(out, row); ++ } ++ ++ /* Pass 2: process rows */ ++ ++ DO_IDCT(row, 2); ++ ++ IDCT_SAVE(); ++} +diff --git a/simd/e2k/jquantf-e2k.c b/simd/e2k/jquantf-e2k.c +new file mode 100644 +index 0000000..106e99a +--- /dev/null ++++ b/simd/e2k/jquantf-e2k.c +@@ -0,0 +1,121 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved. ++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. ++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. ++ */ ++ ++/* FLOAT QUANTIZATION AND SAMPLE CONVERSION */ ++ ++#include "jsimd_e2k.h" ++ ++#define LOAD_ROW(row) in##row = VEC_LD8(sample_data[row] + start_col) ++#define STORE_ROW(i) \ ++ in0 = _mm_unpacklo_epi16(out##i, pb_zero); \ ++ in1 = _mm_unpackhi_epi16(out##i, pb_zero); \ ++ in0 = _mm_sub_epi32(in0, pd_cj); \ ++ in1 = _mm_sub_epi32(in1, pd_cj); \ ++ _mm_storeu_ps(workspace + i * 8, _mm_cvtepi32_ps(in0)); \ ++ _mm_storeu_ps(workspace + i * 8 + 4, _mm_cvtepi32_ps(in1)); ++ ++void jsimd_convsamp_float_e2k(JSAMPARRAY sample_data, JDIMENSION start_col, ++ FAST_FLOAT *workspace) ++{ ++ __m128i in0, in1, in2, in3, in4, in5, in6, in7; ++ __m128i out0, out1, out2, out3, out4, out5, out6, out7; ++ ++ /* Constants */ ++ __m128i pd_cj = _mm_set1_epi32(CENTERJSAMPLE), ++ pb_zero = _mm_setzero_si128(); ++ ++ LOAD_ROW(0); ++ LOAD_ROW(1); ++ LOAD_ROW(2); ++ LOAD_ROW(3); ++ LOAD_ROW(4); ++ LOAD_ROW(5); ++ LOAD_ROW(6); ++ LOAD_ROW(7); ++ ++ out0 = _mm_unpacklo_epi8(in0, pb_zero); ++ out1 = _mm_unpacklo_epi8(in1, pb_zero); ++ out2 = _mm_unpacklo_epi8(in2, pb_zero); ++ out3 = _mm_unpacklo_epi8(in3, pb_zero); ++ out4 = _mm_unpacklo_epi8(in4, pb_zero); ++ out5 = _mm_unpacklo_epi8(in5, pb_zero); ++ out6 = _mm_unpacklo_epi8(in6, pb_zero); ++ out7 = _mm_unpacklo_epi8(in7, pb_zero); ++ ++ STORE_ROW(0) ++ STORE_ROW(1) ++ STORE_ROW(2) ++ STORE_ROW(3) ++ STORE_ROW(4) ++ STORE_ROW(5) ++ STORE_ROW(6) ++ STORE_ROW(7) ++} ++ ++void jsimd_quantize_float_e2k(JCOEFPTR coef_block, FAST_FLOAT *divisors, ++ FAST_FLOAT *workspace) ++{ ++ int i = 0; ++ __m128 row0, row1, row2, row3, recip0, recip1, recip2, recip3; ++ __m128i out0, out1; ++#ifdef JSIMD_SAME_ROUNDING ++ __m128 pd_f16k5 = _mm_set1_ps(16384.5f); ++ __m128i pw_m16k = _mm_set1_epi16(-16384); ++#endif ++ ++ PRAGMA_E2K("ivdep") ++ for (; i < 4; i++, workspace += 16, divisors += 16, coef_block += 16) { ++ row0 = _mm_loadu_ps(workspace + 0 * 4); ++ row1 = _mm_loadu_ps(workspace + 1 * 4); ++ 
row2 = _mm_loadu_ps(workspace + 2 * 4); ++ row3 = _mm_loadu_ps(workspace + 3 * 4); ++ ++ recip0 = _mm_loadu_ps(divisors + 0 * 4); ++ recip1 = _mm_loadu_ps(divisors + 1 * 4); ++ recip2 = _mm_loadu_ps(divisors + 2 * 4); ++ recip3 = _mm_loadu_ps(divisors + 3 * 4); ++ ++ row0 = _mm_mul_ps(row0, recip0); ++ row1 = _mm_mul_ps(row1, recip1); ++ row2 = _mm_mul_ps(row2, recip2); ++ row3 = _mm_mul_ps(row3, recip3); ++ ++#ifdef JSIMD_SAME_ROUNDING ++ row0 = _mm_add_ps(row0, pd_f16k5); ++ row1 = _mm_add_ps(row1, pd_f16k5); ++ row2 = _mm_add_ps(row2, pd_f16k5); ++ row3 = _mm_add_ps(row3, pd_f16k5); ++ ++ out0 = _mm_packs_epi32(_mm_cvttps_epi32(row0), _mm_cvttps_epi32(row1)); ++ out1 = _mm_packs_epi32(_mm_cvttps_epi32(row2), _mm_cvttps_epi32(row3)); ++ ++ out0 = _mm_add_epi16(out0, pw_m16k); ++ out1 = _mm_add_epi16(out1, pw_m16k); ++#else ++ out0 = _mm_packs_epi32(_mm_cvtps_epi32(row0), _mm_cvtps_epi32(row1)); ++ out1 = _mm_packs_epi32(_mm_cvtps_epi32(row2), _mm_cvtps_epi32(row3)); ++#endif ++ VEC_ST(coef_block, out0); ++ VEC_ST(coef_block + 8, out1); ++ } ++} +diff --git a/simd/e2k/jquanti-e2k.c b/simd/e2k/jquanti-e2k.c +new file mode 100644 +index 0000000..a3e1ff1 +--- /dev/null ++++ b/simd/e2k/jquanti-e2k.c +@@ -0,0 +1,178 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved. ++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. ++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. 
++ */
++
++/* INTEGER QUANTIZATION AND SAMPLE CONVERSION */
++
++#include "jsimd_e2k.h"
++
++#define LOAD_ROW(row) in##row = VEC_LD8(sample_data[row] + start_col)
++
++void jsimd_convsamp_e2k(JSAMPARRAY sample_data, JDIMENSION start_col,
++ DCTELEM *workspace)
++{
++ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
++ __m128i out0, out1, out2, out3, out4, out5, out6, out7;
++
++ /* Constants */
++ __m128i pw_cj = _mm_set1_epi16(CENTERJSAMPLE),
++ pb_zero = _mm_setzero_si128();
++
++ LOAD_ROW(0);
++ LOAD_ROW(1);
++ LOAD_ROW(2);
++ LOAD_ROW(3);
++ LOAD_ROW(4);
++ LOAD_ROW(5);
++ LOAD_ROW(6);
++ LOAD_ROW(7);
++
++ out0 = _mm_unpacklo_epi8(in0, pb_zero);
++ out1 = _mm_unpacklo_epi8(in1, pb_zero);
++ out2 = _mm_unpacklo_epi8(in2, pb_zero);
++ out3 = _mm_unpacklo_epi8(in3, pb_zero);
++ out4 = _mm_unpacklo_epi8(in4, pb_zero);
++ out5 = _mm_unpacklo_epi8(in5, pb_zero);
++ out6 = _mm_unpacklo_epi8(in6, pb_zero);
++ out7 = _mm_unpacklo_epi8(in7, pb_zero);
++
++ out0 = _mm_sub_epi16(out0, pw_cj);
++ out1 = _mm_sub_epi16(out1, pw_cj);
++ out2 = _mm_sub_epi16(out2, pw_cj);
++ out3 = _mm_sub_epi16(out3, pw_cj);
++ out4 = _mm_sub_epi16(out4, pw_cj);
++ out5 = _mm_sub_epi16(out5, pw_cj);
++ out6 = _mm_sub_epi16(out6, pw_cj);
++ out7 = _mm_sub_epi16(out7, pw_cj);
++
++ VEC_ST(workspace + 0 * 8, out0);
++ VEC_ST(workspace + 1 * 8, out1);
++ VEC_ST(workspace + 2 * 8, out2);
++ VEC_ST(workspace + 3 * 8, out3);
++ VEC_ST(workspace + 4 * 8, out4);
++ VEC_ST(workspace + 5 * 8, out5);
++ VEC_ST(workspace + 6 * 8, out6);
++ VEC_ST(workspace + 7 * 8, out7);
++}
++
++
++#define WORD_BIT 16
++#define MULTIPLY(vs0, vs1, out) out = _mm_mulhi_epu16(vs0, vs1)
++
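++/* The divisors table holds three consecutive DCTSIZE2 blocks: 16-bit
++ * reciprocals at [0, 63], pre-multiply corrections at [64, 127] and
++ * post-multiply scales at [128, 191]. Each coefficient is quantized as
++ *   sign(x) * ((((|x| + corr) * recip) >> 16) * scale >> 16),
++ * so division by the quantization step is carried out entirely with
++ * unsigned 16-bit high multiplies (_mm_mulhi_epu16), no per-lane divides.
++ */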
++void jsimd_quantize_e2k(JCOEFPTR coef_block, DCTELEM *divisors,
++ DCTELEM *workspace)
++{
++ __m128i row0, row1, row2, row3, row4, row5, row6, row7,
++ row0s, row1s, row2s, row3s, row4s, row5s, row6s, row7s,
++ corr0, corr1, corr2, corr3, corr4, corr5, corr6, corr7,
++ recip0, recip1, recip2, recip3, recip4, recip5, recip6, recip7,
++ scale0, scale1, scale2, scale3, scale4, scale5, scale6, scale7;
++
++ row0s = VEC_LD(workspace + 0 * 8);
++ row1s = VEC_LD(workspace + 1 * 8);
++ row2s = VEC_LD(workspace + 2 * 8);
++ row3s = VEC_LD(workspace + 3 * 8);
++ row4s = VEC_LD(workspace + 4 * 8);
++ row5s = VEC_LD(workspace + 5 * 8);
++ row6s = VEC_LD(workspace + 6 * 8);
++ row7s = VEC_LD(workspace + 7 * 8);
++ row0 = _mm_abs_epi16(row0s);
++ row1 = _mm_abs_epi16(row1s);
++ row2 = _mm_abs_epi16(row2s);
++ row3 = _mm_abs_epi16(row3s);
++ row4 = _mm_abs_epi16(row4s);
++ row5 = _mm_abs_epi16(row5s);
++ row6 = _mm_abs_epi16(row6s);
++ row7 = _mm_abs_epi16(row7s);
++
++ corr0 = VEC_LD(divisors + DCTSIZE2 + 0 * 8);
++ corr1 = VEC_LD(divisors + DCTSIZE2 + 1 * 8);
++ corr2 = VEC_LD(divisors + DCTSIZE2 + 2 * 8);
++ corr3 = VEC_LD(divisors + DCTSIZE2 + 3 * 8);
++ corr4 = VEC_LD(divisors + DCTSIZE2 + 4 * 8);
++ corr5 = VEC_LD(divisors + DCTSIZE2 + 5 * 8);
++ corr6 = VEC_LD(divisors + DCTSIZE2 + 6 * 8);
++ corr7 = VEC_LD(divisors + DCTSIZE2 + 7 * 8);
++
++ row0 = _mm_add_epi16(row0, corr0);
++ row1 = _mm_add_epi16(row1, corr1);
++ row2 = _mm_add_epi16(row2, corr2);
++ row3 = _mm_add_epi16(row3, corr3);
++ row4 = _mm_add_epi16(row4, corr4);
++ row5 = _mm_add_epi16(row5, corr5);
++ row6 = _mm_add_epi16(row6, corr6);
++ row7 = _mm_add_epi16(row7, corr7);
++
++ recip0 = VEC_LD(divisors + 0 * 8);
++ recip1 = VEC_LD(divisors + 1 * 8);
++ recip2 = VEC_LD(divisors + 2 * 8);
++ recip3 = VEC_LD(divisors + 3 * 8);
++ recip4 = VEC_LD(divisors + 4 * 8);
++ recip5 = VEC_LD(divisors + 5 * 8);
++ recip6 = VEC_LD(divisors + 6 * 8);
++ recip7 = VEC_LD(divisors + 7 * 8);
++
++ MULTIPLY(row0, recip0, row0);
++ MULTIPLY(row1, recip1, row1);
++ MULTIPLY(row2, recip2, row2);
++ MULTIPLY(row3, recip3, row3);
++ MULTIPLY(row4, recip4, row4);
++ MULTIPLY(row5, recip5, row5);
++ MULTIPLY(row6, recip6, row6);
++ MULTIPLY(row7, recip7, row7);
++
++ scale0 = VEC_LD(divisors + DCTSIZE2 * 2 + 0 * 8);
++ scale1 = VEC_LD(divisors + DCTSIZE2 * 2 + 1 * 8);
++ scale2 = VEC_LD(divisors + DCTSIZE2 * 2 + 2 * 8);
++ scale3 = VEC_LD(divisors + DCTSIZE2 * 2 + 3 * 8);
++ scale4 = VEC_LD(divisors + DCTSIZE2 * 2 + 4 * 8);
++ scale5 = VEC_LD(divisors + DCTSIZE2 * 2 + 5 * 8);
++ scale6 = VEC_LD(divisors + DCTSIZE2 * 2 + 6 * 8);
++ scale7 = VEC_LD(divisors + DCTSIZE2 * 2 + 7 * 8);
++
++ MULTIPLY(row0, scale0, row0);
++ MULTIPLY(row1, scale1, row1);
++ MULTIPLY(row2, scale2, row2);
++ MULTIPLY(row3, scale3, row3);
++ MULTIPLY(row4, scale4, row4);
++ MULTIPLY(row5, scale5, row5);
++ MULTIPLY(row6, scale6, row6);
++ MULTIPLY(row7, scale7, row7);
++
++ row0 = _mm_sign_epi16(row0, row0s);
++ row1 = _mm_sign_epi16(row1, row1s);
++ row2 = _mm_sign_epi16(row2, row2s);
++ row3 = _mm_sign_epi16(row3, row3s);
++ row4 = _mm_sign_epi16(row4, row4s);
++ row5 = _mm_sign_epi16(row5, row5s);
++ row6 = _mm_sign_epi16(row6, row6s);
++ row7 = _mm_sign_epi16(row7, row7s);
++
++ VEC_ST(coef_block + 0 * 8, row0);
++ VEC_ST(coef_block + 1 * 8, row1);
++ VEC_ST(coef_block + 2 * 8, row2);
++ VEC_ST(coef_block + 3 * 8, row3);
++ VEC_ST(coef_block + 4 * 8, row4);
++ VEC_ST(coef_block + 5 * 8, row5);
++ VEC_ST(coef_block + 6 * 8, row6);
++ VEC_ST(coef_block + 7 * 8, row7);
++}
+diff --git a/simd/e2k/jsimd.c b/simd/e2k/jsimd.c
+new file mode 100644
+index 0000000..f8c0465
+--- /dev/null
++++ b/simd/e2k/jsimd.c
+@@ -0,0 +1,761 @@
++/*
++ * jsimd_e2k.c
++ *
++ * Copyright 2009 Pierre Ossman for Cendio AB
++ * Copyright (C) 2009-2011, 2014-2016, 2018, D. R. Commander.
++ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd
++ *
++ * Based on the x86 SIMD extension for IJG JPEG library,
++ * Copyright (C) 1999-2006, MIYASAKA Masaru.
++ * For conditions of distribution and use, see copyright notice in jsimdext.inc
++ *
++ * This file contains the interface between the "normal" portions
++ * of the library and the SIMD implementations when running on the
++ * Elbrus (e2k) architecture.
++ */
++
++#define JPEG_INTERNALS
++#include "../../jinclude.h"
++#include "../../jpeglib.h"
++#include "../../jsimd.h"
++#include "../../jdct.h"
++#include "../../jsimddct.h"
++#include "../jsimd.h"
++#include "jsimd_api_e2k.h"
++
++static unsigned int simd_support = ~0;
++static unsigned int simd_huffman = 1;
++
++/*
++ * Check what SIMD accelerations are supported.
++ *
++ * FIXME: This code is racy under a multi-threaded environment.
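++ * Concurrent first calls may both observe simd_support == ~0U and run
++ * the initialization twice; since every writer stores the same values,
++ * the race is benign in practice, but a proper one-time guard would be
++ * more robust.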
++ */ ++LOCAL(void) ++init_simd(void) ++{ ++#ifndef NO_GETENV ++ char *env = NULL; ++#endif ++ ++ if (simd_support != ~0U) ++ return; ++ ++ simd_support = JSIMD_SSE2; ++ ++#ifndef NO_GETENV ++ /* Force different settings through environment variables */ ++ env = getenv("JSIMD_FORCENONE"); ++ if ((env != NULL) && (strcmp(env, "1") == 0)) ++ simd_support = 0; ++ env = getenv("JSIMD_NOHUFFENC"); ++ if ((env != NULL) && (strcmp(env, "1") == 0)) ++ simd_huffman = 0; ++#endif ++} ++ ++static inline int color_space_idx(J_COLOR_SPACE color_space) { ++ switch (color_space) { ++ case JCS_EXT_RGB: ++ return 1 + (EXT_RGB_PIXELSIZE != 3) * 16; ++ case JCS_EXT_RGBX: ++ case JCS_EXT_RGBA: ++ return 2 + (EXT_RGBX_PIXELSIZE != 3) * 16; ++ case JCS_EXT_BGR: ++ return 3 + (EXT_BGR_PIXELSIZE != 3) * 16; ++ case JCS_EXT_BGRX: ++ case JCS_EXT_BGRA: ++ return 4 + (EXT_BGRX_PIXELSIZE != 3) * 16; ++ case JCS_EXT_XBGR: ++ case JCS_EXT_ABGR: ++ return 5 + (EXT_XBGR_PIXELSIZE != 3) * 16; ++ case JCS_EXT_XRGB: ++ case JCS_EXT_ARGB: ++ return 6 + (EXT_XRGB_PIXELSIZE != 3) * 16; ++ default: ++ break; ++ } ++ return 0 + (RGB_PIXELSIZE != 3) * 16; ++} ++ ++GLOBAL(int) ++jsimd_can_rgb_ycc(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (BITS_IN_JSAMPLE != 8) ++ return 0; ++ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(int) ++jsimd_can_rgb_gray(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (BITS_IN_JSAMPLE != 8) ++ return 0; ++ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(int) ++jsimd_can_ycc_rgb(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (BITS_IN_JSAMPLE != 8) ++ return 0; ++ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(int) ++jsimd_can_ycc_rgb565(void) ++{ ++ return 0; ++} ++ ++GLOBAL(void) ++jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, ++ JSAMPIMAGE output_buf, JDIMENSION output_row, ++ int num_rows) ++{ ++ void (*e2kfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int, int); ++ int idx = color_space_idx(cinfo->in_color_space); ++ ++ e2kfct = idx < 16 ? jsimd_rgb3_ycc_convert_e2k : ++ jsimd_rgb4_ycc_convert_e2k; ++ idx &= 15; ++ ++ e2kfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows, idx); ++} ++ ++GLOBAL(void) ++jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, ++ JSAMPIMAGE output_buf, JDIMENSION output_row, ++ int num_rows) ++{ ++ void (*e2kfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int, int); ++ int idx = color_space_idx(cinfo->in_color_space); ++ ++ e2kfct = idx < 16 ? jsimd_rgb3_gray_convert_e2k : ++ jsimd_rgb4_gray_convert_e2k; ++ idx &= 15; ++ ++ e2kfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows, idx); ++} ++ ++GLOBAL(void) ++jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, ++ JDIMENSION input_row, JSAMPARRAY output_buf, ++ int num_rows) ++{ ++ void (*e2kfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int, int); ++ int idx = color_space_idx(cinfo->out_color_space); ++ ++ e2kfct = idx < 16 ? 
jsimd_ycc_rgb3_convert_e2k : ++ jsimd_ycc_rgb4_convert_e2k; ++ idx &= 15; ++ ++ e2kfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows, idx); ++} ++ ++GLOBAL(void) ++jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, ++ JDIMENSION input_row, JSAMPARRAY output_buf, ++ int num_rows) ++{ ++} ++ ++GLOBAL(int) ++jsimd_can_h2v2_downsample(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (BITS_IN_JSAMPLE != 8) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(int) ++jsimd_can_h2v1_downsample(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (BITS_IN_JSAMPLE != 8) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(void) ++jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr, ++ JSAMPARRAY input_data, JSAMPARRAY output_data) ++{ ++ jsimd_h2v2_downsample_e2k(cinfo->image_width, cinfo->max_v_samp_factor, ++ compptr->v_samp_factor, ++ compptr->width_in_blocks, input_data, ++ output_data); ++} ++ ++GLOBAL(void) ++jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr, ++ JSAMPARRAY input_data, JSAMPARRAY output_data) ++{ ++ jsimd_h2v1_downsample_e2k(cinfo->image_width, cinfo->max_v_samp_factor, ++ compptr->v_samp_factor, ++ compptr->width_in_blocks, input_data, ++ output_data); ++} ++ ++GLOBAL(int) ++jsimd_can_h2v2_upsample(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (BITS_IN_JSAMPLE != 8) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(int) ++jsimd_can_h2v1_upsample(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (BITS_IN_JSAMPLE != 8) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(void) ++jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, ++ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) ++{ ++ jsimd_h2v2_upsample_e2k(cinfo->max_v_samp_factor, cinfo->output_width, ++ input_data, output_data_ptr); ++} ++ ++GLOBAL(void) ++jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, ++ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) ++{ ++ jsimd_h2v1_upsample_e2k(cinfo->max_v_samp_factor, cinfo->output_width, ++ input_data, output_data_ptr); ++} ++ ++GLOBAL(int) ++jsimd_can_h2v2_fancy_upsample(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (BITS_IN_JSAMPLE != 8) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(int) ++jsimd_can_h2v1_fancy_upsample(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (BITS_IN_JSAMPLE != 8) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(void) ++jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, ++ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) ++{ ++ jsimd_h2v2_fancy_upsample_e2k(cinfo->max_v_samp_factor, ++ compptr->downsampled_width, input_data, ++ output_data_ptr); ++} ++ ++GLOBAL(void) ++jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, ++ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) ++{ ++ jsimd_h2v1_fancy_upsample_e2k(cinfo->max_v_samp_factor, ++ compptr->downsampled_width, input_data, ++ output_data_ptr); ++} ++ ++GLOBAL(int) ++jsimd_can_h2v2_merged_upsample(void) ++{ ++ 
init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (BITS_IN_JSAMPLE != 8) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(int) ++jsimd_can_h2v1_merged_upsample(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (BITS_IN_JSAMPLE != 8) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(void) ++jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, ++ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf) ++{ ++ void (*e2kfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JDIMENSION, JSAMPARRAY, int); ++ int idx = color_space_idx(cinfo->out_color_space); ++ ++ e2kfct = idx < 16 ? jsimd_ycc_rgb3_merged_upsample_e2k : ++ jsimd_ycc_rgb4_merged_upsample_e2k; ++ idx &= 15; ++ ++ e2kfct(cinfo->output_width, input_buf, in_row_group_ctr, ++ in_row_group_ctr * 2, output_buf, idx); ++ e2kfct(cinfo->output_width, input_buf, in_row_group_ctr, ++ in_row_group_ctr * 2 + 1, output_buf + 1, idx); ++} ++ ++GLOBAL(void) ++jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, ++ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf) ++{ ++ void (*e2kfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JDIMENSION, JSAMPARRAY, int); ++ int idx = color_space_idx(cinfo->out_color_space); ++ ++ e2kfct = idx < 16 ? jsimd_ycc_rgb3_merged_upsample_e2k : ++ jsimd_ycc_rgb4_merged_upsample_e2k; ++ idx &= 15; ++ ++ e2kfct(cinfo->output_width, input_buf, in_row_group_ctr, ++ in_row_group_ctr, output_buf, idx); ++} ++ ++GLOBAL(int) ++jsimd_can_convsamp(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (DCTSIZE != 8) ++ return 0; ++ if (BITS_IN_JSAMPLE != 8) ++ return 0; ++ if (sizeof(DCTELEM) != 2) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(int) ++jsimd_can_convsamp_float(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (DCTSIZE != 8) ++ return 0; ++ if (BITS_IN_JSAMPLE != 8) ++ return 0; ++ if (sizeof(FAST_FLOAT) != 4) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(void) ++jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col, ++ DCTELEM *workspace) ++{ ++ jsimd_convsamp_e2k(sample_data, start_col, workspace); ++} ++ ++GLOBAL(void) ++jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col, ++ FAST_FLOAT *workspace) ++{ ++ jsimd_convsamp_float_e2k(sample_data, start_col, workspace); ++} ++ ++GLOBAL(int) ++jsimd_can_fdct_islow(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (DCTSIZE != 8) ++ return 0; ++ if (sizeof(DCTELEM) != 2) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(int) ++jsimd_can_fdct_ifast(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (DCTSIZE != 8) ++ return 0; ++ if (sizeof(DCTELEM) != 2) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(int) ++jsimd_can_fdct_float(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (DCTSIZE != 8) ++ return 0; ++ if (sizeof(FAST_FLOAT) != 4) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(void) ++jsimd_fdct_islow(DCTELEM *data) ++{ ++ jsimd_fdct_islow_e2k(data); ++} ++ ++GLOBAL(void) ++jsimd_fdct_ifast(DCTELEM *data) ++{ ++ jsimd_fdct_ifast_e2k(data); ++} ++ ++GLOBAL(void) 
++jsimd_fdct_float(FAST_FLOAT *data) ++{ ++ jsimd_fdct_float_e2k(data); ++} ++ ++GLOBAL(int) ++jsimd_can_quantize(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (DCTSIZE != 8) ++ return 0; ++ if (sizeof(JCOEF) != 2) ++ return 0; ++ if (sizeof(DCTELEM) != 2) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(int) ++jsimd_can_quantize_float(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (DCTSIZE != 8) ++ return 0; ++ if (sizeof(JCOEF) != 2) ++ return 0; ++ if (sizeof(FAST_FLOAT) != 4) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(void) ++jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace) ++{ ++ jsimd_quantize_e2k(coef_block, divisors, workspace); ++} ++ ++GLOBAL(void) ++jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors, ++ FAST_FLOAT *workspace) ++{ ++ jsimd_quantize_float_e2k(coef_block, divisors, workspace); ++} ++ ++GLOBAL(int) ++jsimd_can_idct_2x2(void) ++{ ++ return 0; ++} ++ ++GLOBAL(int) ++jsimd_can_idct_4x4(void) ++{ ++ return 0; ++} ++ ++GLOBAL(void) ++jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr, ++ JCOEFPTR coef_block, JSAMPARRAY output_buf, ++ JDIMENSION output_col) ++{ ++} ++ ++GLOBAL(void) ++jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr, ++ JCOEFPTR coef_block, JSAMPARRAY output_buf, ++ JDIMENSION output_col) ++{ ++} ++ ++GLOBAL(int) ++jsimd_can_idct_islow(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (DCTSIZE != 8) ++ return 0; ++ if (sizeof(JCOEF) != 2) ++ return 0; ++ if (BITS_IN_JSAMPLE != 8) ++ return 0; ++ if (sizeof(ISLOW_MULT_TYPE) != 2) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(int) ++jsimd_can_idct_ifast(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (DCTSIZE != 8) ++ return 0; ++ if (sizeof(JCOEF) != 2) ++ return 0; ++ if (BITS_IN_JSAMPLE != 8) ++ return 0; ++ if (sizeof(IFAST_MULT_TYPE) != 2) ++ return 0; ++ if (IFAST_SCALE_BITS != 2) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(int) ++jsimd_can_idct_float(void) ++{ ++ init_simd(); ++ ++ /* The code is optimised for these values only */ ++ if (DCTSIZE != 8) ++ return 0; ++ if (sizeof(JCOEF) != 2) ++ return 0; ++ if (BITS_IN_JSAMPLE != 8) ++ return 0; ++ if (sizeof(FAST_FLOAT) != 4) ++ return 0; ++ if (sizeof(FLOAT_MULT_TYPE) != 4) ++ return 0; ++ ++ if (simd_support & JSIMD_SSE2) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(void) ++jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr, ++ JCOEFPTR coef_block, JSAMPARRAY output_buf, ++ JDIMENSION output_col) ++{ ++ jsimd_idct_islow_e2k(compptr->dct_table, coef_block, output_buf, ++ output_col); ++} ++ ++GLOBAL(void) ++jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr, ++ JCOEFPTR coef_block, JSAMPARRAY output_buf, ++ JDIMENSION output_col) ++{ ++ jsimd_idct_ifast_e2k(compptr->dct_table, coef_block, output_buf, ++ output_col); ++} ++ ++GLOBAL(void) ++jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr, ++ JCOEFPTR coef_block, JSAMPARRAY output_buf, ++ JDIMENSION output_col) ++{ ++ jsimd_idct_float_e2k(compptr->dct_table, coef_block, output_buf, ++ output_col); ++} ++ ++GLOBAL(int) ++jsimd_can_huff_encode_one_block(void) ++{ ++ init_simd(); ++ ++ if (DCTSIZE != 8) ++ return 0; ++ 
if (sizeof(JCOEF) != 2) ++ return 0; ++ ++ if ((simd_support & JSIMD_SSE2) && simd_huffman) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(JOCTET *) ++jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block, ++ int last_dc_val, c_derived_tbl *dctbl, ++ c_derived_tbl *actbl) ++{ ++ return jsimd_huff_encode_one_block_e2k(state, buffer, block, last_dc_val, ++ dctbl, actbl); ++} ++ ++GLOBAL(int) ++jsimd_can_encode_mcu_AC_first_prepare(void) ++{ ++ init_simd(); ++ ++ if (DCTSIZE != 8) ++ return 0; ++ if (sizeof(JCOEF) != 2) ++ return 0; ++ if ((simd_support & JSIMD_SSE2) && simd_huffman) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(void) ++jsimd_encode_mcu_AC_first_prepare(const JCOEF *block, ++ const int *jpeg_natural_order_start, int Sl, ++ int Al, UJCOEF *values, size_t *zerobits) ++{ ++ jsimd_encode_mcu_AC_first_prepare_e2k(block, jpeg_natural_order_start, ++ Sl, Al, (JCOEF*)values, zerobits); ++} ++ ++GLOBAL(int) ++jsimd_can_encode_mcu_AC_refine_prepare(void) ++{ ++ init_simd(); ++ ++ if (DCTSIZE != 8) ++ return 0; ++ if (sizeof(JCOEF) != 2) ++ return 0; ++ if ((simd_support & JSIMD_SSE2) && simd_huffman) ++ return 1; ++ ++ return 0; ++} ++ ++GLOBAL(int) ++jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block, ++ const int *jpeg_natural_order_start, int Sl, ++ int Al, UJCOEF *absvalues, size_t *bits) ++{ ++ return jsimd_encode_mcu_AC_refine_prepare_e2k(block, ++ jpeg_natural_order_start, ++ Sl, Al, (JCOEF*)absvalues, bits); ++} +diff --git a/simd/e2k/jsimd_api_e2k.h b/simd/e2k/jsimd_api_e2k.h +new file mode 100644 +index 0000000..d857203 +--- /dev/null ++++ b/simd/e2k/jsimd_api_e2k.h +@@ -0,0 +1,94 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. ++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. 
++ */ ++ ++/* Function declarations */ ++ ++#define CONVERT_DECL(n) \ ++EXTERN(void) jsimd_rgb##n##_ycc_convert_e2k(JDIMENSION img_width, \ ++ JSAMPARRAY input_buf, JSAMPIMAGE output_buf, \ ++ JDIMENSION output_row, int num_rows, int shuf_idx); \ ++EXTERN(void) jsimd_rgb##n##_gray_convert_e2k(JDIMENSION img_width, \ ++ JSAMPARRAY input_buf, JSAMPIMAGE output_buf, \ ++ JDIMENSION output_row, int num_rows, int shuf_idx); \ ++EXTERN(void) jsimd_ycc_rgb##n##_convert_e2k(JDIMENSION out_width, \ ++ JSAMPIMAGE input_buf, JDIMENSION input_row, \ ++ JSAMPARRAY output_buf, int num_rows, int shuf_idx); \ ++EXTERN(void) jsimd_ycc_rgb##n##_convert_e2k(JDIMENSION out_width, \ ++ JSAMPIMAGE input_buf, JDIMENSION input_row, JSAMPARRAY output_buf, \ ++ int num_rows, int shuf_idx); \ ++EXTERN(void) jsimd_ycc_rgb##n##_merged_upsample_e2k(JDIMENSION out_width, \ ++ JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, \ ++ JDIMENSION in_row_group_ctr_y, JSAMPARRAY output_buf, int shuf_idx); \ ++ ++CONVERT_DECL(3) ++CONVERT_DECL(4) ++ ++EXTERN(void) jsimd_h2v1_downsample_e2k ++ (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor, ++ JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data); ++EXTERN(void) jsimd_h2v2_downsample_e2k ++ (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor, ++ JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data); ++ ++#define UPSAMPLE_DECL(name) \ ++EXTERN(void) jsimd_##name##_upsample_e2k \ ++ (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data, \ ++ JSAMPARRAY *output_data_ptr); ++ ++UPSAMPLE_DECL(h2v1) ++UPSAMPLE_DECL(h2v2) ++UPSAMPLE_DECL(h2v1_fancy) ++UPSAMPLE_DECL(h2v2_fancy) ++ ++EXTERN(void) jsimd_convsamp_e2k ++ (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace); ++EXTERN(void) jsimd_convsamp_float_e2k ++ (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace); ++ ++EXTERN(void) jsimd_fdct_islow_e2k(DCTELEM *data); ++EXTERN(void) jsimd_fdct_ifast_e2k(DCTELEM *data); ++EXTERN(void) jsimd_fdct_float_e2k(FAST_FLOAT *data); ++EXTERN(void) jsimd_quantize_e2k ++ (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace); ++EXTERN(void) jsimd_quantize_float_e2k ++ (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace); ++EXTERN(void) jsimd_idct_islow_e2k ++ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, ++ JDIMENSION output_col); ++EXTERN(void) jsimd_idct_ifast_e2k ++ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, ++ JDIMENSION output_col); ++EXTERN(void) jsimd_idct_float_e2k ++ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf, ++ JDIMENSION output_col); ++ ++EXTERN(JOCTET *) jsimd_huff_encode_one_block_e2k ++ (void *state, JOCTET *buffer, JCOEFPTR block, ++ int last_dc_val, c_derived_tbl *dctbl, c_derived_tbl *actbl); ++ ++EXTERN(void) jsimd_encode_mcu_AC_first_prepare_e2k ++ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al, ++ JCOEF *values, size_t *zerobits); ++ ++EXTERN(int) jsimd_encode_mcu_AC_refine_prepare_e2k ++ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al, ++ JCOEF *absvalues, size_t *bits); +diff --git a/simd/e2k/jsimd_e2k.h b/simd/e2k/jsimd_e2k.h +new file mode 100644 +index 0000000..15d6262 +--- /dev/null ++++ b/simd/e2k/jsimd_e2k.h +@@ -0,0 +1,207 @@ ++/* ++ * Elbrus optimizations for libjpeg-turbo ++ * ++ * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved. 
++ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd
++ *
++ * This software is provided 'as-is', without any express or implied
++ * warranty. In no event will the authors be held liable for any damages
++ * arising from the use of this software.
++ *
++ * Permission is granted to anyone to use this software for any purpose,
++ * including commercial applications, and to alter it and redistribute it
++ * freely, subject to the following restrictions:
++ *
++ * 1. The origin of this software must not be misrepresented; you must not
++ * claim that you wrote the original software. If you use this software
++ * in a product, an acknowledgment in the product documentation would be
++ * appreciated but is not required.
++ * 2. Altered source versions must be plainly marked as such, and must not be
++ * misrepresented as being the original software.
++ * 3. This notice may not be removed or altered from any source distribution.
++ */
++
++#define JPEG_INTERNALS
++#include "../../jinclude.h"
++#include "../../jpeglib.h"
++#include "../../jsimd.h"
++#include "../../jdct.h"
++#include "../../jsimddct.h"
++#include "../jsimd.h"
++#include "jsimd_api_e2k.h"
++#include <stdint.h>
++#include <smmintrin.h> /* SSE4.1 */
++
++
++/* Common code */
++
++#define __4X2(a, b) a, b, a, b, a, b, a, b
++#define ALWAYS_INLINE __attribute__((__always_inline__)) inline
++
++#ifdef __e2k__
++#define PRAGMA_E2K _Pragma
++#define _mm_shuffle2_pi8(a, b, c) \
++ ((__m64)__builtin_e2k_pshufb((uint64_t)(b), (uint64_t)(a), (uint64_t)(c)))
++#define _mm_shuffle2_epi8(a, b, c) \
++ ((__m128i)__builtin_e2k_qppermb((__v2di)(b), (__v2di)(a), (__v2di)(c)))
++#define _mm_blendv_pi8(a, b, c) \
++ ((__m64)__builtin_e2k_pmerge((uint64_t)(a), (uint64_t)(b), (uint64_t)(c)))
++#else
++#define PRAGMA_E2K(x)
++#define _mm_shuffle2_pi8(a, b, c) \
++ _mm_movepi64_pi64(_mm_shuffle_epi8(_mm_unpacklo_epi64( \
++ _mm_movpi64_epi64(a), _mm_movpi64_epi64(b)), _mm_movpi64_epi64(c)))
++#define _mm_shuffle2_epi8(a, b, c) \
++ _mm_blendv_epi8(_mm_shuffle_epi8(a, c), _mm_shuffle_epi8(b, c), \
++ _mm_slli_epi16(c, 3))
++#define _mm_blendv_pi8(a, b, c) \
++ _mm_movepi64_pi64(_mm_blendv_epi8(_mm_movpi64_epi64(a), \
++ _mm_movpi64_epi64(b), _mm_movpi64_epi64(c)))
++
++#define BITREV_ROUND(c, i) a = (a & c) << i | (a >> i & c);
++static ALWAYS_INLINE uint64_t __builtin_e2k_bitrevd(uint64_t a) {
++ BITREV_ROUND(0x5555555555555555ll, 1)
++ BITREV_ROUND(0x3333333333333333ll, 2)
++ BITREV_ROUND(0x0F0F0F0F0F0F0F0Fll, 4)
++ BITREV_ROUND(0x00FF00FF00FF00FFll, 8)
++ BITREV_ROUND(0x0000FFFF0000FFFFll, 16)
++ return a >> 32 | a << 32;
++}
++
++static ALWAYS_INLINE uint64_t __builtin_e2k_insfd(uint64_t a, uint64_t b, uint64_t c) {
++ int n = b & 63;
++ a = a >> n | a << (64 - n);
++ return c ^ ((a ^ c) & (~0ll << (b >> 6 & 63)));
++}
++#endif
++
++#if defined(__iset__) && __iset__ >= 5
++static ALWAYS_INLINE __m128i _mm_packhi_epi32(__m128i a, __m128i b) {
++ __m128i index = _mm_setr_epi8(
++ 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31);
++ return _mm_shuffle2_epi8(a, b, index);
++}
++
++#define VEC_ISZERO(a) !_mm_cvtsi128_si64(_mm_packs_epi16(a, a))
++#else
++static ALWAYS_INLINE __m128i _mm_packhi_epi32(__m128i a, __m128i b) {
++ union { __m128i v; __m64 d[2]; } l = { a }, h = { b }, x;
++ __m64 index = _mm_setr_pi8(2, 3, 6, 7, 10, 11, 14, 15);
++ x.d[0] = _mm_shuffle2_pi8(l.d[0], l.d[1], index);
++ x.d[1] = _mm_shuffle2_pi8(h.d[0], h.d[1], index);
++ return x.v;
++}
++
++static ALWAYS_INLINE uint64_t vec_isnonzero(__m128i a) {
++ __v2di x = (__v2di)a;
++ return x[0] | 
x[1]; ++} ++ ++#define VEC_ISZERO(a) !vec_isnonzero(a) ++#endif ++ ++#define VEC_ALIGNR8(a, b) _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(b), _mm_castsi128_pd(a), 1)) ++ ++#define TRANSPOSE_FLOAT(a, b, c, d, e, f, g, h) \ ++ tmp0 = _mm_unpacklo_ps(a, b); \ ++ tmp1 = _mm_unpackhi_ps(a, b); \ ++ tmp2 = _mm_unpacklo_ps(c, d); \ ++ tmp3 = _mm_unpackhi_ps(c, d); \ ++ e = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); \ ++ f = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); \ ++ g = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3))); \ ++ h = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3))); ++ ++#define TRANSPOSE8(a, b) \ ++ b##0 = _mm_unpacklo_epi8(a##0, a##2); \ ++ b##1 = _mm_unpackhi_epi8(a##0, a##2); \ ++ b##2 = _mm_unpacklo_epi8(a##1, a##3); \ ++ b##3 = _mm_unpackhi_epi8(a##1, a##3); ++ ++#define TRANSPOSE16(a, b) \ ++ b##0 = _mm_unpacklo_epi16(a##0, a##2); \ ++ b##1 = _mm_unpackhi_epi16(a##0, a##2); \ ++ b##2 = _mm_unpacklo_epi16(a##1, a##3); \ ++ b##3 = _mm_unpackhi_epi16(a##1, a##3); \ ++ b##4 = _mm_unpacklo_epi16(a##4, a##6); \ ++ b##5 = _mm_unpackhi_epi16(a##4, a##6); \ ++ b##6 = _mm_unpacklo_epi16(a##5, a##7); \ ++ b##7 = _mm_unpackhi_epi16(a##5, a##7); ++ ++#define TRANSPOSE(a, b) \ ++ TRANSPOSE16(a, b) TRANSPOSE16(b, a) \ ++ b##0 = _mm_unpacklo_epi64(a##0, a##4); \ ++ b##1 = _mm_unpackhi_epi64(a##0, a##4); \ ++ b##2 = _mm_unpacklo_epi64(a##1, a##5); \ ++ b##3 = _mm_unpackhi_epi64(a##1, a##5); \ ++ b##4 = _mm_unpacklo_epi64(a##2, a##6); \ ++ b##5 = _mm_unpackhi_epi64(a##2, a##6); \ ++ b##6 = _mm_unpacklo_epi64(a##3, a##7); \ ++ b##7 = _mm_unpackhi_epi64(a##3, a##7); ++ ++#define IDCT_SAVE() { \ ++ __m128i pb_cj = _mm_set1_epi8((int8_t)CENTERJSAMPLE); \ ++ \ ++ row0 = _mm_xor_si128(_mm_packs_epi16(out0, out1), pb_cj); \ ++ row1 = _mm_xor_si128(_mm_packs_epi16(out2, out3), pb_cj); \ ++ row2 = _mm_xor_si128(_mm_packs_epi16(out4, out5), pb_cj); \ ++ row3 = _mm_xor_si128(_mm_packs_epi16(out6, out7), pb_cj); \ ++ \ ++ TRANSPOSE8(row, col) TRANSPOSE8(col, row) TRANSPOSE8(row, col) \ ++ \ ++ VEC_STL(output_buf[0] + output_col, col0); \ ++ VEC_STH(output_buf[1] + output_col, col0); \ ++ VEC_STL(output_buf[2] + output_col, col1); \ ++ VEC_STH(output_buf[3] + output_col, col1); \ ++ VEC_STL(output_buf[4] + output_col, col2); \ ++ VEC_STH(output_buf[5] + output_col, col2); \ ++ VEC_STL(output_buf[6] + output_col, col3); \ ++ VEC_STH(output_buf[7] + output_col, col3); \ ++} ++ ++#define IDCT_SPLAT8(col0) { \ ++ row3 = _mm_unpacklo_epi16(col0, col0); \ ++ row7 = _mm_unpackhi_epi16(col0, col0); \ ++ row1 = _mm_unpacklo_epi16(row3, row3); \ ++ row3 = _mm_unpackhi_epi16(row3, row3); \ ++ row5 = _mm_unpacklo_epi16(row7, row7); \ ++ row7 = _mm_unpackhi_epi16(row7, row7); \ ++ row0 = _mm_unpacklo_epi64(row1, row1); \ ++ row1 = _mm_unpackhi_epi64(row1, row1); \ ++ row2 = _mm_unpacklo_epi64(row3, row3); \ ++ row3 = _mm_unpackhi_epi64(row3, row3); \ ++ row4 = _mm_unpacklo_epi64(row5, row5); \ ++ row5 = _mm_unpackhi_epi64(row5, row5); \ ++ row6 = _mm_unpacklo_epi64(row7, row7); \ ++ row7 = _mm_unpackhi_epi64(row7, row7); \ ++} ++ ++#ifndef min ++#define min(a, b) ((a) < (b) ? 
(a) : (b)) ++#endif ++ ++#define VEC_LD(a) _mm_loadu_si128((const __m128i*)(a)) ++#define VEC_ST(a, b) _mm_storeu_si128((__m128i*)(a), b) ++#define VEC_LD8(a) _mm_loadl_epi64((const __m128i*)(a)) ++#define VEC_STL(a, b) _mm_storel_epi64((__m128i*)(a), b) ++#define VEC_STH(a, b) _mm_storeh_pd((double*)(a), _mm_castsi128_pd(b)); ++#define VEC_SPLAT(v, i) _mm_shuffle_epi8(v, _mm_set1_epi16((i) * 2 | ((i) * 2 + 1) << 8)) ++ ++#if !defined(__iset__) || __iset__ < 5 ++#define NEED_ALIGN8 ++#define ALIGN8_COMMON uint64_t src_shr; __m64 src_tmp0, src_tmp1; ++#define ALIGN8_VARS(src) __m64 *src##_ptr, src##_next, src##_index; ++#define ALIGN8_START(ptr, src) \ ++ src_shr = (intptr_t)(ptr - 1) & 7; \ ++ src##_ptr = (__m64*)((intptr_t)(ptr - 1) & -8); \ ++ src##_next = src##_ptr[src_shr == 7]; \ ++ src##_index = _mm_add_pi8(_mm_set1_pi8(src_shr), \ ++ _mm_setr_pi8(1, 2, 3, 4, 5, 6, 7, 8)); ++#define ALIGN8_READ16(v0, src, i) \ ++ src_tmp1 = src##_ptr[i * 2 + 1]; \ ++ src_tmp0 = _mm_shuffle2_pi8(src##_next, src_tmp1, src##_index); \ ++ src##_next = src##_ptr[i * 2 + 2]; \ ++ src_tmp1 = _mm_shuffle2_pi8(src_tmp1, src##_next, src##_index); \ ++ v0 = _mm_setr_epi64(src_tmp0, src_tmp1); ++#endif ++ +-- +2.45.0 + From a60a5fbd8c5a917d250db1cd70077235a38aa48b Mon Sep 17 00:00:00 2001 From: Mikhail Novosyolov Date: Sun, 28 Jul 2024 13:35:14 +0300 Subject: [PATCH 30/32] better url --- .abf.yml | 2 +- libjpeg-turbo.spec | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.abf.yml b/.abf.yml index 3b105b8..9ac1e2d 100644 --- a/.abf.yml +++ b/.abf.yml @@ -1,2 +1,2 @@ sources: - 3.0.3.tar.gz: 397a31222105129c9e798efce2459c445048546e + libjpeg-turbo-3.0.3.tar.gz: 397a31222105129c9e798efce2459c445048546e diff --git a/libjpeg-turbo.spec b/libjpeg-turbo.spec index ebcd6c1..57137d1 100644 --- a/libjpeg-turbo.spec +++ b/libjpeg-turbo.spec @@ -13,11 +13,11 @@ Summary: A MMX/SSE2 accelerated library for manipulating JPEG image files Name: libjpeg-turbo Epoch: 1 Version: 3.0.3 -Release: 1 +Release: 2 License: wxWindows Library License Group: Graphics Url: http://www.libjpeg-turbo.org -Source0: https://github.com/libjpeg-turbo/libjpeg-turbo/archive/refs/tags/%{version}.tar.gz +Source0: https://github.com/libjpeg-turbo/libjpeg-turbo/archive/refs/tags/%{version}.tar.gz?/%{name}-%{version}.tar.gz # These two allow automatic lossless rotation of JPEG images from a digital # camera which have orientation markings in the EXIF data. After rotation # the orientation markings are reset to avoid duplicate rotation when From 51d9a93eb1cc714986d39185d21da25c2948dca1 Mon Sep 17 00:00:00 2001 From: Mikhail Novosyolov Date: Sun, 28 Jul 2024 13:38:19 +0300 Subject: [PATCH 31/32] clean up metadata --- libjpeg-turbo.spec | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libjpeg-turbo.spec b/libjpeg-turbo.spec index 57137d1..4cc0c51 100644 --- a/libjpeg-turbo.spec +++ b/libjpeg-turbo.spec @@ -134,8 +134,8 @@ library. 
%package -n jpeg-progs
Summary: Programs for manipulating JPEG format image files
Group: Graphics
-%rename libjpeg-progs
-%rename jpeg6-progs
+Provides: libjpeg-progs = %{EVRD}
+Provides: jpeg6-progs = %{EVRD}

%description -n jpeg-progs
This package contains simple client programs for accessing the
From f22ee2c14cd35b0de1815195746ed7dc9c7cd917 Mon Sep 17 00:00:00 2001
From: Mikhail Novosyolov
Date: Sun, 28 Jul 2024 13:44:03 +0300
Subject: [PATCH 32/32] compat with Fedora/RH

---
libjpeg-turbo.spec | 12 +++++++++++-
1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/libjpeg-turbo.spec b/libjpeg-turbo.spec
index 4cc0c51..a549330 100644
--- a/libjpeg-turbo.spec
+++ b/libjpeg-turbo.spec
@@ -32,13 +32,23 @@ BuildRequires: libtool >= 1.4
BuildRequires: nasm
%endif

+# meta package
+Recommends: %{libname} = %{EVRD}
+Recommends: %{libturbo} = %{EVRD}
+Recommends: %{libname62} = %{EVRD}
+
%description
-This package contains a library of functions for manipulating JPEG images.
+This meta package pulls in the packages that provide libraries of functions for manipulating JPEG images.
It is a high-speed, libjpeg-compatible version for x86 and x86-64 processors
which uses SIMD instructions (MMX, SSE2, etc.) to accelerate baseline JPEG
compression and decompression. It is generally 2-4x as fast as the unmodified
version of libjpeg, all else being equal.

+%files
+# empty, meta package
+# exists for compatibility with Fedora/RH
+# cnrdrvcups-ufr2-uk (Canon driver) depends on it
+
#----------------------------------------------------------------------------

%package -n %{libname}