libjpeg-turbo/libjpeg-turbo-3.0.2-e2k.patch

From f922bcd2542146dfce9bf3a5e49892c69fe1b0a4 Mon Sep 17 00:00:00 2001
From: Alexander Stefanov <alexander@mezon.ru>
Date: Sat, 18 May 2024 13:04:48 +0000
Subject: [PATCH] add e2k port
---
CMakeLists.txt | 5 +
simd/CMakeLists.txt | 23 ++
simd/e2k/jccolext-e2k.c | 213 +++++++++++
simd/e2k/jccolor-e2k.c | 163 +++++++++
simd/e2k/jchuff-e2k.c | 307 ++++++++++++++++
simd/e2k/jcphuff-e2k.c | 145 ++++++++
simd/e2k/jcsample-e2k.c | 203 +++++++++++
simd/e2k/jcsample.h | 28 ++
simd/e2k/jdcolext-e2k.c | 258 +++++++++++++
simd/e2k/jdcolor-e2k.c | 289 +++++++++++++++
simd/e2k/jdcoltab-e2k.c | 80 ++++
simd/e2k/jdsample-e2k.c | 389 ++++++++++++++++++++
simd/e2k/jfdctflt-e2k.c | 127 +++++++
simd/e2k/jfdctfst-e2k.c | 145 ++++++++
simd/e2k/jfdctint-e2k.c | 255 +++++++++++++
simd/e2k/jidctflt-e2k.c | 215 +++++++++++
simd/e2k/jidctfst-e2k.c | 187 ++++++++++
simd/e2k/jidctint-e2k.c | 294 +++++++++++++++
simd/e2k/jquantf-e2k.c | 121 +++++++
simd/e2k/jquanti-e2k.c | 178 +++++++++
simd/e2k/jsimd.c | 761 +++++++++++++++++++++++++++++++++++++++
simd/e2k/jsimd_api_e2k.h | 94 +++++
simd/e2k/jsimd_e2k.h | 207 +++++++++++
23 files changed, 4687 insertions(+)
create mode 100644 simd/e2k/jccolext-e2k.c
create mode 100644 simd/e2k/jccolor-e2k.c
create mode 100644 simd/e2k/jchuff-e2k.c
create mode 100644 simd/e2k/jcphuff-e2k.c
create mode 100644 simd/e2k/jcsample-e2k.c
create mode 100644 simd/e2k/jcsample.h
create mode 100644 simd/e2k/jdcolext-e2k.c
create mode 100644 simd/e2k/jdcolor-e2k.c
create mode 100644 simd/e2k/jdcoltab-e2k.c
create mode 100644 simd/e2k/jdsample-e2k.c
create mode 100644 simd/e2k/jfdctflt-e2k.c
create mode 100644 simd/e2k/jfdctfst-e2k.c
create mode 100644 simd/e2k/jfdctint-e2k.c
create mode 100644 simd/e2k/jidctflt-e2k.c
create mode 100644 simd/e2k/jidctfst-e2k.c
create mode 100644 simd/e2k/jidctint-e2k.c
create mode 100644 simd/e2k/jquantf-e2k.c
create mode 100644 simd/e2k/jquanti-e2k.c
create mode 100644 simd/e2k/jsimd.c
create mode 100644 simd/e2k/jsimd_api_e2k.h
create mode 100644 simd/e2k/jsimd_e2k.h
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ff9c9c2..fa0364c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -120,6 +120,9 @@ elseif(CMAKE_SYSTEM_PROCESSOR_LC STREQUAL "aarch64" OR
elseif(CMAKE_SYSTEM_PROCESSOR_LC MATCHES "^ppc" OR
CMAKE_SYSTEM_PROCESSOR_LC MATCHES "^powerpc")
set(CPU_TYPE powerpc)
+elseif(CMAKE_SYSTEM_PROCESSOR_LC STREQUAL "e2k" OR
+ CMAKE_SYSTEM_PROCESSOR_LC STREQUAL "elbrus")
+ set(CPU_TYPE e2k)
else()
set(CPU_TYPE ${CMAKE_SYSTEM_PROCESSOR_LC})
endif()
@@ -926,6 +929,8 @@ if(CPU_TYPE STREQUAL "x86_64" OR CPU_TYPE STREQUAL "i386")
elseif(CPU_TYPE STREQUAL "x86_64")
set(DEFAULT_FLOATTEST8 no-fp-contract)
endif()
+elseif(WITH_SIMD AND CPU_TYPE STREQUAL "e2k")
+ set(DEFAULT_FLOATTEST8 sse)
elseif(CPU_TYPE STREQUAL "powerpc" OR CPU_TYPE STREQUAL "arm64")
if(CMAKE_C_COMPILER_ID MATCHES "Clang")
if(CMAKE_C_COMPILER_VERSION VERSION_EQUAL 14.0.0 OR
diff --git a/simd/CMakeLists.txt b/simd/CMakeLists.txt
index 0237955..e61baea 100644
--- a/simd/CMakeLists.txt
+++ b/simd/CMakeLists.txt
@@ -543,6 +543,29 @@ if(CMAKE_POSITION_INDEPENDENT_CODE OR ENABLE_SHARED)
set_target_properties(simd PROPERTIES POSITION_INDEPENDENT_CODE 1)
endif()
+###############################################################################
+# Elbrus (Intrinsics)
+###############################################################################
+
+elseif(CPU_TYPE STREQUAL "e2k")
+
+set(SIMD_SOURCES e2k/jquanti-e2k.c e2k/jquantf-e2k.c
+ e2k/jccolor-e2k.c e2k/jcsample-e2k.c
+ e2k/jdcolor-e2k.c e2k/jdsample-e2k.c
+ e2k/jfdctint-e2k.c e2k/jfdctfst-e2k.c e2k/jfdctflt-e2k.c
+ e2k/jidctint-e2k.c e2k/jidctfst-e2k.c e2k/jidctflt-e2k.c
+ e2k/jchuff-e2k.c e2k/jcphuff-e2k.c)
+
+set_source_files_properties(${SIMD_SOURCES} PROPERTIES
+ COMPILE_FLAGS -msse4.1)
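+# The e2k compiler ships x86 intrinsic compatibility headers, so the SSE
+# intrinsics used by these sources compile natively; -msse4.1 enables the
+# intrinsic subset (up to SSE4.1) that they rely on.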
+
+set(SIMD_SOURCES ${SIMD_SOURCES} e2k/jsimd.c)
+
+add_library(simd OBJECT ${SIMD_SOURCES})
+
+if(CMAKE_POSITION_INDEPENDENT_CODE OR ENABLE_SHARED)
+ set_target_properties(simd PROPERTIES POSITION_INDEPENDENT_CODE 1)
+endif()
###############################################################################
# None
diff --git a/simd/e2k/jccolext-e2k.c b/simd/e2k/jccolext-e2k.c
new file mode 100644
index 0000000..49abdb4
--- /dev/null
+++ b/simd/e2k/jccolext-e2k.c
@@ -0,0 +1,213 @@
+/*
+ * Elbrus optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2014, Jay Foad. All Rights Reserved.
+ * Copyright (C) 2021, Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jccolor-e2k.c */
+
+void rgbn_ycc_convert(JDIMENSION img_width, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows, int shuf_idx)
+{
+ JSAMPROW inptr, outptr0, outptr1, outptr2;
+ unsigned char __attribute__((aligned(16))) tmpbuf[PIXELSIZE * 16];
+
+ __m128i pb_zero = _mm_setzero_si128();
+ __m128i pb_shuf0 = VEC_LD(rgb_ycc_shuf_const[shuf_idx]);
+#if PIXELSIZE == 4
+ __m128i rgb3 = pb_zero;
+#else
+ __m128i pb_shuf4 = VEC_LD(rgb_ycc_shuf_const[shuf_idx] + 16);
+#endif
+ __m128i rgb0, rgb1 = pb_zero, rgb2 = pb_zero,
+ rgbg0, rgbg1, rgbg2, rgbg3, rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3;
+ __m128i y, yl, yh, y0, y1, y2, y3;
+ __m128i cb, cr, crl, crh, cbl, cbh;
+ __m128i cr0, cr1, cr2, cr3, cb0, cb1, cb2, cb3;
+
+ /* Constants */
+ __m128i pw_f0299_f0337 = _mm_setr_epi16(__4X2(F_0_299, F_0_337)),
+ pw_f0114_f0250 = _mm_setr_epi16(__4X2(F_0_114, F_0_250)),
+ pw_mf016_mf033 = _mm_setr_epi16(__4X2(-F_0_168, -F_0_331)),
+ pw_mf008_mf041 = _mm_setr_epi16(__4X2(-F_0_081, -F_0_418)),
+ pw_mf050_f000 = _mm_setr_epi16(__4X2(-F_0_500, 0)),
+ pd_onehalf = _mm_set1_epi32(ONE_HALF),
+ pd_onehalfm1_cj = _mm_set1_epi32(ONE_HALF - 1 + (CENTERJSAMPLE << SCALEBITS));
+#ifdef NEED_ALIGN8
+ ALIGN8_COMMON
+ ALIGN8_VARS(src)
+#endif
+
+ if (img_width > 0)
+ while (--num_rows >= 0) {
+ int num_cols;
+ inptr = *input_buf++;
+ outptr0 = output_buf[0][output_row];
+ outptr1 = output_buf[1][output_row];
+ outptr2 = output_buf[2][output_row];
+ output_row++;
+
+ if (img_width >= 16) {
+#ifdef NEED_ALIGN8
+ ALIGN8_START(inptr, src)
+ inptr += (img_width & -16) * PIXELSIZE;
+#endif
+
+ PRAGMA_E2K("ivdep")
+ for (num_cols = img_width; num_cols >= 16; num_cols -= 16,
+ outptr0 += 16, outptr1 += 16, outptr2 += 16) {
+#ifdef NEED_ALIGN8
+ ALIGN8_READ16(rgb0, src, 0)
+ ALIGN8_READ16(rgb1, src, 1)
+ ALIGN8_READ16(rgb2, src, 2)
+#if PIXELSIZE == 4
+ ALIGN8_READ16(rgb3, src, 3)
+#endif
+ src_ptr += PIXELSIZE * 2;
+#else
+ rgb0 = VEC_LD(inptr);
+ rgb1 = VEC_LD(inptr + 16);
+ rgb2 = VEC_LD(inptr + 32);
+#if PIXELSIZE == 4
+ rgb3 = VEC_LD(inptr + 48);
+#endif
+ inptr += PIXELSIZE * 16;
+#endif
+ RGB_SHUFFLE
+ CALC_Y(outptr0)
+ CALC_CC(outptr1, outptr2)
+ }
+ }
+
+ num_cols = img_width & 15;
+ if (num_cols) {
+ int i;
+ memcpy(tmpbuf, inptr, num_cols * PIXELSIZE);
+ rgb0 = VEC_LD(tmpbuf);
+ rgb1 = VEC_LD(tmpbuf + 16);
+ rgb2 = VEC_LD(tmpbuf + 32);
+#if PIXELSIZE == 4
+ rgb3 = VEC_LD(tmpbuf + 48);
+#endif
+ RGB_SHUFFLE
+ CALC_Y(tmpbuf)
+ CALC_CC(tmpbuf + 16, tmpbuf + 32)
+
+ for (i = 0; i < num_cols; i++) {
+ outptr0[i] = tmpbuf[i];
+ outptr1[i] = tmpbuf[i + 16];
+ outptr2[i] = tmpbuf[i + 32];
+ }
+ }
+ }
+}
+
+void rgbn_gray_convert(JDIMENSION img_width, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows, int shuf_idx)
+{
+ JSAMPROW inptr, outptr;
+ uint8_t __attribute__((aligned(16))) tmpbuf[PIXELSIZE * 16];
+
+ __m128i pb_zero = _mm_setzero_si128();
+ __m128i pb_shuf0 = VEC_LD(rgb_ycc_shuf_const[shuf_idx]);
+#if PIXELSIZE == 4
+ __m128i rgb3 = pb_zero;
+#else
+ __m128i pb_shuf4 = VEC_LD(rgb_ycc_shuf_const[shuf_idx] + 16);
+#endif
+ __m128i rgb0, rgb1 = pb_zero, rgb2 = pb_zero,
+ rgbg0, rgbg1, rgbg2, rgbg3, rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3;
+ __m128i y, yl, yh, y0, y1, y2, y3;
+
+ /* Constants */
+ __m128i pw_f0299_f0337 = _mm_setr_epi16(__4X2(F_0_299, F_0_337)),
+ pw_f0114_f0250 = _mm_setr_epi16(__4X2(F_0_114, F_0_250)),
+ pd_onehalf = _mm_set1_epi32(ONE_HALF);
+#ifdef NEED_ALIGN8
+ ALIGN8_COMMON
+ ALIGN8_VARS(src)
+#endif
+
+ if (img_width > 0)
+ while (--num_rows >= 0) {
+ int num_cols;
+ inptr = *input_buf++;
+ outptr = output_buf[0][output_row];
+ output_row++;
+
+ if (img_width >= 16) {
+#ifdef NEED_ALIGN8
+ ALIGN8_START(inptr, src)
+ inptr += (img_width & -16) * PIXELSIZE;
+#endif
+
+ PRAGMA_E2K("ivdep")
+ for (num_cols = img_width; num_cols >= 16; num_cols -= 16,
+ outptr += 16) {
+#ifdef NEED_ALIGN8
+ ALIGN8_READ16(rgb0, src, 0)
+ ALIGN8_READ16(rgb1, src, 1)
+ ALIGN8_READ16(rgb2, src, 2)
+#if PIXELSIZE == 4
+ ALIGN8_READ16(rgb3, src, 3)
+#endif
+ src_ptr += PIXELSIZE * 2;
+#else
+ rgb0 = VEC_LD(inptr);
+ rgb1 = VEC_LD(inptr + 16);
+ rgb2 = VEC_LD(inptr + 32);
+#if PIXELSIZE == 4
+ rgb3 = VEC_LD(inptr + 48);
+#endif
+ inptr += PIXELSIZE * 16;
+#endif
+ RGB_SHUFFLE
+ CALC_Y(outptr)
+ }
+ }
+
+ num_cols = img_width & 15;
+ if (num_cols) {
+ int i;
+ memcpy(tmpbuf, inptr, num_cols * PIXELSIZE);
+ rgb0 = VEC_LD(tmpbuf);
+ rgb1 = VEC_LD(tmpbuf + 16);
+ rgb2 = VEC_LD(tmpbuf + 32);
+#if PIXELSIZE == 4
+ rgb3 = VEC_LD(tmpbuf + 48);
+#endif
+ RGB_SHUFFLE
+ CALC_Y(tmpbuf)
+
+ for (i = 0; i < num_cols; i++) {
+ outptr[i] = tmpbuf[i];
+ }
+ }
+ }
+}
+
+#undef RGB_SHUFFLE
+#undef PIXELSIZE
+#undef rgbn_ycc_convert
+#undef rgbn_gray_convert
+
diff --git a/simd/e2k/jccolor-e2k.c b/simd/e2k/jccolor-e2k.c
new file mode 100644
index 0000000..0af2626
--- /dev/null
+++ b/simd/e2k/jccolor-e2k.c
@@ -0,0 +1,163 @@
+/*
+ * Elbrus optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2021, Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* RGB --> YCC CONVERSION */
+
+#include "jsimd_e2k.h"
+
+#define F_0_081 5329 /* FIX(0.08131) */
+#define F_0_114 7471 /* FIX(0.11400) */
+#define F_0_168 11059 /* FIX(0.16874) */
+#define F_0_250 16384 /* FIX(0.25000) */
+#define F_0_299 19595 /* FIX(0.29900) */
+#define F_0_331 21709 /* FIX(0.33126) */
+#define F_0_418 27439 /* FIX(0.41869) */
+#define F_0_500 32768 /* FIX(0.50000) */
+#define F_0_587 38470 /* FIX(0.58700) */
+#define F_0_337 (F_0_587 - F_0_250) /* FIX(0.58700) - FIX(0.25000) */
+#define F_0_413 (65536 - F_0_587) /* FIX(1.00000) - FIX(0.58700) */
+
+#define SCALEBITS 16
+#define ONE_HALF (1 << (SCALEBITS - 1))
+
+#define RGBG_INDEX_(name, color, i, x) \
+ name##_##color + i * name##_PIXELSIZE + x, \
+ name##_GREEN + i * name##_PIXELSIZE + x
+#define RGBG_INDEX(name, x) \
+ RGBG_INDEX_(name, RED, 0, x), RGBG_INDEX_(name, RED, 1, x), \
+ RGBG_INDEX_(name, RED, 2, x), RGBG_INDEX_(name, RED, 3, x), \
+ RGBG_INDEX_(name, BLUE, 0, x), RGBG_INDEX_(name, BLUE, 1, x), \
+ RGBG_INDEX_(name, BLUE, 2, x), RGBG_INDEX_(name, BLUE, 3, x)
+
+static const uint8_t __attribute__((aligned(16)))
+rgb_ycc_shuf_const[7][32] = {
+ { RGBG_INDEX(RGB, 0), RGBG_INDEX(RGB, 4) },
+ { RGBG_INDEX(EXT_RGB, 0), RGBG_INDEX(EXT_RGB, 4) },
+ { RGBG_INDEX(EXT_RGBX, 0), RGBG_INDEX(EXT_RGBX, 4) },
+ { RGBG_INDEX(EXT_BGR, 0), RGBG_INDEX(EXT_BGR, 4) },
+ { RGBG_INDEX(EXT_BGRX, 0), RGBG_INDEX(EXT_BGRX, 4) },
+ { RGBG_INDEX(EXT_XBGR, 0), RGBG_INDEX(EXT_XBGR, 4) },
+ { RGBG_INDEX(EXT_XRGB, 0), RGBG_INDEX(EXT_XRGB, 4) }
+};
+
+ /* rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3
+ * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7
+ * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb
+ * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf
+ *
+ * rg0 = R0 G0 R1 G1 R2 G2 R3 G3
+ * bg0 = B0 G0 B1 G1 B2 G2 B3 G3
+ * ...
+ */
+
+ /* (Original)
+ * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+ * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+ *
+ * (This implementation)
+ * Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+ * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+ * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+ */
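+
+ /* A scalar sketch of one Y sample as computed below, for reference:
+  *   Y = (19595*R + 22086*G + 7471*B + 16384*G + 32768) >> 16
+  * i.e. F_0_299*R + F_0_337*G + F_0_114*B + F_0_250*G plus the ONE_HALF
+  * rounding term.  _mm_madd_epi16() evaluates the (R,G) and (B,G) pairs
+  * against pw_f0299_f0337 and pw_f0114_f0250, and the port's
+  * _mm_packhi_epi32() helper keeps the high 16 bits of each 32-bit sum.
+  */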
+
+#define CALC_Y(outptr0) \
+ rg0 = _mm_unpacklo_epi8(rgbg0, pb_zero); \
+ bg0 = _mm_unpackhi_epi8(rgbg0, pb_zero); \
+ rg1 = _mm_unpacklo_epi8(rgbg1, pb_zero); \
+ bg1 = _mm_unpackhi_epi8(rgbg1, pb_zero); \
+ rg2 = _mm_unpacklo_epi8(rgbg2, pb_zero); \
+ bg2 = _mm_unpackhi_epi8(rgbg2, pb_zero); \
+ rg3 = _mm_unpacklo_epi8(rgbg3, pb_zero); \
+ bg3 = _mm_unpackhi_epi8(rgbg3, pb_zero); \
+ \
+ /* Calculate Y values */ \
+ y0 = _mm_add_epi32(_mm_madd_epi16(rg0, pw_f0299_f0337), pd_onehalf); \
+ y1 = _mm_add_epi32(_mm_madd_epi16(rg1, pw_f0299_f0337), pd_onehalf); \
+ y2 = _mm_add_epi32(_mm_madd_epi16(rg2, pw_f0299_f0337), pd_onehalf); \
+ y3 = _mm_add_epi32(_mm_madd_epi16(rg3, pw_f0299_f0337), pd_onehalf); \
+ y0 = _mm_add_epi32(_mm_madd_epi16(bg0, pw_f0114_f0250), y0); \
+ y1 = _mm_add_epi32(_mm_madd_epi16(bg1, pw_f0114_f0250), y1); \
+ y2 = _mm_add_epi32(_mm_madd_epi16(bg2, pw_f0114_f0250), y2); \
+ y3 = _mm_add_epi32(_mm_madd_epi16(bg3, pw_f0114_f0250), y3); \
+ \
+ yl = _mm_packhi_epi32(y0, y1); \
+ yh = _mm_packhi_epi32(y2, y3); \
+ y = _mm_packus_epi16(yl, yh); \
+ VEC_ST(outptr0, y);
+
+#define CALC_CC(outptr1, outptr2) \
+ /* Calculate Cb values */ \
+ cb0 = _mm_add_epi32(_mm_madd_epi16(rg0, pw_mf016_mf033), pd_onehalfm1_cj); \
+ cb1 = _mm_add_epi32(_mm_madd_epi16(rg1, pw_mf016_mf033), pd_onehalfm1_cj); \
+ cb2 = _mm_add_epi32(_mm_madd_epi16(rg2, pw_mf016_mf033), pd_onehalfm1_cj); \
+ cb3 = _mm_add_epi32(_mm_madd_epi16(rg3, pw_mf016_mf033), pd_onehalfm1_cj); \
+ cb0 = _mm_sub_epi32(cb0, _mm_madd_epi16(bg0, pw_mf050_f000)); \
+ cb1 = _mm_sub_epi32(cb1, _mm_madd_epi16(bg1, pw_mf050_f000)); \
+ cb2 = _mm_sub_epi32(cb2, _mm_madd_epi16(bg2, pw_mf050_f000)); \
+ cb3 = _mm_sub_epi32(cb3, _mm_madd_epi16(bg3, pw_mf050_f000)); \
+ \
+ cbl = _mm_packhi_epi32(cb0, cb1); \
+ cbh = _mm_packhi_epi32(cb2, cb3); \
+ cb = _mm_packus_epi16(cbl, cbh); \
+ VEC_ST(outptr1, cb); \
+ \
+ /* Calculate Cr values */ \
+ cr0 = _mm_add_epi32(_mm_madd_epi16(bg0, pw_mf008_mf041), pd_onehalfm1_cj); \
+ cr1 = _mm_add_epi32(_mm_madd_epi16(bg1, pw_mf008_mf041), pd_onehalfm1_cj); \
+ cr2 = _mm_add_epi32(_mm_madd_epi16(bg2, pw_mf008_mf041), pd_onehalfm1_cj); \
+ cr3 = _mm_add_epi32(_mm_madd_epi16(bg3, pw_mf008_mf041), pd_onehalfm1_cj); \
+ cr0 = _mm_sub_epi32(cr0, _mm_madd_epi16(rg0, pw_mf050_f000)); \
+ cr1 = _mm_sub_epi32(cr1, _mm_madd_epi16(rg1, pw_mf050_f000)); \
+ cr2 = _mm_sub_epi32(cr2, _mm_madd_epi16(rg2, pw_mf050_f000)); \
+ cr3 = _mm_sub_epi32(cr3, _mm_madd_epi16(rg3, pw_mf050_f000)); \
+ \
+ crl = _mm_packhi_epi32(cr0, cr1); \
+ crh = _mm_packhi_epi32(cr2, cr3); \
+ cr = _mm_packus_epi16(crl, crh); \
+ VEC_ST(outptr2, cr);
+
+
+#define PIXELSIZE 3
+#define RGB_SHUFFLE \
+ rgbg0 = _mm_shuffle_epi8(rgb0, pb_shuf0); \
+ rgbg1 = _mm_shuffle_epi8(VEC_ALIGNR8(rgb1, rgb0), pb_shuf4); \
+ rgbg2 = _mm_shuffle_epi8(VEC_ALIGNR8(rgb2, rgb1), pb_shuf0); \
+ rgbg3 = _mm_shuffle_epi8(rgb2, pb_shuf4);
+
+#define rgbn_ycc_convert jsimd_rgb3_ycc_convert_e2k
+#define rgbn_gray_convert jsimd_rgb3_gray_convert_e2k
+#include "jccolext-e2k.c"
+
+
+#define PIXELSIZE 4
+#define RGB_SHUFFLE \
+ rgbg0 = _mm_shuffle_epi8(rgb0, pb_shuf0); \
+ rgbg1 = _mm_shuffle_epi8(rgb1, pb_shuf0); \
+ rgbg2 = _mm_shuffle_epi8(rgb2, pb_shuf0); \
+ rgbg3 = _mm_shuffle_epi8(rgb3, pb_shuf0);
+
+#define rgbn_ycc_convert jsimd_rgb4_ycc_convert_e2k
+#define rgbn_gray_convert jsimd_rgb4_gray_convert_e2k
+#include "jccolext-e2k.c"
+
diff --git a/simd/e2k/jchuff-e2k.c b/simd/e2k/jchuff-e2k.c
new file mode 100644
index 0000000..ec4329e
--- /dev/null
+++ b/simd/e2k/jchuff-e2k.c
@@ -0,0 +1,307 @@
+/*
+ * Elbrus optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2022, Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ *
+ * NOTE: All referenced figures are from
+ * Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994.
+ */
+
+/* Encode a single block's worth of coefficients */
+
+#include "jsimd_e2k.h"
+
+#if __SIZEOF_SIZE_T__ != 8
+#error
+#endif
+
+typedef unsigned long long bit_buf_type;
+#define BIT_BUF_SIZE 64
+
+typedef struct {
+ bit_buf_type put_buffer; /* current bit accumulation buffer */
+ int free_bits; /* # of bits available in it */
+ int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */
+} savable_state;
+
+typedef struct {
+ JOCTET *next_output_byte; /* => next byte to write in buffer */
+ size_t free_in_buffer; /* # of byte spaces remaining in buffer */
+ savable_state cur; /* Current bit buffer & DC state */
+ j_compress_ptr cinfo; /* dump_buffer needs access to this */
+ int simd;
+} working_state;
+
+#define EMIT_BYTE(b) { \
+ buffer[0] = (JOCTET)(b); \
+ buffer[1] = 0; \
+ buffer -= -2 + ((JOCTET)(b) < 0xFF); \
+}
+
+#define FLUSH() { \
+ if (put_buffer & 0x8080808080808080 & ~(put_buffer + 0x0101010101010101)) { \
+ EMIT_BYTE(put_buffer >> 56) \
+ EMIT_BYTE(put_buffer >> 48) \
+ EMIT_BYTE(put_buffer >> 40) \
+ EMIT_BYTE(put_buffer >> 32) \
+ EMIT_BYTE(put_buffer >> 24) \
+ EMIT_BYTE(put_buffer >> 16) \
+ EMIT_BYTE(put_buffer >> 8) \
+ EMIT_BYTE(put_buffer ) \
+ } else { \
+ *(uint64_t*)buffer = __builtin_bswap64(put_buffer); \
+ buffer += 8; \
+ } \
+}
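+
+/*
+ * The test in FLUSH() is a branch-light "contains a 0xFF byte" check: for a
+ * byte b, (b & 0x80 & ~(b + 1)) is nonzero only when b == 0xFF.  A carry from
+ * a lower byte can at worst cause a false positive, which merely takes the
+ * slower byte-at-a-time stuffing path, so the output stays correct.
+ */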
+
+#define PUT_AND_FLUSH(code, size) { \
+ put_buffer = (put_buffer << (size + free_bits)) | (code >> -free_bits); \
+ FLUSH() \
+ free_bits += BIT_BUF_SIZE; \
+ put_buffer = code; \
+}
+
+#define PUT_BITS(code, size) { \
+ free_bits -= size; \
+ if (free_bits < 0) \
+ PUT_AND_FLUSH(code, size) \
+ else \
+ put_buffer = (put_buffer << size) | code; \
+}
+
+#define PUT_CODE(code, size) { \
+ /* temp &= (((JLONG)1) << nbits) - 1; */ \
+ /* temp |= code << nbits; */ \
+ temp = __builtin_e2k_insfd(code, __builtin_e2k_insfd(nbits, 6 * 63 + 64, -nbits), temp); \
+ nbits += size; \
+ PUT_BITS(temp, nbits) \
+}
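+
+/*
+ * free_bits counts the unused bits left in put_buffer.  PUT_BITS() appends
+ * `size` bits; once free_bits goes negative, PUT_AND_FLUSH() fills the
+ * remaining space, emits all 64 buffered bits through FLUSH() and starts a
+ * new buffer holding the bits that did not fit.  PUT_CODE() first places the
+ * Huffman code above the `nbits` low-order value bits already in temp (the
+ * plain-C expressions in its comment) and then emits both at once.
+ */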
+
+#define KLOOP_PREPARE(mask, i) \
+ t0 = _mm_cmpeq_epi8(_mm_packs_epi16(v0, v1), zero); \
+ t1 = _mm_cmpeq_epi8(_mm_packs_epi16(v2, v3), zero); \
+ mask = (uint32_t)(_mm_movemask_epi8(t0) | _mm_movemask_epi8(t1) << 16); \
+ t0 = _mm_add_epi16(_mm_srai_epi16(v0, 15), v0); \
+ t1 = _mm_add_epi16(_mm_srai_epi16(v1, 15), v1); \
+ t2 = _mm_add_epi16(_mm_srai_epi16(v2, 15), v2); \
+ t3 = _mm_add_epi16(_mm_srai_epi16(v3, 15), v3); \
+ v0 = _mm_abs_epi16(v0); \
+ v1 = _mm_abs_epi16(v1); \
+ v2 = _mm_abs_epi16(v2); \
+ v3 = _mm_abs_epi16(v3); \
+ VEC_ST(block_nbits + i, v0); \
+ VEC_ST(block_nbits + i + 8, v1); \
+ VEC_ST(block_nbits + i + 16, v2); \
+ VEC_ST(block_nbits + i + 24, v3); \
+ VEC_ST(block_diff + i, t0); \
+ VEC_ST(block_diff + i + 8, t1); \
+ VEC_ST(block_diff + i + 16, t2); \
+ VEC_ST(block_diff + i + 24, t3);
+
+#define SHUF16X4(a, b, c, d) _mm_setr_pi8( \
+ a * 2, a * 2 + 1, b * 2, b * 2 + 1, c * 2, c * 2 + 1, d * 2, d * 2 + 1)
+#define VEC_COMBINE(h0, h1) _mm_unpacklo_epi64( \
+ _mm_movpi64_epi64(h0), _mm_movpi64_epi64(h1))
+#define INSFI_M64(a, b, c, d) _mm_cvtsi64_m64(__builtin_e2k_insfd( \
+ _mm_cvtm64_si64(a), (b & 63) | (d & 63) << 6, _mm_cvtm64_si64(c)))
+
+GLOBAL(JOCTET *)
+jsimd_huff_encode_one_block_e2k(void *state, JOCTET *buffer,
+ JCOEFPTR block, int last_dc_val,
+ c_derived_tbl *dctbl, c_derived_tbl *actbl) {
+ uint64_t temp, nbits;
+ uint64_t i, r, code, size;
+ uint64_t code_0xf0 = actbl->ehufco[0xf0];
+ uint64_t size_0xf0 = actbl->ehufsi[0xf0];
+
+ working_state *state_ptr = (working_state*)state;
+ bit_buf_type put_buffer = state_ptr->cur.put_buffer;
+ int64_t free_bits = state_ptr->cur.free_bits;
+
+ __m128i zero = _mm_setzero_si128();
+ __m128i v0, v1, v2, v3, t0, t1, t2, t3;
+ int64_t mask, mask1;
+ uint16_t __attribute__((aligned(16))) block_nbits[DCTSIZE2];
+ int16_t __attribute__((aligned(16))) block_diff[DCTSIZE2];
+
+#if 1 /* faster this way */
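+  /* Gather the 8x8 block into zigzag order with 64-bit shuffles and e2k
+   * insert-field operations; the plain _mm_setr_epi16() variant in the
+   * #else branch below spells out the intended element order. */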
+ {
+ __m64 d0l, d0h, d1l, d1h, d2l, d2h, d3l, d3h;
+ __m64 d4l, d4h, d5l, d5h, d6l, d6h, d7l, d7h;
+ __m64 h0, h1, h2, h3, r0, r1, c1256 = SHUF16X4(1, 2, 5, 6);
+
+ d0l = *(__m64*)(block + 8 * 0); d0h = *(__m64*)(block + 8 * 0 + 4); // 0 4
+ d1l = *(__m64*)(block + 8 * 1); d1h = *(__m64*)(block + 8 * 1 + 4); // 8 12
+ d2l = *(__m64*)(block + 8 * 2); d2h = *(__m64*)(block + 8 * 2 + 4); // 16 20
+ d3l = *(__m64*)(block + 8 * 3); d3h = *(__m64*)(block + 8 * 3 + 4); // 24 28
+ d4l = *(__m64*)(block + 8 * 4); d4h = *(__m64*)(block + 8 * 4 + 4); // 32 36
+ d5l = *(__m64*)(block + 8 * 5); d5h = *(__m64*)(block + 8 * 5 + 4); // 40 44
+ d6l = *(__m64*)(block + 8 * 6); d6h = *(__m64*)(block + 8 * 6 + 4); // 48 52
+ d7l = *(__m64*)(block + 8 * 7); d7h = *(__m64*)(block + 8 * 7 + 4); // 56 60
+
+ // d0l[0] d0l[1] d1l[0] d2l[0]
+ // d1l[1] d0l[2] d0l[3] d1l[2]
+ h0 = _mm_unpacklo_pi16(d1l, d2l);
+ r0 = _mm_unpacklo_pi32(d0l, h0);
+ r1 = _mm_shuffle2_pi8(d1l, d0l, SHUF16X4(1, 6, 7, 2));
+ r0 = _mm_sub_pi16(r0, _mm_cvtsi64_m64((uint16_t)last_dc_val));
+ v0 = VEC_COMBINE(r0, r1);
+
+ // d2l[1] d3l[0] d4l[0] d3l[1]
+ // d2l[2] d1l[3] d0h[0] d0h[1]
+ h0 = _mm_srli_si64(_mm_unpacklo_pi32(d2l, d4l), 16);
+ h2 = INSFI_M64(d1l, 0, d2l, 48);
+ r0 = _mm_unpacklo_pi16(h0, d3l);
+ r1 = _mm_alignr_pi8(d0h, h2, 4);
+ v1 = VEC_COMBINE(r0, r1);
+
+ // d1h[0] d2l[3] d3l[2] d4l[1]
+ // d5l[0] d6l[0] d5l[1] d4l[2]
+ h0 = INSFI_M64(d2l, 32, d1h, 16);
+ h1 = INSFI_M64(d4l, -32, d3l, 48);
+ h2 = INSFI_M64(d4l, 16, d6l, 16);
+ r0 = INSFI_M64(h1, 0, h0, 32);
+ r1 = _mm_unpacklo_pi16(d5l, h2);
+ v2 = VEC_COMBINE(r0, r1);
+
+ // d3l[3] d2h[0] d1h[1] d0h[2]
+ // d0h[3] d1h[2] d2h[1] d3h[0]
+ h0 = _mm_alignr_pi8(d2h, d3l, 6);
+ h1 = INSFI_M64(d0h, 0, d1h, 32);
+ h2 = _mm_unpackhi_pi32(d0h, d1h);
+ h3 = _mm_unpacklo_pi32(d2h, d3h);
+ r0 = INSFI_M64(h1, -16, h0, 32);
+ r1 = _mm_shuffle2_pi8(h2, h3, c1256);
+ v3 = VEC_COMBINE(r0, r1);
+
+ KLOOP_PREPARE(mask, 0)
+
+ // d4l[3] d5l[2] d6l[1] d7l[0]
+ // d7l[1] d6l[2] d5l[3] d4h[0]
+ h0 = _mm_unpackhi_pi32(d4l, d5l);
+ h1 = _mm_unpacklo_pi32(d6l, d7l);
+ h2 = INSFI_M64(d6l, 0, d7l, 32);
+ h2 = INSFI_M64(d5l, 0, h2, 48);
+ r0 = _mm_shuffle2_pi8(h0, h1, c1256);
+ r1 = _mm_alignr_pi8(d4h, h2, 2);
+ v0 = VEC_COMBINE(r0, r1);
+
+ // d3h[1] d2h[2] d1h[3] d2h[3]
+ // d3h[2] d4h[1] d5h[0] d6l[3]
+ h0 = _mm_slli_si64(INSFI_M64(d1h, 16, d3h, 32), 16);
+ h2 = INSFI_M64(d4h, -32, d3h, 48);
+ h3 = INSFI_M64(d6l, 32, d5h, 16);
+ r0 = _mm_unpackhi_pi16(h0, d2h);
+ r1 = _mm_alignr_pi8(h3, h2, 4);
+ v1 = VEC_COMBINE(r0, r1);
+
+ // d7l[2] d7l[3] d6h[0] d5h[1]
+ // d4h[2] d3h[3] d4h[3] d5h[2]
+ h0 = INSFI_M64(d5h, 0, d6h, 16);
+ h2 = _mm_slli_si64(_mm_unpackhi_pi32(d3h, d5h), 16);
+ r0 = _mm_alignr_pi8(h0, d7l, 4);
+ r1 = _mm_unpackhi_pi16(d4h, h2);
+ v2 = VEC_COMBINE(r0, r1);
+
+ // d6h[1] d7h[0] d7h[1] d6h[2]
+ // d5h[3] d6h[3] d7h[2] d7h[3]
+ h0 = INSFI_M64(d6h, -16, d7h, 32);
+ h2 = _mm_unpackhi_pi16(d5h, d6h);
+ r0 = _mm_shuffle_pi16(h0, 0xd2);
+ r1 = _mm_unpackhi_pi32(h2, d7h);
+ v3 = VEC_COMBINE(r0, r1);
+ }
+#else
+ v0 = _mm_setr_epi16(
+ block[0] - last_dc_val, block[1], block[8], block[16],
+ block[9], block[2], block[3], block[10]);
+ v1 = _mm_setr_epi16(
+ block[17], block[24], block[32], block[25],
+ block[18], block[11], block[4], block[5]);
+ v2 = _mm_setr_epi16(
+ block[12], block[19], block[26], block[33],
+ block[40], block[48], block[41], block[34]);
+ v3 = _mm_setr_epi16(
+ block[27], block[20], block[13], block[6],
+ block[7], block[14], block[21], block[28]);
+
+ KLOOP_PREPARE(mask, 0)
+
+ v0 = _mm_setr_epi16(
+ block[35], block[42], block[49], block[56],
+ block[57], block[50], block[43], block[36]);
+ v1 = _mm_setr_epi16(
+ block[29], block[22], block[15], block[23],
+ block[30], block[37], block[44], block[51]);
+ v2 = _mm_setr_epi16(
+ block[58], block[59], block[52], block[45],
+ block[38], block[31], block[39], block[46]);
+ v3 = _mm_setr_epi16(
+ block[53], block[60], block[61], block[54],
+ block[47], block[55], block[62], block[63]);
+#endif
+
+ KLOOP_PREPARE(mask1, 32)
+ mask |= mask1 << 32;
+ mask = ~mask;
+
+ /* Encode the DC coefficient difference per section F.1.2.1 */
+
+ nbits = block_nbits[0];
+ temp = block_diff[0];
+ nbits = nbits ? 32 - __builtin_clz(nbits) : 0;
+
+ /* Emit the Huffman-coded symbol for the number of bits */
+ code = dctbl->ehufco[nbits];
+ size = dctbl->ehufsi[nbits];
+ PUT_CODE(code, size)
+
+ /* Encode the AC coefficients per section F.1.2.2 */
+
+ /* e2k doesn't have a tzcnt instruction */
+ mask = __builtin_e2k_bitrevd(mask) << 1;
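+  /* Bit-reversal turns the count-trailing-zeros needed here into a
+   * count-leading-zeros, which __builtin_clzll() provides; the extra << 1
+   * drops the already-encoded DC bit, so each clz below yields the zero-run
+   * length r in front of the next nonzero AC coefficient. */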
+
+ for (i = 1; mask; i++, mask <<= 1) {
+ r = __builtin_clzll(mask);
+ mask <<= r;
+ i += r;
+ /* if run length > 15, must emit special run-length-16 codes (0xF0) */
+ while (r > 15) {
+ PUT_BITS(code_0xf0, size_0xf0)
+ r -= 16;
+ }
+ nbits = block_nbits[i];
+ temp = block_diff[i];
+ nbits = 32 - __builtin_clz(nbits);
+ /* Emit Huffman symbol for run length / number of bits */
+ /* r = r << 4 | nbits; */
+ r = __builtin_e2k_insfd(r, 4 * 63 + 64, nbits);
+ code = actbl->ehufco[r];
+ size = actbl->ehufsi[r];
+ PUT_CODE(code, size)
+ }
+
+ if (i != 64) {
+ PUT_BITS(actbl->ehufco[0], actbl->ehufsi[0])
+ }
+
+ state_ptr->cur.put_buffer = put_buffer;
+ state_ptr->cur.free_bits = free_bits;
+ return buffer;
+}
diff --git a/simd/e2k/jcphuff-e2k.c b/simd/e2k/jcphuff-e2k.c
new file mode 100644
index 0000000..f69afeb
--- /dev/null
+++ b/simd/e2k/jcphuff-e2k.c
@@ -0,0 +1,145 @@
+/*
+ * Elbrus optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2022, Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#include "jsimd_e2k.h"
+
+#define X(i) coefs[i] = block[jpeg_natural_order_start[i]];
+#define Y(i) coefs[i] = i < rem ? block[jpeg_natural_order_start[i]] : 0;
+
+#define LOOP \
+ for (i = 0; i < Sl >> 4; i++) { \
+ X(0) X(1) X(2) X(3) X(4) X(5) X(6) X(7) \
+ X(8) X(9) X(10) X(11) X(12) X(13) X(14) X(15) \
+ BLOCK16 \
+ jpeg_natural_order_start += 16; \
+ } \
+ rem = Sl & 15; \
+ if (Sl & 8) { \
+ X(0) X(1) X(2) X(3) X(4) X(5) X(6) X(7) \
+ Y(8) Y(9) Y(10) Y(11) Y(12) Y(13) Y(14) \
+ coefs[15] = 0; \
+ BLOCK16 \
+ } else if (rem > 0) { \
+ Y(0) Y(1) Y(2) Y(3) Y(4) Y(5) Y(6) Y(7) \
+ BLOCK8 \
+ }
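+
+/*
+ * LOOP gathers the Sl coefficients selected through jpeg_natural_order_start
+ * into groups of 16 and hands each group to BLOCK16; a tail of 8..15
+ * coefficients is zero-padded (via Y()) and still goes through BLOCK16,
+ * while a tail of 1..7 is zero-padded into BLOCK8.
+ */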
+
+void jsimd_encode_mcu_AC_first_prepare_e2k
+ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+ JCOEF *values, size_t *zerobits)
+{
+ JCOEF *diff = values + DCTSIZE2;
+ int16_t __attribute__((aligned(16))) coefs[16];
+ __m128i v0, v1, v2, v3;
+ __m128i c0 = _mm_setzero_si128(), shr = _mm_cvtsi32_si128(Al);
+ int i, rem;
+
+#define BLOCK16 \
+ v0 = _mm_load_si128((__m128i*)coefs); \
+ v1 = _mm_load_si128((__m128i*)coefs + 1); \
+ v2 = _mm_srai_epi16(v0, 15); \
+ v3 = _mm_srai_epi16(v1, 15); \
+ v0 = _mm_sra_epi16(_mm_abs_epi16(v0), shr); \
+ v1 = _mm_sra_epi16(_mm_abs_epi16(v1), shr); \
+ v2 = _mm_xor_si128(v0, v2); \
+ v3 = _mm_xor_si128(v1, v3); \
+ _mm_store_si128((__m128i*)values, v0); \
+ _mm_store_si128((__m128i*)values + 1, v1); \
+ _mm_store_si128((__m128i*)diff, v2); \
+ _mm_store_si128((__m128i*)diff + 1, v3); \
+ v2 = _mm_cmpeq_epi8(_mm_packus_epi16(v0, v1), c0); \
+ ((uint16_t*)zerobits)[i] = ~_mm_movemask_epi8(v2); \
+ values += 16; diff += 16;
+
+#define BLOCK8 \
+ v0 = _mm_load_si128((__m128i*)coefs); \
+ v2 = _mm_srai_epi16(v0, 15); \
+ v0 = _mm_sra_epi16(_mm_abs_epi16(v0), shr); \
+ v2 = _mm_xor_si128(v0, v2); \
+ _mm_store_si128((__m128i*)values, v0); \
+ _mm_store_si128((__m128i*)diff, v2); \
+ v2 = _mm_cmpeq_epi8(_mm_packus_epi16(v0, c0), c0); \
+ ((uint16_t*)zerobits)[i] = ~_mm_movemask_epi8(v2); \
+ values += 8; diff += 8;
+
+ ((uint64_t*)zerobits)[0] = 0;
+ LOOP
+#undef BLOCK16
+#undef BLOCK8
+
+ for (i = (64 - Sl) >> 3; i; i--) {
+ _mm_store_si128((__m128i*)values, c0);
+ _mm_store_si128((__m128i*)diff, c0);
+ values += 8; diff += 8;
+ }
+}
+
+int jsimd_encode_mcu_AC_refine_prepare_e2k
+ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+ JCOEF *absvalues, size_t *bits)
+{
+ union { uint64_t q; uint16_t w[4]; } mask1 = { 0 };
+ int16_t __attribute__((aligned(16))) coefs[16];
+ __m128i v0, v1, v2, c1 = _mm_set1_epi8(1);
+ __m128i c0 = _mm_setzero_si128(), shr = _mm_cvtsi32_si128(Al);
+ int i, rem;
+
+#define BLOCK16 \
+ v0 = _mm_load_si128((__m128i*)coefs); \
+ v1 = _mm_load_si128((__m128i*)coefs + 1); \
+ v2 = _mm_packs_epi16(v0, v1); \
+ ((uint16_t*)bits)[4 + i] = ~_mm_movemask_epi8(v2); \
+ v0 = _mm_sra_epi16(_mm_abs_epi16(v0), shr); \
+ v1 = _mm_sra_epi16(_mm_abs_epi16(v1), shr); \
+ v2 = _mm_cmpeq_epi8(_mm_packus_epi16(v0, v1), c0); \
+ ((uint16_t*)bits)[i] = ~_mm_movemask_epi8(v2); \
+ _mm_store_si128((__m128i*)absvalues, v0); \
+ _mm_store_si128((__m128i*)absvalues + 1, v1); \
+ v2 = _mm_cmpeq_epi8(_mm_packus_epi16(v0, v1), c1); \
+ mask1.w[i] = _mm_movemask_epi8(v2); \
+ absvalues += 16;
+
+#define BLOCK8 \
+ v0 = _mm_load_si128((__m128i*)coefs); \
+ v2 = _mm_packs_epi16(v0, c0); \
+ ((uint16_t*)bits)[4 + i] = ~_mm_movemask_epi8(v2); \
+ v0 = _mm_sra_epi16(_mm_abs_epi16(v0), shr); \
+ v2 = _mm_cmpeq_epi8(_mm_packus_epi16(v0, c0), c0); \
+ ((uint16_t*)bits)[i] = ~_mm_movemask_epi8(v2); \
+ _mm_store_si128((__m128i*)absvalues, v0); \
+ v2 = _mm_cmpeq_epi8(_mm_packus_epi16(v0, c0), c1); \
+ mask1.w[i] = _mm_movemask_epi8(v2); \
+ absvalues += 8;
+
+ ((uint64_t*)bits)[0] = 0; /* zero */
+ ((uint64_t*)bits)[1] = 0; /* sign */
+ LOOP
+#undef BLOCK16
+#undef BLOCK8
+
+ for (i = (64 - Sl) >> 3; i; i--) {
+ _mm_store_si128((__m128i*)absvalues, c0);
+ absvalues += 8;
+ }
+
+ return 63 - __builtin_clzll(mask1.q | 1);
+}
diff --git a/simd/e2k/jcsample-e2k.c b/simd/e2k/jcsample-e2k.c
new file mode 100644
index 0000000..cac8897
--- /dev/null
+++ b/simd/e2k/jcsample-e2k.c
@@ -0,0 +1,203 @@
+/*
+ * Elbrus optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2021, Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* CHROMA DOWNSAMPLING */
+
+#include "jsimd_e2k.h"
+#include "jcsample.h"
+
+void jsimd_h2v1_downsample_e2k(JDIMENSION image_width,
+ int max_v_samp_factor,
+ JDIMENSION v_samp_factor,
+ JDIMENSION width_in_blocks,
+ JSAMPARRAY input_data,
+ JSAMPARRAY output_data)
+{
+ int outcol;
+ JDIMENSION output_cols = width_in_blocks * DCTSIZE, outrow;
+ JSAMPROW inptr, outptr;
+
+ __m128i this0, next0, out;
+ __m128i this0e, this0o, next0e, next0o, outl, outh;
+
+ /* Constants */
+  __m128i pw_bias = _mm_set1_epi32(1 << 16),      /* { 0, 1, 0, 1, ... } */
+ even_mask = _mm_set1_epi16(255);
+#ifdef NEED_ALIGN8
+ ALIGN8_COMMON
+ ALIGN8_VARS(src)
+#endif
+
+ expand_right_edge(input_data, max_v_samp_factor,
+ image_width, output_cols * 2);
+
+ if (output_cols > 0)
+ for (outrow = 0; outrow < v_samp_factor; outrow++) {
+ outptr = output_data[outrow];
+ inptr = input_data[outrow];
+
+#ifdef NEED_ALIGN8
+ ALIGN8_START(inptr, src)
+#endif
+ PRAGMA_E2K("ivdep")
+ for (outcol = output_cols; outcol > 8;
+ outcol -= 16, outptr += 16) {
+
+#ifdef NEED_ALIGN8
+ ALIGN8_READ16(this0, src, 0)
+ ALIGN8_READ16(next0, src, 1)
+ src_ptr += 4;
+#else
+ this0 = VEC_LD(inptr);
+ next0 = VEC_LD(inptr + 16);
+ inptr += 32;
+#endif
+ this0e = _mm_and_si128(this0, even_mask);
+ this0o = _mm_srli_epi16(this0, 8);
+ outl = _mm_add_epi16(this0e, this0o);
+ outl = _mm_srli_epi16(_mm_add_epi16(outl, pw_bias), 1);
+ next0e = _mm_and_si128(next0, even_mask);
+ next0o = _mm_srli_epi16(next0, 8);
+ outh = _mm_add_epi16(next0e, next0o);
+ outh = _mm_srli_epi16(_mm_add_epi16(outh, pw_bias), 1);
+
+ out = _mm_packus_epi16(outl, outh);
+ VEC_ST(outptr, out);
+ }
+ if (outcol > 0) {
+#ifdef NEED_ALIGN8
+ ALIGN8_READ16(this0, src, 0)
+#else
+ this0 = VEC_LD(inptr);
+#endif
+ this0e = _mm_and_si128(this0, even_mask);
+ this0o = _mm_srli_epi16(this0, 8);
+ outl = _mm_add_epi16(this0e, this0o);
+ outl = _mm_srli_epi16(_mm_add_epi16(outl, pw_bias), 1);
+
+ out = _mm_packus_epi16(outl, outl);
+ VEC_STL(outptr, out);
+ }
+ }
+}
+
+
+void jsimd_h2v2_downsample_e2k(JDIMENSION image_width, int max_v_samp_factor,
+ JDIMENSION v_samp_factor,
+ JDIMENSION width_in_blocks,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+ int outcol;
+ JDIMENSION output_cols = width_in_blocks * DCTSIZE, outrow;
+ JSAMPROW inptr0, inptr1, outptr;
+
+ __m128i this0, next0, this1, next1, out;
+ __m128i this0e, this0o, next0e, next0o, this1e, this1o,
+ next1e, next1o, out0l, out0h, out1l, out1h, outl, outh;
+
+ /* Constants */
+  __m128i pw_bias = _mm_set1_epi32(1 | 2 << 16),  /* { 1, 2, 1, 2, ... } */
+ even_mask = _mm_set1_epi16(255);
+#ifdef NEED_ALIGN8
+ ALIGN8_COMMON
+ ALIGN8_VARS(src0)
+ ALIGN8_VARS(src1)
+#endif
+
+ expand_right_edge(input_data, max_v_samp_factor,
+ image_width, output_cols * 2);
+
+ if (output_cols > 0)
+ for (outrow = 0; outrow < v_samp_factor; outrow++) {
+ inptr0 = input_data[outrow * 2];
+ inptr1 = input_data[outrow * 2 + 1];
+ outptr = output_data[outrow];
+
+#ifdef NEED_ALIGN8
+ ALIGN8_START(inptr0, src0)
+ ALIGN8_START(inptr1, src1)
+#endif
+ PRAGMA_E2K("ivdep")
+ for (outcol = output_cols; outcol > 8;
+ outcol -= 16, outptr += 16) {
+
+#ifdef NEED_ALIGN8
+ ALIGN8_READ16(this0, src0, 0) src0_ptr += 2;
+ ALIGN8_READ16(this1, src1, 0) src1_ptr += 2;
+#else
+ this0 = VEC_LD(inptr0); inptr0 += 16;
+ this1 = VEC_LD(inptr1); inptr1 += 16;
+#endif
+ this0e = _mm_and_si128(this0, even_mask);
+ this1e = _mm_and_si128(this1, even_mask);
+ this0o = _mm_srli_epi16(this0, 8);
+ this1o = _mm_srli_epi16(this1, 8);
+ out0l = _mm_add_epi16(this0e, this0o);
+ out1l = _mm_add_epi16(this1e, this1o);
+
+ outl = _mm_add_epi16(out0l, out1l);
+ outl = _mm_srli_epi16(_mm_add_epi16(outl, pw_bias), 2);
+
+#ifdef NEED_ALIGN8
+ ALIGN8_READ16(next0, src0, 0) src0_ptr += 2;
+ ALIGN8_READ16(next1, src1, 0) src1_ptr += 2;
+#else
+ next0 = VEC_LD(inptr0); inptr0 += 16;
+ next1 = VEC_LD(inptr1); inptr1 += 16;
+#endif
+ next0e = _mm_and_si128(next0, even_mask);
+ next1e = _mm_and_si128(next1, even_mask);
+ next0o = _mm_srli_epi16(next0, 8);
+ next1o = _mm_srli_epi16(next1, 8);
+ out0h = _mm_add_epi16(next0e, next0o);
+ out1h = _mm_add_epi16(next1e, next1o);
+
+ outh = _mm_add_epi16(out0h, out1h);
+ outh = _mm_srli_epi16(_mm_add_epi16(outh, pw_bias), 2);
+
+ out = _mm_packus_epi16(outl, outh);
+ VEC_ST(outptr, out);
+ }
+ if (outcol > 0) {
+#ifdef NEED_ALIGN8
+ ALIGN8_READ16(this0, src0, 0)
+ ALIGN8_READ16(this1, src1, 0)
+#else
+ this0 = VEC_LD(inptr0);
+ this1 = VEC_LD(inptr1);
+#endif
+ this0e = _mm_and_si128(this0, even_mask);
+ this1e = _mm_and_si128(this1, even_mask);
+ this0o = _mm_srli_epi16(this0, 8);
+ this1o = _mm_srli_epi16(this1, 8);
+ out0l = _mm_add_epi16(this0e, this0o);
+ out1l = _mm_add_epi16(this1e, this1o);
+
+ outl = _mm_add_epi16(out0l, out1l);
+ outl = _mm_srli_epi16(_mm_add_epi16(outl, pw_bias), 2);
+
+ out = _mm_packus_epi16(outl, outl);
+ VEC_STL(outptr, out);
+ }
+ }
+}
diff --git a/simd/e2k/jcsample.h b/simd/e2k/jcsample.h
new file mode 100644
index 0000000..2ac4816
--- /dev/null
+++ b/simd/e2k/jcsample.h
@@ -0,0 +1,28 @@
+/*
+ * jcsample.h
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1996, Thomas G. Lane.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ */
+
+LOCAL(void)
+expand_right_edge(JSAMPARRAY image_data, int num_rows, JDIMENSION input_cols,
+ JDIMENSION output_cols)
+{
+ register JSAMPROW ptr;
+ register JSAMPLE pixval;
+ register int count;
+ int row;
+ int numcols = (int)(output_cols - input_cols);
+
+ if (numcols > 0) {
+ for (row = 0; row < num_rows; row++) {
+ ptr = image_data[row] + input_cols;
+ pixval = ptr[-1]; /* don't need GETJSAMPLE() here */
+ for (count = numcols; count > 0; count--)
+ *ptr++ = pixval;
+ }
+ }
+}
diff --git a/simd/e2k/jdcolext-e2k.c b/simd/e2k/jdcolext-e2k.c
new file mode 100644
index 0000000..4f12aef
--- /dev/null
+++ b/simd/e2k/jdcolext-e2k.c
@@ -0,0 +1,258 @@
+/*
+ * Elbrus optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2021, Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jdcolor-e2k.c */
+
+void ycc_rgbn_convert(JDIMENSION out_width, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows, int shuf_idx)
+{
+ JSAMPROW outptr, inptr0, inptr1, inptr2;
+ uint8_t __attribute__((aligned(16))) tmpbuf[PIXELSIZE * 16];
+
+ __m128i rgb0, rgb1, rgb2, rgb3, y, cb, cr;
+ __m128i rg0, rg1, bx0, bx1, yl, yh, cbl, cbh,
+ crl, crh, rl, rh, gl, gh, bl, bh, g0w, g1w, g2w, g3w;
+ __m128i g0, g1, g2, g3;
+
+ /* Constants
+   * NOTE: The >> 1 compensates for _mm_mulhrs_epi16() scaling the product
+   * by 1/2^15 (with rounding) rather than 1/2^16.
+ */
+ __m128i pw_f0402 = _mm_set1_epi16(F_0_402 >> 1),
+ pw_mf0228 = _mm_set1_epi16(-F_0_228 >> 1),
+ pw_mf0344_f0285 = _mm_setr_epi16(__4X2(-F_0_344, F_0_285)),
+ pb_255 = _mm_set1_epi8(-1),
+ pw_cj = _mm_set1_epi16(CENTERJSAMPLE),
+ pd_onehalf = _mm_set1_epi32(ONE_HALF),
+ pb_zero = _mm_setzero_si128();
+ RGB_SHUFFLE_INIT
+#ifdef NEED_ALIGN8
+ ALIGN8_COMMON
+ ALIGN8_VARS(src0)
+ ALIGN8_VARS(src1)
+ ALIGN8_VARS(src2)
+#endif
+
+ if (out_width > 0)
+ while (--num_rows >= 0) {
+ int num_cols;
+ inptr0 = input_buf[0][input_row];
+ inptr1 = input_buf[1][input_row];
+ inptr2 = input_buf[2][input_row];
+ input_row++;
+ outptr = *output_buf++;
+
+ if (out_width >= 16) {
+#ifdef NEED_ALIGN8
+ ALIGN8_START(inptr0, src0)
+ ALIGN8_START(inptr1, src1)
+ ALIGN8_START(inptr2, src2)
+ inptr0 += out_width & -16;
+ inptr1 += out_width & -16;
+ inptr2 += out_width & -16;
+#endif
+ PRAGMA_E2K("ivdep")
+ for (num_cols = out_width; num_cols >= 16;
+ num_cols -= 16, outptr += PIXELSIZE * 16) {
+#ifdef NEED_ALIGN8
+ ALIGN8_READ16(y, src0, 0) src0_ptr += 2;
+ ALIGN8_READ16(cb, src1, 0) src1_ptr += 2;
+ ALIGN8_READ16(cr, src2, 0) src2_ptr += 2;
+#else
+ y = VEC_LD(inptr0); inptr0 += 16;
+ cb = VEC_LD(inptr1); inptr1 += 16;
+ cr = VEC_LD(inptr2); inptr2 += 16;
+#endif
+ CALC_RGB
+ RGB_SHUFFLE
+ VEC_ST(outptr, rgb0);
+ VEC_ST(outptr + 16, rgb1);
+ VEC_ST(outptr + 32, rgb2);
+#if PIXELSIZE == 4
+ VEC_ST(outptr + 48, rgb3);
+#endif
+ }
+ }
+
+ num_cols = out_width & 15;
+ if (num_cols) {
+ int i;
+ for (i = 0; i < num_cols; i++) {
+ tmpbuf[i] = inptr0[i];
+ tmpbuf[i + 16] = inptr1[i];
+ tmpbuf[i + 32] = inptr2[i];
+ }
+ y = VEC_LD(tmpbuf);
+ cb = VEC_LD(tmpbuf + 16);
+ cr = VEC_LD(tmpbuf + 32);
+ CALC_RGB
+ RGB_SHUFFLE
+ VEC_ST(tmpbuf, rgb0);
+ VEC_ST(tmpbuf + 16, rgb1);
+ VEC_ST(tmpbuf + 32, rgb2);
+#if PIXELSIZE == 4
+ VEC_ST(tmpbuf + 48, rgb3);
+#endif
+ memcpy(outptr, tmpbuf, num_cols * PIXELSIZE);
+ }
+ }
+}
+
+void ycc_rgbn_merged(JDIMENSION out_width, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr,
+ JDIMENSION in_row_group_ctr_y,
+ JSAMPARRAY output_buf, int shuf_idx)
+{
+ JSAMPROW outptr, inptr0, inptr1, inptr2;
+ int num_cols;
+ uint8_t __attribute__((aligned(16))) tmpbuf[4 * 16];
+
+ __m128i rgb0, rgb1, rgb2, rgb3, y, cb, cr;
+ __m128i rg0, rg1, bx0, bx1, yl, yh, cbl, cbh,
+ crl, crh, r_yl, r_yh, g_yl, g_yh, b_yl, b_yh, g_y0w, g_y1w, g_y2w, g_y3w,
+ rl, rh, gl, gh, bl, bh;
+ __m128i g_y0, g_y1, g_y2, g_y3;
+
+ /* Constants
+   * NOTE: The >> 1 compensates for _mm_mulhrs_epi16() scaling the product
+   * by 1/2^15 (with rounding) rather than 1/2^16.
+ */
+ __m128i pw_f0402 = _mm_set1_epi16(F_0_402 >> 1),
+ pw_mf0228 = _mm_set1_epi16(-F_0_228 >> 1),
+ pw_mf0344_f0285 = _mm_setr_epi16(__4X2(-F_0_344, F_0_285)),
+ pb_255 = _mm_set1_epi8(-1),
+ pw_cj = _mm_set1_epi16(CENTERJSAMPLE),
+ pd_onehalf = _mm_set1_epi32(ONE_HALF),
+ pb_zero = _mm_setzero_si128();
+ RGB_SHUFFLE_INIT
+#ifdef NEED_ALIGN8
+ ALIGN8_COMMON
+ ALIGN8_VARS(src0)
+ ALIGN8_VARS(src1)
+ ALIGN8_VARS(src2)
+#endif
+
+ inptr0 = input_buf[0][in_row_group_ctr_y];
+ inptr1 = input_buf[1][in_row_group_ctr];
+ inptr2 = input_buf[2][in_row_group_ctr];
+ outptr = output_buf[0];
+
+ if (out_width >= 32) {
+#ifdef NEED_ALIGN8
+ ALIGN8_START(inptr0, src0)
+ ALIGN8_START(inptr1, src1)
+ ALIGN8_START(inptr2, src2)
+ inptr0 += out_width & -32;
+ inptr1 += (out_width & -32) >> 1;
+ inptr2 += (out_width & -32) >> 1;
+#endif
+ PRAGMA_E2K("ivdep")
+ for (num_cols = out_width; num_cols >= 32; num_cols -= 32) {
+
+#ifdef NEED_ALIGN8
+ ALIGN8_READ16(cb, src1, 0) src1_ptr += 2;
+ ALIGN8_READ16(cr, src2, 0) src2_ptr += 2;
+#else
+ cb = VEC_LD(inptr1); inptr1 += 16;
+ cr = VEC_LD(inptr2); inptr2 += 16;
+#endif
+ CALC_MERGED1
+
+#ifdef NEED_ALIGN8
+ ALIGN8_READ16(y, src0, 0) src0_ptr += 2;
+#else
+ y = VEC_LD(inptr0); inptr0 += 16;
+#endif
+ CALC_MERGED2(r_yl, g_yl, b_yl)
+ RGB_SHUFFLE
+ VEC_ST(outptr, rgb0);
+ VEC_ST(outptr + 16, rgb1);
+ VEC_ST(outptr + 32, rgb2);
+#if PIXELSIZE == 4
+ VEC_ST(outptr + 48, rgb3);
+#endif
+ outptr += PIXELSIZE * 16;
+
+#ifdef NEED_ALIGN8
+ ALIGN8_READ16(y, src0, 0) src0_ptr += 2;
+#else
+ y = VEC_LD(inptr0); inptr0 += 16;
+#endif
+ CALC_MERGED2(r_yh, g_yh, b_yh)
+ RGB_SHUFFLE
+ VEC_ST(outptr, rgb0);
+ VEC_ST(outptr + 16, rgb1);
+ VEC_ST(outptr + 32, rgb2);
+#if PIXELSIZE == 4
+ VEC_ST(outptr + 48, rgb3);
+#endif
+ outptr += PIXELSIZE * 16;
+ }
+ }
+
+ num_cols = out_width & 31;
+ if (num_cols) {
+ int i;
+ for (i = 0; i < (num_cols + 1) >> 1; i++) {
+ tmpbuf[i] = inptr1[i];
+ tmpbuf[i + 16] = inptr2[i];
+ tmpbuf[i * 2 + 32] = inptr0[i * 2];
+ tmpbuf[i * 2 + 32 + 1] = inptr0[i * 2 + 1];
+ }
+ cb = VEC_LD(tmpbuf);
+ cr = VEC_LD(tmpbuf + 16);
+ CALC_MERGED1
+
+ y = VEC_LD(tmpbuf + 32);
+ CALC_MERGED2(r_yl, g_yl, b_yl)
+ RGB_SHUFFLE
+ if (num_cols >= 16) {
+ VEC_ST(outptr, rgb0);
+ VEC_ST(outptr + 16, rgb1);
+ VEC_ST(outptr + 32, rgb2);
+#if PIXELSIZE == 4
+ VEC_ST(outptr + 48, rgb3);
+#endif
+ outptr += PIXELSIZE * 16;
+
+ y = VEC_LD(tmpbuf + 48);
+ CALC_MERGED2(r_yh, g_yh, b_yh)
+ RGB_SHUFFLE
+ }
+ VEC_ST(tmpbuf, rgb0);
+ VEC_ST(tmpbuf + 16, rgb1);
+ VEC_ST(tmpbuf + 32, rgb2);
+#if PIXELSIZE == 4
+ VEC_ST(tmpbuf + 48, rgb3);
+#endif
+ memcpy(outptr, tmpbuf, (out_width & 15) * PIXELSIZE);
+ }
+}
+
+#undef RGB_SHUFFLE_INIT
+#undef RGB_SHUFFLE
+#undef PIXELSIZE
+#undef ycc_rgbn_convert
+#undef ycc_rgbn_merged
+
diff --git a/simd/e2k/jdcolor-e2k.c b/simd/e2k/jdcolor-e2k.c
new file mode 100644
index 0000000..94c80e9
--- /dev/null
+++ b/simd/e2k/jdcolor-e2k.c
@@ -0,0 +1,289 @@
+/*
+ * Elbrus optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2021, Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* YCC --> RGB CONVERSION */
+
+#include "jsimd_e2k.h"
+
+#define F_0_344 22554 /* FIX(0.34414) */
+#define F_0_714 46802 /* FIX(0.71414) */
+#define F_1_402 91881 /* FIX(1.40200) */
+#define F_1_772 116130 /* FIX(1.77200) */
+#define F_0_402 (F_1_402 - 65536) /* FIX(1.40200) - FIX(1) */
+#define F_0_285 (65536 - F_0_714) /* FIX(1) - FIX(0.71414) */
+#define F_0_228 (131072 - F_1_772) /* FIX(2) - FIX(1.77200) */
+
+#define SCALEBITS 16
+#define ONE_HALF (1 << (SCALEBITS - 1))
+
+static const uint8_t __attribute__((aligned(16)))
+#if defined(__iset__) && __iset__ >= 5
+ycc_rgb_shuf_const[7][48] = {
+#define SHUF_CONST3 \
+ C0, C1, C2, \
+ C0 + 4, C1 + 4, C2 + 4, \
+ C0 + 8, C1 + 8, C2 + 8, \
+ C0 + 12, C1 + 12, C2 + 12, \
+ C0 + 16, C1 + 16, C2 + 16, \
+ C0 + 20, C1 + 4, C2 + 4, \
+ C0 + 8, C1 + 8, C2 + 8, \
+ C0 + 12, C1 + 12, C2 + 12, \
+ C0 + 16, C1 + 16, C2 + 16, \
+ C0 + 20, C1 + 20, C2 + 20, \
+ C0 + 24, C1 + 24, C2 + 8, \
+ C0 + 12, C1 + 12, C2 + 12, \
+ C0 + 16, C1 + 16, C2 + 16, \
+ C0 + 20, C1 + 20, C2 + 20, \
+ C0 + 24, C1 + 24, C2 + 24, \
+ C0 + 28, C1 + 28, C2 + 28
+#else
+ycc_rgb_shuf_const[7][24] = {
+#define SHUF_CONST3 \
+ C0, C1, C2, \
+ C0 + 4, C1 + 4, C2 + 4, \
+ C0 + 8, C1 + 8, C2, \
+ C0 + 4, C1 + 4, C2 + 4, \
+ C0 + 8, C1 + 8, C2 + 8, \
+ C0 + 12, C1 + 4, C2 + 4, \
+ C0 + 8, C1 + 8, C2 + 8, \
+ C0 + 12, C1 + 12, C2 + 12
+#endif
+
+#define SHUF_CONST4 C0, C1, C2, C3, C0 + 4, C1 + 4, C2 + 4, C3 + 4, \
+ C0 + 8, C1 + 8, C2 + 8, C3 + 8, C0 + 12, C1 + 12, C2 + 12, C3 + 12
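+
+/*
+ * C0..C3 (chosen per colorspace in jdcoltab-e2k.c) name which byte of the
+ * working R,G,B,X pixel produced by CALC_RGB goes at each offset of an
+ * output pixel.  SHUF_CONST4 simply reorders every 4-byte pixel in place;
+ * SHUF_CONST3 packs the 4-byte working pixels down to 3-byte output,
+ * borrowing the tail of each 16-byte vector from the following one.
+ */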
+
+#define TMP_RED RGB_RED
+#define TMP_GREEN RGB_GREEN
+#define TMP_BLUE RGB_BLUE
+#define PIXELSIZE RGB_PIXELSIZE
+#include "jdcoltab-e2k.c"
+ ,
+#define TMP_RED EXT_RGB_RED
+#define TMP_GREEN EXT_RGB_GREEN
+#define TMP_BLUE EXT_RGB_BLUE
+#define PIXELSIZE EXT_RGB_PIXELSIZE
+#include "jdcoltab-e2k.c"
+ ,
+#define TMP_RED EXT_RGBX_RED
+#define TMP_GREEN EXT_RGBX_GREEN
+#define TMP_BLUE EXT_RGBX_BLUE
+#define PIXELSIZE EXT_RGBX_PIXELSIZE
+#include "jdcoltab-e2k.c"
+ ,
+#define TMP_RED EXT_BGR_RED
+#define TMP_GREEN EXT_BGR_GREEN
+#define TMP_BLUE EXT_BGR_BLUE
+#define PIXELSIZE EXT_BGR_PIXELSIZE
+#include "jdcoltab-e2k.c"
+ ,
+#define TMP_RED EXT_BGRX_RED
+#define TMP_GREEN EXT_BGRX_GREEN
+#define TMP_BLUE EXT_BGRX_BLUE
+#define PIXELSIZE EXT_BGRX_PIXELSIZE
+#include "jdcoltab-e2k.c"
+ ,
+#define TMP_RED EXT_XBGR_RED
+#define TMP_GREEN EXT_XBGR_GREEN
+#define TMP_BLUE EXT_XBGR_BLUE
+#define PIXELSIZE EXT_XBGR_PIXELSIZE
+#include "jdcoltab-e2k.c"
+ ,
+#define TMP_RED EXT_XRGB_RED
+#define TMP_GREEN EXT_XRGB_GREEN
+#define TMP_BLUE EXT_XRGB_BLUE
+#define PIXELSIZE EXT_XRGB_PIXELSIZE
+#include "jdcoltab-e2k.c"
+};
+
+ /* (Original)
+ * R = Y + 1.40200 * Cr
+ * G = Y - 0.34414 * Cb - 0.71414 * Cr
+ * B = Y + 1.77200 * Cb
+ *
+ * (This implementation)
+ * R = Y + 0.40200 * Cr + Cr
+ * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+ * B = Y - 0.22800 * Cb + Cb + Cb
+ */
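+
+ /* A scalar sketch of the per-sample rounding used below, for reference:
+  *   R = Y + ((Cr * (F_0_402 >> 1) + 16384) >> 15) + Cr
+  *   G = Y + ((Cb * -F_0_344 + Cr * F_0_285 + 32768) >> 16) - Cr
+  *   B = Y + ((Cb * (-F_0_228 >> 1) + 16384) >> 15) + Cb + Cb
+  * _mm_mulhrs_epi16(a, b) computes (a * b + 0x4000) >> 15, so the 16.16
+  * constants are pre-shifted right by one to keep the 1/65536 scale; the G
+  * term instead uses _mm_madd_epi16() with pw_mf0344_f0285 and the ONE_HALF
+  * rounding constant.
+  */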
+
+#define CALC_RGB \
+ yl = _mm_unpacklo_epi8(y, pb_zero); \
+ yh = _mm_unpackhi_epi8(y, pb_zero); \
+ \
+ cbl = _mm_unpacklo_epi8(cb, pb_zero); \
+ cbh = _mm_unpackhi_epi8(cb, pb_zero); \
+ cbl = _mm_sub_epi16(cbl, pw_cj); \
+ cbh = _mm_sub_epi16(cbh, pw_cj); \
+ \
+ crl = _mm_unpacklo_epi8(cr, pb_zero); \
+ crh = _mm_unpackhi_epi8(cr, pb_zero); \
+ crl = _mm_sub_epi16(crl, pw_cj); \
+ crh = _mm_sub_epi16(crh, pw_cj); \
+ \
+ bl = _mm_mulhrs_epi16(cbl, pw_mf0228); \
+ bh = _mm_mulhrs_epi16(cbh, pw_mf0228); \
+ bl = _mm_add_epi16(bl, _mm_add_epi16(cbl, cbl)); \
+ bh = _mm_add_epi16(bh, _mm_add_epi16(cbh, cbh)); \
+ bl = _mm_add_epi16(bl, yl); \
+ bh = _mm_add_epi16(bh, yh); \
+ \
+ rl = _mm_mulhrs_epi16(crl, pw_f0402); \
+ rh = _mm_mulhrs_epi16(crh, pw_f0402); \
+ rl = _mm_add_epi16(rl, crl); \
+ rh = _mm_add_epi16(rh, crh); \
+ rl = _mm_add_epi16(rl, yl); \
+ rh = _mm_add_epi16(rh, yh); \
+ \
+ g0w = _mm_unpacklo_epi16(cbl, crl); \
+ g1w = _mm_unpackhi_epi16(cbl, crl); \
+ g0 = _mm_add_epi32(_mm_madd_epi16(g0w, pw_mf0344_f0285), pd_onehalf); \
+ g1 = _mm_add_epi32(_mm_madd_epi16(g1w, pw_mf0344_f0285), pd_onehalf); \
+ g2w = _mm_unpacklo_epi16(cbh, crh); \
+ g3w = _mm_unpackhi_epi16(cbh, crh); \
+ g2 = _mm_add_epi32(_mm_madd_epi16(g2w, pw_mf0344_f0285), pd_onehalf); \
+ g3 = _mm_add_epi32(_mm_madd_epi16(g3w, pw_mf0344_f0285), pd_onehalf); \
+ \
+ gl = _mm_packhi_epi32(g0, g1); \
+ gh = _mm_packhi_epi32(g2, g3); \
+ gl = _mm_sub_epi16(gl, crl); \
+ gh = _mm_sub_epi16(gh, crh); \
+ gl = _mm_add_epi16(gl, yl); \
+ gh = _mm_add_epi16(gh, yh); \
+ \
+ rl = _mm_packus_epi16(rl, rh); \
+ gl = _mm_packus_epi16(gl, gh); \
+ bl = _mm_packus_epi16(bl, bh); \
+ \
+ rg0 = _mm_unpacklo_epi8(rl, gl); \
+ rg1 = _mm_unpackhi_epi8(rl, gl); \
+ bx0 = _mm_unpacklo_epi8(bl, pb_255); \
+ bx1 = _mm_unpackhi_epi8(bl, pb_255); \
+ \
+ rgb0 = _mm_unpacklo_epi16(rg0, bx0); \
+ rgb1 = _mm_unpackhi_epi16(rg0, bx0); \
+ rgb2 = _mm_unpacklo_epi16(rg1, bx1); \
+ rgb3 = _mm_unpackhi_epi16(rg1, bx1);
+
+#define CALC_MERGED1 \
+ cbl = _mm_unpacklo_epi8(cb, pb_zero); \
+ cbh = _mm_unpackhi_epi8(cb, pb_zero); \
+ cbl = _mm_sub_epi16(cbl, pw_cj); \
+ cbh = _mm_sub_epi16(cbh, pw_cj); \
+ \
+ crl = _mm_unpacklo_epi8(cr, pb_zero); \
+ crh = _mm_unpackhi_epi8(cr, pb_zero); \
+ crl = _mm_sub_epi16(crl, pw_cj); \
+ crh = _mm_sub_epi16(crh, pw_cj); \
+ \
+ b_yl = _mm_mulhrs_epi16(cbl, pw_mf0228); \
+ b_yh = _mm_mulhrs_epi16(cbh, pw_mf0228); \
+ b_yl = _mm_add_epi16(b_yl, _mm_add_epi16(cbl, cbl)); \
+ b_yh = _mm_add_epi16(b_yh, _mm_add_epi16(cbh, cbh)); \
+ \
+ r_yl = _mm_mulhrs_epi16(crl, pw_f0402); \
+ r_yh = _mm_mulhrs_epi16(crh, pw_f0402); \
+ r_yl = _mm_add_epi16(r_yl, crl); \
+ r_yh = _mm_add_epi16(r_yh, crh); \
+ \
+ g_y0w = _mm_unpacklo_epi16(cbl, crl); \
+ g_y1w = _mm_unpackhi_epi16(cbl, crl); \
+ g_y0 = _mm_add_epi32(_mm_madd_epi16(g_y0w, pw_mf0344_f0285), pd_onehalf); \
+ g_y1 = _mm_add_epi32(_mm_madd_epi16(g_y1w, pw_mf0344_f0285), pd_onehalf); \
+ g_y2w = _mm_unpacklo_epi16(cbh, crh); \
+ g_y3w = _mm_unpackhi_epi16(cbh, crh); \
+ g_y2 = _mm_add_epi32(_mm_madd_epi16(g_y2w, pw_mf0344_f0285), pd_onehalf); \
+ g_y3 = _mm_add_epi32(_mm_madd_epi16(g_y3w, pw_mf0344_f0285), pd_onehalf); \
+ \
+ g_yl = _mm_packhi_epi32(g_y0, g_y1); \
+ g_yh = _mm_packhi_epi32(g_y2, g_y3); \
+ g_yl = _mm_sub_epi16(g_yl, crl); \
+ g_yh = _mm_sub_epi16(g_yh, crh);
+
+#define CALC_MERGED2(r_yl, g_yl, b_yl) \
+ yl = _mm_unpacklo_epi8(y, pb_zero); \
+ yh = _mm_unpackhi_epi8(y, pb_zero); \
+ bl = _mm_add_epi16(_mm_unpacklo_epi16(b_yl, b_yl), yl); \
+ bh = _mm_add_epi16(_mm_unpackhi_epi16(b_yl, b_yl), yh); \
+ rl = _mm_add_epi16(_mm_unpacklo_epi16(r_yl, r_yl), yl); \
+ rh = _mm_add_epi16(_mm_unpackhi_epi16(r_yl, r_yl), yh); \
+ gl = _mm_add_epi16(_mm_unpacklo_epi16(g_yl, g_yl), yl); \
+ gh = _mm_add_epi16(_mm_unpackhi_epi16(g_yl, g_yl), yh); \
+ rl = _mm_packus_epi16(rl, rh); \
+ gl = _mm_packus_epi16(gl, gh); \
+ bl = _mm_packus_epi16(bl, bh); \
+ \
+ rg0 = _mm_unpacklo_epi8(rl, gl); \
+ rg1 = _mm_unpackhi_epi8(rl, gl); \
+ bx0 = _mm_unpacklo_epi8(bl, pb_255); \
+ bx1 = _mm_unpackhi_epi8(bl, pb_255); \
+ \
+ rgb0 = _mm_unpacklo_epi16(rg0, bx0); \
+ rgb1 = _mm_unpackhi_epi16(rg0, bx0); \
+ rgb2 = _mm_unpacklo_epi16(rg1, bx1); \
+ rgb3 = _mm_unpackhi_epi16(rg1, bx1);
+
+#define PIXELSIZE 3
+#if defined(__iset__) && __iset__ >= 5
+#define RGB_SHUFFLE_INIT __m128i \
+ rgb_index0 = VEC_LD(ycc_rgb_shuf_const[shuf_idx]), \
+ rgb_index1 = VEC_LD(ycc_rgb_shuf_const[shuf_idx] + 16), \
+ rgb_index2 = VEC_LD(ycc_rgb_shuf_const[shuf_idx] + 32);
+#define RGB_SHUFFLE \
+ rgb0 = _mm_shuffle2_epi8(rgb0, rgb1, rgb_index0); \
+ rgb1 = _mm_shuffle2_epi8(rgb1, rgb2, rgb_index1); \
+ rgb2 = _mm_shuffle2_epi8(rgb2, rgb3, rgb_index2);
+#else
+#define RGB_SHUFFLE_INIT __m64 \
+ rgb_index0 = *(__m64*)ycc_rgb_shuf_const[shuf_idx], \
+ rgb_index1 = *(__m64*)(ycc_rgb_shuf_const[shuf_idx] + 8), \
+ rgb_index2 = *(__m64*)(ycc_rgb_shuf_const[shuf_idx] + 16);
+#define RGB_SHUFFLE { \
+ union { __m128i v; __m64 d[2]; } a = { rgb0 }, \
+ b = { rgb1 }, c = { rgb2 }, d = { rgb3 }; \
+ a.d[0] = _mm_shuffle2_pi8(a.d[0], a.d[1], rgb_index0); \
+ a.d[1] = _mm_shuffle2_pi8(a.d[1], b.d[0], rgb_index1); \
+ b.d[0] = _mm_shuffle2_pi8(b.d[0], b.d[1], rgb_index2); \
+ b.d[1] = _mm_shuffle2_pi8(c.d[0], c.d[1], rgb_index0); \
+ c.d[0] = _mm_shuffle2_pi8(c.d[1], d.d[0], rgb_index1); \
+ c.d[1] = _mm_shuffle2_pi8(d.d[0], d.d[1], rgb_index2); \
+ rgb0 = a.v; rgb1 = b.v; rgb2 = c.v; \
+}
+#endif
+
+#define ycc_rgbn_convert jsimd_ycc_rgb3_convert_e2k
+#define ycc_rgbn_merged jsimd_ycc_rgb3_merged_upsample_e2k
+#include "jdcolext-e2k.c"
+
+#define PIXELSIZE 4
+#define RGB_SHUFFLE_INIT __m128i \
+ rgb_index0 = VEC_LD(ycc_rgb_shuf_const[shuf_idx]);
+#define RGB_SHUFFLE \
+ rgb0 = _mm_shuffle_epi8(rgb0, rgb_index0); \
+ rgb1 = _mm_shuffle_epi8(rgb1, rgb_index0); \
+ rgb2 = _mm_shuffle_epi8(rgb2, rgb_index0); \
+ rgb3 = _mm_shuffle_epi8(rgb3, rgb_index0);
+
+#define ycc_rgbn_convert jsimd_ycc_rgb4_convert_e2k
+#define ycc_rgbn_merged jsimd_ycc_rgb4_merged_upsample_e2k
+#include "jdcolext-e2k.c"
+
diff --git a/simd/e2k/jdcoltab-e2k.c b/simd/e2k/jdcoltab-e2k.c
new file mode 100644
index 0000000..e19666d
--- /dev/null
+++ b/simd/e2k/jdcoltab-e2k.c
@@ -0,0 +1,80 @@
+/*
+ * Elbrus optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2021, Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jdcolor-e2k.c */
+
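+/* C0..C3 give, for each output byte position 0..3, the source channel stored
+ * there (0 = red, 1 = green, 2 = blue, 3 = filler), as determined by the
+ * TMP_RED / TMP_GREEN / TMP_BLUE offsets of the current pixel format. */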
+#if TMP_RED == 0
+#define C0 0
+#elif TMP_GREEN == 0
+#define C0 1
+#elif TMP_BLUE == 0
+#define C0 2
+#else
+#define C0 3
+#endif
+
+#if TMP_RED == 1
+#define C1 0
+#elif TMP_GREEN == 1
+#define C1 1
+#elif TMP_BLUE == 1
+#define C1 2
+#else
+#define C1 3
+#endif
+
+#if TMP_RED == 2
+#define C2 0
+#elif TMP_GREEN == 2
+#define C2 1
+#elif TMP_BLUE == 2
+#define C2 2
+#else
+#define C2 3
+#endif
+
+#if TMP_RED == 3
+#define C3 0
+#elif TMP_GREEN == 3
+#define C3 1
+#elif TMP_BLUE == 3
+#define C3 2
+#else
+#define C3 3
+#endif
+
+#if PIXELSIZE == 3
+{ SHUF_CONST3 }
+#else
+{ SHUF_CONST4 }
+#endif
+
+#undef C0
+#undef C1
+#undef C2
+#undef C3
+
+#undef TMP_RED
+#undef TMP_GREEN
+#undef TMP_BLUE
+#undef PIXELSIZE
+
diff --git a/simd/e2k/jdsample-e2k.c b/simd/e2k/jdsample-e2k.c
new file mode 100644
index 0000000..572b3af
--- /dev/null
+++ b/simd/e2k/jdsample-e2k.c
@@ -0,0 +1,389 @@
+/*
+ * Elbrus optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2021, Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* CHROMA UPSAMPLING */
+
+#include "jsimd_e2k.h"
+
+
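+/* Fancy h2v1 upsampling: each input sample produces two outputs,
+ * (3 * this + prev + 1) / 4 and (3 * this + next + 2) / 4, matching the
+ * scalar triangle filter in jdsample.c. */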
+void jsimd_h2v1_fancy_upsample_e2k(int max_v_samp_factor,
+ JDIMENSION downsampled_width,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr)
+{
+ JSAMPARRAY output_data = *output_data_ptr;
+ JSAMPROW inptr, outptr;
+ int inrow, incol;
+
+ __m128i pb_zero = _mm_setzero_si128();
+ __m128i this0, last0, p_last0, next0 = pb_zero, p_next0, out;
+ __m128i this0l, this0h, last0l, last0h,
+ next0l, next0h, outle, outhe, outlo, outho;
+#ifdef NEED_ALIGN8
+ ALIGN8_COMMON
+ ALIGN8_VARS(src)
+#endif
+
+ /* Constants */
+ __m128i pw_three = _mm_set1_epi16(3),
+ next_index_lastcol = _mm_setr_epi8(
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15),
+ pw_one = _mm_set1_epi16(1), pw_two = _mm_set1_epi16(2);
+
+ if (downsampled_width > 0)
+ for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
+ inptr = input_data[inrow];
+ outptr = output_data[inrow];
+
+ if (downsampled_width & 15)
+ inptr[downsampled_width] = inptr[downsampled_width - 1];
+
+#ifdef NEED_ALIGN8
+ ALIGN8_START(inptr, src)
+ ALIGN8_READ16(this0, src, 0)
+#else
+ this0 = VEC_LD(inptr);
+#endif
+ last0 = _mm_bslli_si128(this0, 15);
+
+ PRAGMA_E2K("ivdep")
+ for (incol = downsampled_width; incol > 0;
+ incol -= 16, outptr += 32) {
+
+ p_last0 = _mm_alignr_epi8(this0, last0, 15);
+ last0 = this0;
+
+ if (__builtin_expect(incol <= 16, 0))
+ p_next0 = _mm_shuffle_epi8(this0, next_index_lastcol);
+ else {
+#ifdef NEED_ALIGN8
+ ALIGN8_READ16(next0, src, 1) src_ptr += 2;
+#else
+ next0 = VEC_LD(inptr + 16); inptr += 16;
+#endif
+ p_next0 = _mm_alignr_epi8(next0, this0, 1);
+ }
+
+ this0l = _mm_mullo_epi16(_mm_unpacklo_epi8(this0, pb_zero), pw_three);
+ last0l = _mm_unpacklo_epi8(p_last0, pb_zero);
+ next0l = _mm_unpacklo_epi8(p_next0, pb_zero);
+ last0l = _mm_add_epi16(last0l, pw_one);
+ next0l = _mm_add_epi16(next0l, pw_two);
+
+ outle = _mm_add_epi16(this0l, last0l);
+ outlo = _mm_add_epi16(this0l, next0l);
+ outle = _mm_srli_epi16(outle, 2);
+ outlo = _mm_srli_epi16(outlo, 2);
+
+ out = _mm_or_si128(outle, _mm_slli_epi16(outlo, 8));
+ VEC_ST(outptr, out);
+
+ if (__builtin_expect(incol <= 8, 0)) break;
+
+ this0h = _mm_mullo_epi16(_mm_unpackhi_epi8(this0, pb_zero), pw_three);
+ last0h = _mm_unpackhi_epi8(p_last0, pb_zero);
+ next0h = _mm_unpackhi_epi8(p_next0, pb_zero);
+ last0h = _mm_add_epi16(last0h, pw_one);
+ next0h = _mm_add_epi16(next0h, pw_two);
+
+ outhe = _mm_add_epi16(this0h, last0h);
+ outho = _mm_add_epi16(this0h, next0h);
+ outhe = _mm_srli_epi16(outhe, 2);
+ outho = _mm_srli_epi16(outho, 2);
+
+ out = _mm_or_si128(outhe, _mm_slli_epi16(outho, 8));
+ VEC_ST(outptr + 16, out);
+
+ this0 = next0;
+ }
+ }
+}
+
+
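+/* Fancy h2v2 upsampling: rows are first blended vertically with 3:1 weights
+ * (thiscolsum = 3 * this + neighbor row), then expanded horizontally with
+ * 3:1 weights and rounding constants 8 and 7, as in the scalar jdsample.c. */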
+void jsimd_h2v2_fancy_upsample_e2k(int max_v_samp_factor,
+ JDIMENSION downsampled_width,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr)
+{
+ JSAMPARRAY output_data = *output_data_ptr;
+ JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1;
+ int inrow, outrow, incol;
+#ifdef NEED_ALIGN8
+ ALIGN8_COMMON
+ ALIGN8_VARS(src_1)
+ ALIGN8_VARS(src0)
+ ALIGN8_VARS(src1)
+#endif
+
+ __m128i pb_zero = _mm_setzero_si128();
+ __m128i this_1, this0, this1, out;
+ __m128i this_1l, this_1h, this0l, this0h, this1l, this1h,
+ lastcolsum_1h, lastcolsum1h,
+ p_lastcolsum_1l, p_lastcolsum_1h, p_lastcolsum1l, p_lastcolsum1h,
+ thiscolsum_1l, thiscolsum_1h, thiscolsum1l, thiscolsum1h,
+ nextcolsum_1l = pb_zero, nextcolsum_1h = pb_zero,
+ nextcolsum1l = pb_zero, nextcolsum1h = pb_zero,
+ p_nextcolsum_1l, p_nextcolsum_1h, p_nextcolsum1l, p_nextcolsum1h,
+ tmpl, tmph, outle, outhe, outlo, outho;
+
+ /* Constants */
+ __m128i pw_three = _mm_set1_epi16(3),
+ pw_seven = _mm_set1_epi16(7), pw_eight = _mm_set1_epi16(8),
+ next_index_lastcol = _mm_setr_epi8(
+ 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14, 15);
+
+ if (downsampled_width > 0)
+ for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
+
+ inptr_1 = input_data[inrow - 1];
+ inptr0 = input_data[inrow];
+ inptr1 = input_data[inrow + 1];
+ outptr0 = output_data[outrow++];
+ outptr1 = output_data[outrow++];
+
+ if (downsampled_width & 15) {
+ inptr_1[downsampled_width] = inptr_1[downsampled_width - 1];
+ inptr0[downsampled_width] = inptr0[downsampled_width - 1];
+ inptr1[downsampled_width] = inptr1[downsampled_width - 1];
+ }
+
+#ifdef NEED_ALIGN8
+ ALIGN8_START(inptr0, src0)
+ ALIGN8_START(inptr_1, src_1)
+ ALIGN8_START(inptr1, src1)
+ ALIGN8_READ16(this0, src0, 0)
+ ALIGN8_READ16(this_1, src_1, 0)
+ ALIGN8_READ16(this1, src1, 0)
+#else
+ this0 = VEC_LD(inptr0);
+ this_1 = VEC_LD(inptr_1);
+ this1 = VEC_LD(inptr1);
+#endif
+
+ this0l = _mm_unpacklo_epi8(this0, pb_zero);
+ this0h = _mm_unpackhi_epi8(this0, pb_zero);
+ this0l = _mm_mullo_epi16(this0l, pw_three);
+ this0h = _mm_mullo_epi16(this0h, pw_three);
+
+ this_1l = _mm_unpacklo_epi8(this_1, pb_zero);
+ this_1h = _mm_unpackhi_epi8(this_1, pb_zero);
+ thiscolsum_1l = _mm_add_epi16(this0l, this_1l);
+ thiscolsum_1h = _mm_add_epi16(this0h, this_1h);
+    lastcolsum_1h = _mm_bslli_si128(thiscolsum_1l, 14);
+
+ this1l = _mm_unpacklo_epi8(this1, pb_zero);
+ this1h = _mm_unpackhi_epi8(this1, pb_zero);
+ thiscolsum1l = _mm_add_epi16(this0l, this1l);
+ thiscolsum1h = _mm_add_epi16(this0h, this1h);
+ lastcolsum1h = _mm_bslli_si128(thiscolsum1l, 14);
+
+ PRAGMA_E2K("ivdep")
+ for (incol = downsampled_width; incol > 0;
+ incol -= 16, outptr0 += 32, outptr1 += 32) {
+
+ p_lastcolsum_1l = _mm_alignr_epi8(thiscolsum_1l, lastcolsum_1h, 14);
+ p_lastcolsum_1h = _mm_alignr_epi8(thiscolsum_1h, thiscolsum_1l, 14);
+ p_lastcolsum1l = _mm_alignr_epi8(thiscolsum1l, lastcolsum1h, 14);
+ p_lastcolsum1h = _mm_alignr_epi8(thiscolsum1h, thiscolsum1l, 14);
+ lastcolsum_1h = thiscolsum_1h;
+ lastcolsum1h = thiscolsum1h;
+
+ if (__builtin_expect(incol <= 16, 0)) {
+ p_nextcolsum_1l = _mm_alignr_epi8(thiscolsum_1h, thiscolsum_1l, 2);
+ p_nextcolsum_1h = _mm_shuffle_epi8(thiscolsum_1h, next_index_lastcol);
+ p_nextcolsum1l = _mm_alignr_epi8(thiscolsum1h, thiscolsum1l, 2);
+ p_nextcolsum1h = _mm_shuffle_epi8(thiscolsum1h, next_index_lastcol);
+ } else {
+#ifdef NEED_ALIGN8
+ ALIGN8_READ16(this0, src0, 1) src0_ptr += 2;
+ ALIGN8_READ16(this_1, src_1, 1) src_1_ptr += 2;
+ ALIGN8_READ16(this1, src1, 1) src1_ptr += 2;
+#else
+ this0 = VEC_LD(inptr0 + 16); inptr0 += 16;
+ this_1 = VEC_LD(inptr_1 + 16); inptr_1 += 16;
+ this1 = VEC_LD(inptr1 + 16); inptr1 += 16;
+#endif
+ this0l = _mm_unpacklo_epi8(this0, pb_zero);
+ this0h = _mm_unpackhi_epi8(this0, pb_zero);
+ this0l = _mm_mullo_epi16(this0l, pw_three);
+ this0h = _mm_mullo_epi16(this0h, pw_three);
+
+ this_1l = _mm_unpacklo_epi8(this_1, pb_zero);
+ this_1h = _mm_unpackhi_epi8(this_1, pb_zero);
+ nextcolsum_1l = _mm_add_epi16(this0l, this_1l);
+ nextcolsum_1h = _mm_add_epi16(this0h, this_1h);
+ p_nextcolsum_1l = _mm_alignr_epi8(thiscolsum_1h, thiscolsum_1l, 2);
+ p_nextcolsum_1h = _mm_alignr_epi8(nextcolsum_1l, thiscolsum_1h, 2);
+
+ this1l = _mm_unpacklo_epi8(this1, pb_zero);
+ this1h = _mm_unpackhi_epi8(this1, pb_zero);
+ nextcolsum1l = _mm_add_epi16(this0l, this1l);
+ nextcolsum1h = _mm_add_epi16(this0h, this1h);
+ p_nextcolsum1l = _mm_alignr_epi8(thiscolsum1h, thiscolsum1l, 2);
+ p_nextcolsum1h = _mm_alignr_epi8(nextcolsum1l, thiscolsum1h, 2);
+ }
+
+ /* Process the upper row */
+
+ tmpl = _mm_mullo_epi16(thiscolsum_1l, pw_three);
+ outle = _mm_add_epi16(tmpl, p_lastcolsum_1l);
+ outle = _mm_add_epi16(outle, pw_eight);
+ outle = _mm_srli_epi16(outle, 4);
+
+ outlo = _mm_add_epi16(tmpl, p_nextcolsum_1l);
+ outlo = _mm_add_epi16(outlo, pw_seven);
+ outlo = _mm_srli_epi16(outlo, 4);
+
+ out = _mm_or_si128(outle, _mm_slli_epi16(outlo, 8));
+ VEC_ST(outptr0, out);
+
+ /* Process the lower row */
+
+ tmpl = _mm_mullo_epi16(thiscolsum1l, pw_three);
+ outle = _mm_add_epi16(tmpl, p_lastcolsum1l);
+ outle = _mm_add_epi16(outle, pw_eight);
+ outle = _mm_srli_epi16(outle, 4);
+
+ outlo = _mm_add_epi16(tmpl, p_nextcolsum1l);
+ outlo = _mm_add_epi16(outlo, pw_seven);
+ outlo = _mm_srli_epi16(outlo, 4);
+
+ out = _mm_or_si128(outle, _mm_slli_epi16(outlo, 8));
+ VEC_ST(outptr1, out);
+
+ if (__builtin_expect(incol <= 8, 0)) break;
+
+ tmph = _mm_mullo_epi16(thiscolsum_1h, pw_three);
+ outhe = _mm_add_epi16(tmph, p_lastcolsum_1h);
+ outhe = _mm_add_epi16(outhe, pw_eight);
+ outhe = _mm_srli_epi16(outhe, 4);
+
+ outho = _mm_add_epi16(tmph, p_nextcolsum_1h);
+ outho = _mm_add_epi16(outho, pw_seven);
+ outho = _mm_srli_epi16(outho, 4);
+
+ out = _mm_or_si128(outhe, _mm_slli_epi16(outho, 8));
+ VEC_ST(outptr0 + 16, out);
+
+ tmph = _mm_mullo_epi16(thiscolsum1h, pw_three);
+ outhe = _mm_add_epi16(tmph, p_lastcolsum1h);
+ outhe = _mm_add_epi16(outhe, pw_eight);
+ outhe = _mm_srli_epi16(outhe, 4);
+
+ outho = _mm_add_epi16(tmph, p_nextcolsum1h);
+ outho = _mm_add_epi16(outho, pw_seven);
+ outho = _mm_srli_epi16(outho, 4);
+
+ out = _mm_or_si128(outhe, _mm_slli_epi16(outho, 8));
+ VEC_ST(outptr1 + 16, out);
+
+ thiscolsum_1l = nextcolsum_1l; thiscolsum_1h = nextcolsum_1h;
+ thiscolsum1l = nextcolsum1l; thiscolsum1h = nextcolsum1h;
+ }
+ }
+}
+
+
+/* These are rarely used (mainly just for decompressing YCCK images) */
+
+void jsimd_h2v1_upsample_e2k(int max_v_samp_factor,
+ JDIMENSION out_width,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr)
+{
+ JSAMPARRAY output_data = *output_data_ptr;
+ JSAMPROW inptr, outptr;
+ int inrow, incol;
+
+ __m128i in, inl, inh;
+#ifdef NEED_ALIGN8
+ ALIGN8_COMMON
+ ALIGN8_VARS(src)
+#endif
+
+ if (out_width > 0)
+ for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
+ inptr = input_data[inrow];
+ outptr = output_data[inrow];
+#ifdef NEED_ALIGN8
+ ALIGN8_START(inptr, src)
+#endif
+ PRAGMA_E2K("ivdep")
+ for (incol = out_width; incol > 0;
+ incol -= 32, outptr += 32) {
+#ifdef NEED_ALIGN8
+ ALIGN8_READ16(in, src, 0) src_ptr += 2;
+#else
+ in = VEC_LD(inptr); inptr += 16;
+#endif
+ inl = _mm_unpacklo_epi8(in, in);
+ inh = _mm_unpackhi_epi8(in, in);
+
+ VEC_ST(outptr, inl);
+ VEC_ST(outptr + 16, inh);
+ }
+ }
+}
+
+
+void jsimd_h2v2_upsample_e2k(int max_v_samp_factor,
+ JDIMENSION out_width,
+ JSAMPARRAY input_data,
+ JSAMPARRAY *output_data_ptr)
+{
+ JSAMPARRAY output_data = *output_data_ptr;
+ JSAMPROW inptr, outptr0, outptr1;
+ int inrow, outrow, incol;
+
+ __m128i in, inl, inh;
+#ifdef NEED_ALIGN8
+ ALIGN8_COMMON
+ ALIGN8_VARS(src)
+#endif
+
+ if (out_width > 0)
+ for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
+ inptr = input_data[inrow];
+ outptr0 = output_data[outrow++];
+ outptr1 = output_data[outrow++];
+#ifdef NEED_ALIGN8
+ ALIGN8_START(inptr, src)
+#endif
+ PRAGMA_E2K("ivdep")
+ for (incol = out_width; incol > 0;
+ incol -= 32, outptr0 += 32, outptr1 += 32) {
+#ifdef NEED_ALIGN8
+ ALIGN8_READ16(in, src, 0) src_ptr += 2;
+#else
+ in = VEC_LD(inptr); inptr += 16;
+#endif
+ inl = _mm_unpacklo_epi8(in, in);
+ inh = _mm_unpackhi_epi8(in, in);
+
+ VEC_ST(outptr0, inl);
+ VEC_ST(outptr1, inl);
+ VEC_ST(outptr0 + 16, inh);
+ VEC_ST(outptr1 + 16, inh);
+ }
+ }
+}
diff --git a/simd/e2k/jfdctflt-e2k.c b/simd/e2k/jfdctflt-e2k.c
new file mode 100644
index 0000000..e3c4d94
--- /dev/null
+++ b/simd/e2k/jfdctflt-e2k.c
@@ -0,0 +1,127 @@
+/*
+ * Elbrus optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2021, Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* FLOAT FORWARD DCT */
+
+#include "jsimd_e2k.h"
+
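+/* DO_FDCT performs the same 8-point butterfly as the scalar jfdctflt.c,
+ * computing four independent DCTs in parallel, one per float lane. */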
+#define DO_FDCT(in, out) { \
+ tmp0 = _mm_add_ps(in##0, in##7); \
+ tmp7 = _mm_sub_ps(in##0, in##7); \
+ tmp1 = _mm_add_ps(in##1, in##6); \
+ tmp6 = _mm_sub_ps(in##1, in##6); \
+ tmp2 = _mm_add_ps(in##2, in##5); \
+ tmp5 = _mm_sub_ps(in##2, in##5); \
+ tmp3 = _mm_add_ps(in##3, in##4); \
+ tmp4 = _mm_sub_ps(in##3, in##4); \
+ \
+ /* Even part */ \
+ \
+ tmp10 = _mm_add_ps(tmp0, tmp3); \
+ tmp13 = _mm_sub_ps(tmp0, tmp3); \
+ tmp11 = _mm_add_ps(tmp1, tmp2); \
+ tmp12 = _mm_sub_ps(tmp1, tmp2); \
+ \
+ out##0 = _mm_add_ps(tmp10, tmp11); \
+ out##4 = _mm_sub_ps(tmp10, tmp11); \
+ \
+ z1 = _mm_mul_ps(_mm_add_ps(tmp12, tmp13), pd_f0707); \
+ out##2 = _mm_add_ps(tmp13, z1); \
+ out##6 = _mm_sub_ps(tmp13, z1); \
+ \
+ /* Odd part */ \
+ \
+ tmp10 = _mm_add_ps(tmp4, tmp5); \
+ tmp11 = _mm_add_ps(tmp5, tmp6); \
+ tmp12 = _mm_add_ps(tmp6, tmp7); \
+ \
+ z5 = _mm_mul_ps(_mm_sub_ps(tmp10, tmp12), pd_f0382); \
+ z2 = _mm_add_ps(_mm_mul_ps(tmp10, pd_f0541), z5); \
+ z4 = _mm_add_ps(_mm_mul_ps(tmp12, pd_f1306), z5); \
+ z3 = _mm_mul_ps(tmp11, pd_f0707); \
+ \
+ z11 = _mm_add_ps(tmp7, z3); \
+ z13 = _mm_sub_ps(tmp7, z3); \
+ \
+ out##5 = _mm_add_ps(z13, z2); \
+ out##3 = _mm_sub_ps(z13, z2); \
+ out##1 = _mm_add_ps(z11, z4); \
+ out##7 = _mm_sub_ps(z11, z4); \
+}
+
+#define LOAD_DATA(a, b, c, d, l, i) \
+ l##a = _mm_loadu_ps(data + a * 8 + i); \
+ l##b = _mm_loadu_ps(data + b * 8 + i); \
+ l##c = _mm_loadu_ps(data + c * 8 + i); \
+ l##d = _mm_loadu_ps(data + d * 8 + i);
+
+#define STORE_DATA(a, b, c, d, l, i) \
+ _mm_storeu_ps(data + a * 8 + i, l##a); \
+ _mm_storeu_ps(data + b * 8 + i, l##b); \
+ _mm_storeu_ps(data + c * 8 + i, l##c); \
+ _mm_storeu_ps(data + d * 8 + i, l##d);
+
+
+void jsimd_fdct_float_e2k(FAST_FLOAT *data)
+{
+ __m128 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
+ tmp10, tmp11, tmp12, tmp13, z1, z2, z3, z4, z5, z11, z13;
+ __m128 l0, l1, l2, l3, l4, l5, l6, l7;
+ __m128 h0, h1, h2, h3, h4, h5, h6, h7;
+ __m128 x0, x1, x2, x3, x4, x5, x6, x7;
+ __m128 y0, y1, y2, y3, y4, y5, y6, y7;
+
+ /* Constants */
+ __m128 pd_f0382 = _mm_set1_ps(0.382683433f),
+ pd_f0541 = _mm_set1_ps(0.541196100f),
+ pd_f0707 = _mm_set1_ps(0.707106781f),
+ pd_f1306 = _mm_set1_ps(1.306562965f);
+
+ /* Pass 1: process columns */
+
+ LOAD_DATA(0, 1, 2, 3, x, 0)
+ LOAD_DATA(0, 1, 2, 3, y, 4)
+ TRANSPOSE_FLOAT(x0, x1, x2, x3, l0, l1, l2, l3)
+ TRANSPOSE_FLOAT(y0, y1, y2, y3, l4, l5, l6, l7)
+ DO_FDCT(l, l);
+
+ LOAD_DATA(4, 5, 6, 7, x, 0)
+ LOAD_DATA(4, 5, 6, 7, y, 4)
+ TRANSPOSE_FLOAT(x4, x5, x6, x7, h0, h1, h2, h3)
+ TRANSPOSE_FLOAT(y4, y5, y6, y7, h4, h5, h6, h7)
+ DO_FDCT(h, h);
+
+ /* Pass 2: process rows */
+
+ TRANSPOSE_FLOAT(l0, l1, l2, l3, x0, x1, x2, x3)
+ TRANSPOSE_FLOAT(h0, h1, h2, h3, x4, x5, x6, x7)
+ DO_FDCT(x, x);
+ STORE_DATA(0, 1, 2, 3, x, 0)
+ STORE_DATA(4, 5, 6, 7, x, 0)
+
+ TRANSPOSE_FLOAT(l4, l5, l6, l7, y0, y1, y2, y3)
+ TRANSPOSE_FLOAT(h4, h5, h6, h7, y4, y5, y6, y7)
+ DO_FDCT(y, y);
+ STORE_DATA(0, 1, 2, 3, y, 4)
+ STORE_DATA(4, 5, 6, 7, y, 4)
+}
diff --git a/simd/e2k/jfdctfst-e2k.c b/simd/e2k/jfdctfst-e2k.c
new file mode 100644
index 0000000..9e58f05
--- /dev/null
+++ b/simd/e2k/jfdctfst-e2k.c
@@ -0,0 +1,145 @@
+/*
+ * Elbrus optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2021, Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* FAST INTEGER FORWARD DCT */
+
+#include "jsimd_e2k.h"
+
+
+#define F_0_382 98 /* FIX(0.382683433) */
+#define F_0_541 139 /* FIX(0.541196100) */
+#define F_0_707 181 /* FIX(0.707106781) */
+#define F_1_306 334 /* FIX(1.306562965) */
+
+#define CONST_BITS 8
+#define PRE_MULTIPLY_SCALE_BITS 2
+#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
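+/* The inputs are shifted left by PRE_MULTIPLY_SCALE_BITS and the constants by
+ * CONST_SHIFT, so that _mm_mulhi_epi16 (high 16 bits of the 32-bit product)
+ * reproduces the CONST_BITS fixed-point scaling of the scalar jfdctfst.c. */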
+
+#define DO_FDCT() { \
+ /* Even part */ \
+ \
+ tmp10 = _mm_add_epi16(tmp0, tmp3); \
+ tmp13 = _mm_sub_epi16(tmp0, tmp3); \
+ tmp11 = _mm_add_epi16(tmp1, tmp2); \
+ tmp12 = _mm_sub_epi16(tmp1, tmp2); \
+ \
+ out0 = _mm_add_epi16(tmp10, tmp11); \
+ out4 = _mm_sub_epi16(tmp10, tmp11); \
+ \
+ z1 = _mm_add_epi16(tmp12, tmp13); \
+ z1 = _mm_slli_epi16(z1, PRE_MULTIPLY_SCALE_BITS); \
+ z1 = _mm_mulhi_epi16(z1, pw_0707); \
+ \
+ out2 = _mm_add_epi16(tmp13, z1); \
+ out6 = _mm_sub_epi16(tmp13, z1); \
+ \
+ /* Odd part */ \
+ \
+ tmp10 = _mm_add_epi16(tmp4, tmp5); \
+ tmp11 = _mm_add_epi16(tmp5, tmp6); \
+ tmp12 = _mm_add_epi16(tmp6, tmp7); \
+ \
+ tmp10 = _mm_slli_epi16(tmp10, PRE_MULTIPLY_SCALE_BITS); \
+ tmp12 = _mm_slli_epi16(tmp12, PRE_MULTIPLY_SCALE_BITS); \
+ z5 = _mm_sub_epi16(tmp10, tmp12); \
+ z5 = _mm_mulhi_epi16(z5, pw_0382); \
+ \
+ z2 = _mm_add_epi16(_mm_mulhi_epi16(tmp10, pw_0541), z5); \
+ z4 = _mm_add_epi16(_mm_mulhi_epi16(tmp12, pw_1306), z5); \
+ \
+ tmp11 = _mm_slli_epi16(tmp11, PRE_MULTIPLY_SCALE_BITS); \
+ z3 = _mm_mulhi_epi16(tmp11, pw_0707); \
+ \
+ z11 = _mm_add_epi16(tmp7, z3); \
+ z13 = _mm_sub_epi16(tmp7, z3); \
+ \
+ out5 = _mm_add_epi16(z13, z2); \
+ out3 = _mm_sub_epi16(z13, z2); \
+ out1 = _mm_add_epi16(z11, z4); \
+ out7 = _mm_sub_epi16(z11, z4); \
+}
+
+
+void jsimd_fdct_ifast_e2k(DCTELEM *data)
+{
+ __m128i row0, row1, row2, row3, row4, row5, row6, row7,
+ col0, col1, col2, col3, col4, col5, col6, col7,
+ tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
+ z1, z2, z3, z4, z5, z11, z13,
+ out0, out1, out2, out3, out4, out5, out6, out7;
+
+ /* Constants */
+ __m128i pw_0382 = _mm_set1_epi16(F_0_382 << CONST_SHIFT),
+ pw_0541 = _mm_set1_epi16(F_0_541 << CONST_SHIFT),
+ pw_0707 = _mm_set1_epi16(F_0_707 << CONST_SHIFT),
+ pw_1306 = _mm_set1_epi16(F_1_306 << CONST_SHIFT);
+
+ /* Pass 1: process rows */
+
+ row0 = VEC_LD(data + 0 * 8);
+ row1 = VEC_LD(data + 1 * 8);
+ row2 = VEC_LD(data + 2 * 8);
+ row3 = VEC_LD(data + 3 * 8);
+ row4 = VEC_LD(data + 4 * 8);
+ row5 = VEC_LD(data + 5 * 8);
+ row6 = VEC_LD(data + 6 * 8);
+ row7 = VEC_LD(data + 7 * 8);
+
+ TRANSPOSE(row, col);
+
+ tmp0 = _mm_add_epi16(col0, col7);
+ tmp7 = _mm_sub_epi16(col0, col7);
+ tmp1 = _mm_add_epi16(col1, col6);
+ tmp6 = _mm_sub_epi16(col1, col6);
+ tmp2 = _mm_add_epi16(col2, col5);
+ tmp5 = _mm_sub_epi16(col2, col5);
+ tmp3 = _mm_add_epi16(col3, col4);
+ tmp4 = _mm_sub_epi16(col3, col4);
+
+ DO_FDCT();
+
+ /* Pass 2: process columns */
+
+ TRANSPOSE(out, row);
+
+ tmp0 = _mm_add_epi16(row0, row7);
+ tmp7 = _mm_sub_epi16(row0, row7);
+ tmp1 = _mm_add_epi16(row1, row6);
+ tmp6 = _mm_sub_epi16(row1, row6);
+ tmp2 = _mm_add_epi16(row2, row5);
+ tmp5 = _mm_sub_epi16(row2, row5);
+ tmp3 = _mm_add_epi16(row3, row4);
+ tmp4 = _mm_sub_epi16(row3, row4);
+
+ DO_FDCT();
+
+ VEC_ST(data + 0 * 8, out0);
+ VEC_ST(data + 1 * 8, out1);
+ VEC_ST(data + 2 * 8, out2);
+ VEC_ST(data + 3 * 8, out3);
+ VEC_ST(data + 4 * 8, out4);
+ VEC_ST(data + 5 * 8, out5);
+ VEC_ST(data + 6 * 8, out6);
+ VEC_ST(data + 7 * 8, out7);
+}
diff --git a/simd/e2k/jfdctint-e2k.c b/simd/e2k/jfdctint-e2k.c
new file mode 100644
index 0000000..2200852
--- /dev/null
+++ b/simd/e2k/jfdctint-e2k.c
@@ -0,0 +1,255 @@
+/*
+ * Elbrus optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, 2020, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2021, Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* ACCURATE INTEGER FORWARD DCT */
+
+#include "jsimd_e2k.h"
+
+
+#define F_0_298 2446 /* FIX(0.298631336) */
+#define F_0_390 3196 /* FIX(0.390180644) */
+#define F_0_541 4433 /* FIX(0.541196100) */
+#define F_0_765 6270 /* FIX(0.765366865) */
+#define F_0_899 7373 /* FIX(0.899976223) */
+#define F_1_175 9633 /* FIX(1.175875602) */
+#define F_1_501 12299 /* FIX(1.501321110) */
+#define F_1_847 15137 /* FIX(1.847759065) */
+#define F_1_961 16069 /* FIX(1.961570560) */
+#define F_2_053 16819 /* FIX(2.053119869) */
+#define F_2_562 20995 /* FIX(2.562915447) */
+#define F_3_072 25172 /* FIX(3.072711026) */
+
+#define CONST_BITS 13
+#define PASS1_BITS 2
+#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
+#define DESCALE_P2 (CONST_BITS + PASS1_BITS)
+
+
+#define DO_FDCT_COMMON(PASS) { \
+ /* (Original) \
+ * z1 = (tmp12 + tmp13) * 0.541196100; \
+ * data2 = z1 + tmp13 * 0.765366865; \
+ * data6 = z1 + tmp12 * -1.847759065; \
+ * \
+ * (This implementation) \
+ * data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; \
+ * data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); \
+ */ \
+ \
+ tmp1312l = _mm_unpacklo_epi16(tmp13, tmp12); \
+ tmp1312h = _mm_unpackhi_epi16(tmp13, tmp12); \
+ \
+ out2l = _mm_add_epi32(_mm_madd_epi16(tmp1312l, pw_f130_f054), pd_descale_p##PASS); \
+ out2h = _mm_add_epi32(_mm_madd_epi16(tmp1312h, pw_f130_f054), pd_descale_p##PASS); \
+ out6l = _mm_add_epi32(_mm_madd_epi16(tmp1312l, pw_f054_mf130), pd_descale_p##PASS); \
+ out6h = _mm_add_epi32(_mm_madd_epi16(tmp1312h, pw_f054_mf130), pd_descale_p##PASS); \
+ \
+ out2l = _mm_srai_epi32(out2l, DESCALE_P##PASS); \
+ out2h = _mm_srai_epi32(out2h, DESCALE_P##PASS); \
+ out6l = _mm_srai_epi32(out6l, DESCALE_P##PASS); \
+ out6h = _mm_srai_epi32(out6h, DESCALE_P##PASS); \
+ \
+ out2 = _mm_packs_epi32(out2l, out2h); \
+ out6 = _mm_packs_epi32(out6l, out6h); \
+ \
+ /* Odd part */ \
+ \
+ z3 = _mm_add_epi16(tmp4, tmp6); \
+ z4 = _mm_add_epi16(tmp5, tmp7); \
+ \
+ /* (Original) \
+ * z5 = (z3 + z4) * 1.175875602; \
+ * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; \
+ * z3 += z5; z4 += z5; \
+ * \
+ * (This implementation) \
+ * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \
+ * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \
+ */ \
+ \
+ z34l = _mm_unpacklo_epi16(z3, z4); \
+ z34h = _mm_unpackhi_epi16(z3, z4); \
+ \
+ z3l = _mm_add_epi32(_mm_madd_epi16(z34l, pw_mf078_f117), pd_descale_p##PASS); \
+ z3h = _mm_add_epi32(_mm_madd_epi16(z34h, pw_mf078_f117), pd_descale_p##PASS); \
+ z4l = _mm_add_epi32(_mm_madd_epi16(z34l, pw_f117_f078), pd_descale_p##PASS); \
+ z4h = _mm_add_epi32(_mm_madd_epi16(z34h, pw_f117_f078), pd_descale_p##PASS); \
+ \
+ /* (Original) \
+ * z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; \
+ * tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; \
+ * tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; \
+ * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; \
+ * data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; \
+ * data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; \
+ * \
+ * (This implementation) \
+ * tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; \
+ * tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; \
+ * tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); \
+ * tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); \
+ * data7 = tmp4 + z3; data5 = tmp5 + z4; \
+ * data3 = tmp6 + z3; data1 = tmp7 + z4; \
+ */ \
+ \
+ tmp47l = _mm_unpacklo_epi16(tmp4, tmp7); \
+ tmp47h = _mm_unpackhi_epi16(tmp4, tmp7); \
+ \
+ out7l = _mm_add_epi32(_mm_madd_epi16(tmp47l, pw_mf060_mf089), z3l); \
+ out7h = _mm_add_epi32(_mm_madd_epi16(tmp47h, pw_mf060_mf089), z3h); \
+ out1l = _mm_add_epi32(_mm_madd_epi16(tmp47l, pw_mf089_f060), z4l); \
+ out1h = _mm_add_epi32(_mm_madd_epi16(tmp47h, pw_mf089_f060), z4h); \
+ \
+ out7l = _mm_srai_epi32(out7l, DESCALE_P##PASS); \
+ out7h = _mm_srai_epi32(out7h, DESCALE_P##PASS); \
+ out1l = _mm_srai_epi32(out1l, DESCALE_P##PASS); \
+ out1h = _mm_srai_epi32(out1h, DESCALE_P##PASS); \
+ \
+ out7 = _mm_packs_epi32(out7l, out7h); \
+ out1 = _mm_packs_epi32(out1l, out1h); \
+ \
+ tmp56l = _mm_unpacklo_epi16(tmp5, tmp6); \
+ tmp56h = _mm_unpackhi_epi16(tmp5, tmp6); \
+ \
+ out5l = _mm_add_epi32(_mm_madd_epi16(tmp56l, pw_mf050_mf256), z4l); \
+ out5h = _mm_add_epi32(_mm_madd_epi16(tmp56h, pw_mf050_mf256), z4h); \
+ out3l = _mm_add_epi32(_mm_madd_epi16(tmp56l, pw_mf256_f050), z3l); \
+ out3h = _mm_add_epi32(_mm_madd_epi16(tmp56h, pw_mf256_f050), z3h); \
+ \
+ out5l = _mm_srai_epi32(out5l, DESCALE_P##PASS); \
+ out5h = _mm_srai_epi32(out5h, DESCALE_P##PASS); \
+ out3l = _mm_srai_epi32(out3l, DESCALE_P##PASS); \
+ out3h = _mm_srai_epi32(out3h, DESCALE_P##PASS); \
+ \
+ out5 = _mm_packs_epi32(out5l, out5h); \
+ out3 = _mm_packs_epi32(out3l, out3h); \
+}
+
+#define DO_FDCT_PASS1() { \
+ /* Even part */ \
+ \
+ tmp10 = _mm_add_epi16(tmp0, tmp3); \
+ tmp13 = _mm_sub_epi16(tmp0, tmp3); \
+ tmp11 = _mm_add_epi16(tmp1, tmp2); \
+ tmp12 = _mm_sub_epi16(tmp1, tmp2); \
+ \
+ out0 = _mm_add_epi16(tmp10, tmp11); \
+ out0 = _mm_slli_epi16(out0, PASS1_BITS); \
+ out4 = _mm_sub_epi16(tmp10, tmp11); \
+ out4 = _mm_slli_epi16(out4, PASS1_BITS); \
+ \
+ DO_FDCT_COMMON(1); \
+}
+
+#define DO_FDCT_PASS2() { \
+ /* Even part */ \
+ \
+ tmp10 = _mm_add_epi16(tmp0, tmp3); \
+ tmp13 = _mm_sub_epi16(tmp0, tmp3); \
+ tmp11 = _mm_add_epi16(tmp1, tmp2); \
+ tmp12 = _mm_sub_epi16(tmp1, tmp2); \
+ \
+ out0 = _mm_add_epi16(tmp10, tmp11); \
+ out0 = _mm_add_epi16(out0, pw_descale_p2x); \
+ out0 = _mm_srai_epi16(out0, PASS1_BITS); \
+ out4 = _mm_sub_epi16(tmp10, tmp11); \
+ out4 = _mm_add_epi16(out4, pw_descale_p2x); \
+ out4 = _mm_srai_epi16(out4, PASS1_BITS); \
+ \
+ DO_FDCT_COMMON(2); \
+}
+
+
+void jsimd_fdct_islow_e2k(DCTELEM *data)
+{
+ __m128i row0, row1, row2, row3, row4, row5, row6, row7,
+ col0, col1, col2, col3, col4, col5, col6, col7,
+ tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
+ tmp47l, tmp47h, tmp56l, tmp56h, tmp1312l, tmp1312h,
+ z3, z4, z34l, z34h,
+ out0, out1, out2, out3, out4, out5, out6, out7;
+ __m128i z3l, z3h, z4l, z4h,
+ out1l, out1h, out2l, out2h, out3l, out3h, out5l, out5h, out6l, out6h,
+ out7l, out7h;
+
+ /* Constants */
+ __m128i pw_f130_f054 = _mm_setr_epi16(__4X2(F_0_541 + F_0_765, F_0_541)),
+ pw_f054_mf130 = _mm_setr_epi16(__4X2(F_0_541, F_0_541 - F_1_847)),
+ pw_mf078_f117 = _mm_setr_epi16(__4X2(F_1_175 - F_1_961, F_1_175)),
+ pw_f117_f078 = _mm_setr_epi16(__4X2(F_1_175, F_1_175 - F_0_390)),
+ pw_mf060_mf089 = _mm_setr_epi16(__4X2(F_0_298 - F_0_899, -F_0_899)),
+ pw_mf089_f060 = _mm_setr_epi16(__4X2(-F_0_899, F_1_501 - F_0_899)),
+ pw_mf050_mf256 = _mm_setr_epi16(__4X2(F_2_053 - F_2_562, -F_2_562)),
+ pw_mf256_f050 = _mm_setr_epi16(__4X2(-F_2_562, F_3_072 - F_2_562)),
+ pw_descale_p2x = _mm_set1_epi16(1 << (PASS1_BITS - 1)),
+ pd_descale_p1 = _mm_set1_epi32(1 << (DESCALE_P1 - 1)),
+ pd_descale_p2 = _mm_set1_epi32(1 << (DESCALE_P2 - 1));
+
+ /* Pass 1: process rows */
+
+ row0 = VEC_LD(data + 0 * 8);
+ row1 = VEC_LD(data + 1 * 8);
+ row2 = VEC_LD(data + 2 * 8);
+ row3 = VEC_LD(data + 3 * 8);
+ row4 = VEC_LD(data + 4 * 8);
+ row5 = VEC_LD(data + 5 * 8);
+ row6 = VEC_LD(data + 6 * 8);
+ row7 = VEC_LD(data + 7 * 8);
+
+ TRANSPOSE(row, col);
+
+ tmp0 = _mm_add_epi16(col0, col7);
+ tmp7 = _mm_sub_epi16(col0, col7);
+ tmp1 = _mm_add_epi16(col1, col6);
+ tmp6 = _mm_sub_epi16(col1, col6);
+ tmp2 = _mm_add_epi16(col2, col5);
+ tmp5 = _mm_sub_epi16(col2, col5);
+ tmp3 = _mm_add_epi16(col3, col4);
+ tmp4 = _mm_sub_epi16(col3, col4);
+
+ DO_FDCT_PASS1();
+
+ /* Pass 2: process columns */
+
+ TRANSPOSE(out, row);
+
+ tmp0 = _mm_add_epi16(row0, row7);
+ tmp7 = _mm_sub_epi16(row0, row7);
+ tmp1 = _mm_add_epi16(row1, row6);
+ tmp6 = _mm_sub_epi16(row1, row6);
+ tmp2 = _mm_add_epi16(row2, row5);
+ tmp5 = _mm_sub_epi16(row2, row5);
+ tmp3 = _mm_add_epi16(row3, row4);
+ tmp4 = _mm_sub_epi16(row3, row4);
+
+ DO_FDCT_PASS2();
+
+ VEC_ST(data + 0 * 8, out0);
+ VEC_ST(data + 1 * 8, out1);
+ VEC_ST(data + 2 * 8, out2);
+ VEC_ST(data + 3 * 8, out3);
+ VEC_ST(data + 4 * 8, out4);
+ VEC_ST(data + 5 * 8, out5);
+ VEC_ST(data + 6 * 8, out6);
+ VEC_ST(data + 7 * 8, out7);
+}
diff --git a/simd/e2k/jidctflt-e2k.c b/simd/e2k/jidctflt-e2k.c
new file mode 100644
index 0000000..7682965
--- /dev/null
+++ b/simd/e2k/jidctflt-e2k.c
@@ -0,0 +1,215 @@
+/*
+ * Elbrus optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2021, Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* FLOAT INVERSE DCT */
+
+#include "jsimd_e2k.h"
+
+#define DO_IDCT(in, out) { \
+ /* Even part */ \
+ \
+ tmp10 = _mm_add_ps(in##0, in##4); \
+ tmp11 = _mm_sub_ps(in##0, in##4); \
+ \
+ tmp13 = _mm_add_ps(in##2, in##6); \
+ tmp12 = _mm_sub_ps(in##2, in##6); \
+ tmp12 = _mm_sub_ps(_mm_mul_ps(tmp12, pd_f1414), tmp13); \
+ \
+ tmp0 = _mm_add_ps(tmp10, tmp13); \
+ tmp3 = _mm_sub_ps(tmp10, tmp13); \
+ tmp1 = _mm_add_ps(tmp11, tmp12); \
+ tmp2 = _mm_sub_ps(tmp11, tmp12); \
+ \
+ /* Odd part */ \
+ \
+ z13 = _mm_add_ps(in##5, in##3); \
+ z10 = _mm_sub_ps(in##5, in##3); \
+ z11 = _mm_add_ps(in##1, in##7); \
+ z12 = _mm_sub_ps(in##1, in##7); \
+ \
+ tmp7 = _mm_add_ps(z11, z13); \
+ tmp11 = _mm_sub_ps(z11, z13); \
+ tmp11 = _mm_mul_ps(tmp11, pd_f1414); \
+ \
+ z5 = _mm_mul_ps(_mm_add_ps(z10, z12), pd_f1847); \
+ tmp10 = _mm_sub_ps(z5, _mm_mul_ps(z12, pd_f1082)); \
+ tmp12 = _mm_sub_ps(z5, _mm_mul_ps(z10, pd_f2613)); \
+ \
+ tmp6 = _mm_sub_ps(tmp12, tmp7); \
+ tmp5 = _mm_sub_ps(tmp11, tmp6); \
+ tmp4 = _mm_sub_ps(tmp10, tmp5); \
+ \
+ out##0 = _mm_add_ps(tmp0, tmp7); \
+ out##7 = _mm_sub_ps(tmp0, tmp7); \
+ out##1 = _mm_add_ps(tmp1, tmp6); \
+ out##6 = _mm_sub_ps(tmp1, tmp6); \
+ out##2 = _mm_add_ps(tmp2, tmp5); \
+ out##5 = _mm_sub_ps(tmp2, tmp5); \
+ out##3 = _mm_add_ps(tmp3, tmp4); \
+ out##4 = _mm_sub_ps(tmp3, tmp4); \
+}
+
+#define QUANT_MUL(a, b, c, d, l, lo, i) \
+ out0 = _mm_srai_epi32(_mm_unpack##lo##_epi16(col##a, col##a), 16); \
+ out1 = _mm_srai_epi32(_mm_unpack##lo##_epi16(col##b, col##b), 16); \
+ out2 = _mm_srai_epi32(_mm_unpack##lo##_epi16(col##c, col##c), 16); \
+ out3 = _mm_srai_epi32(_mm_unpack##lo##_epi16(col##d, col##d), 16); \
+ l##a = _mm_cvtepi32_ps(out0); \
+ l##b = _mm_cvtepi32_ps(out1); \
+ l##c = _mm_cvtepi32_ps(out2); \
+ l##d = _mm_cvtepi32_ps(out3); \
+ l##a = _mm_mul_ps(l##a, _mm_load_ps(dct_table + a * 8 + i)); \
+ l##b = _mm_mul_ps(l##b, _mm_load_ps(dct_table + b * 8 + i)); \
+ l##c = _mm_mul_ps(l##c, _mm_load_ps(dct_table + c * 8 + i)); \
+ l##d = _mm_mul_ps(l##d, _mm_load_ps(dct_table + d * 8 + i));
+
+
+void jsimd_idct_float_e2k(void *dct_table_, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+ float *dct_table = (float *)dct_table_;
+
+ __m128i col0, col1, col2, col3, col4, col5, col6, col7,
+ out0, out1, out2, out3, out4, out5, out6, out7, row0, row1, row2, row3;
+ __m128 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
+ tmp10, tmp11, tmp12, tmp13, z5, z10, z11, z12, z13;
+ __m128 l0, l1, l2, l3, l4, l5, l6, l7;
+ __m128 h0, h1, h2, h3, h4, h5, h6, h7;
+ __m128 x0, x1, x2, x3, x4, x5, x6, x7;
+ __m128 y0, y1, y2, y3, y4, y5, y6, y7;
+
+ /* Constants */
+ __m128 pd_f1414 = _mm_set1_ps(1.414213562f),
+ pd_f1847 = _mm_set1_ps(1.847759065f),
+ pd_f1082 = _mm_set1_ps(1.082392200f),
+ pd_f2613 = _mm_set1_ps(2.613125930f);
+
+ /* Pass 1: process columns */
+
+ col0 = VEC_LD(coef_block + 0 * 8);
+ col1 = VEC_LD(coef_block + 1 * 8);
+ col2 = VEC_LD(coef_block + 2 * 8);
+ col3 = VEC_LD(coef_block + 3 * 8);
+ col4 = VEC_LD(coef_block + 4 * 8);
+ col5 = VEC_LD(coef_block + 5 * 8);
+ col6 = VEC_LD(coef_block + 6 * 8);
+ col7 = VEC_LD(coef_block + 7 * 8);
+
+ out1 = _mm_or_si128(col1, col2);
+ out2 = _mm_or_si128(col3, col4);
+ out1 = _mm_or_si128(out1, out2);
+ out3 = _mm_or_si128(col5, col6);
+ out3 = _mm_or_si128(out3, col7);
+ out1 = _mm_or_si128(out1, out3);
+
+ if (VEC_ISZERO(out1)) {
+ /* AC terms all zero */
+
+ out0 = _mm_srai_epi32(_mm_unpacklo_epi16(col0, col0), 16);
+ out1 = _mm_srai_epi32(_mm_unpackhi_epi16(col0, col0), 16);
+ tmp0 = _mm_cvtepi32_ps(out0);
+ tmp1 = _mm_cvtepi32_ps(out1);
+ tmp0 = _mm_mul_ps(tmp0, _mm_load_ps(dct_table));
+ tmp1 = _mm_mul_ps(tmp1, _mm_load_ps(dct_table + 4));
+
+ l0 = h0 = _mm_shuffle_ps(tmp0, tmp0, 0x00);
+ l1 = h1 = _mm_shuffle_ps(tmp0, tmp0, 0x55);
+ l2 = h2 = _mm_shuffle_ps(tmp0, tmp0, 0xaa);
+ l3 = h3 = _mm_shuffle_ps(tmp0, tmp0, 0xff);
+ l4 = h4 = _mm_shuffle_ps(tmp1, tmp1, 0x00);
+ l5 = h5 = _mm_shuffle_ps(tmp1, tmp1, 0x55);
+ l6 = h6 = _mm_shuffle_ps(tmp1, tmp1, 0xaa);
+ l7 = h7 = _mm_shuffle_ps(tmp1, tmp1, 0xff);
+
+ } else {
+
+ QUANT_MUL(0, 2, 4, 6, l, lo, 0)
+ QUANT_MUL(1, 3, 5, 7, l, lo, 0)
+ DO_IDCT(l, x);
+
+ QUANT_MUL(0, 2, 4, 6, h, hi, 4)
+ QUANT_MUL(1, 3, 5, 7, h, hi, 4)
+ DO_IDCT(h, y);
+
+ TRANSPOSE_FLOAT(x0, x1, x2, x3, l0, l1, l2, l3)
+ TRANSPOSE_FLOAT(x4, x5, x6, x7, h0, h1, h2, h3)
+ TRANSPOSE_FLOAT(y0, y1, y2, y3, l4, l5, l6, l7)
+ TRANSPOSE_FLOAT(y4, y5, y6, y7, h4, h5, h6, h7)
+ }
+
+ /* Pass 2: process rows */
+
+ DO_IDCT(l, x);
+ DO_IDCT(h, y);
+
+#ifdef JSIMD_SAME_ROUNDING
+#define OUT_ROUND(i) \
+ tmp0 = _mm_add_ps(_mm_mul_ps(x##i, pd_f0125), pd_cj_rnd); \
+ tmp1 = _mm_add_ps(_mm_mul_ps(y##i, pd_f0125), pd_cj_rnd); \
+ out##i = _mm_packs_epi32(_mm_cvttps_epi32(tmp0), _mm_cvttps_epi32(tmp1));
+
+ {
+ __m128 pd_cj_rnd = _mm_set1_ps(0.5f + CENTERJSAMPLE),
+ pd_f0125 = _mm_set1_ps(0.125f);
+
+ OUT_ROUND(0) OUT_ROUND(1)
+ OUT_ROUND(2) OUT_ROUND(3)
+ OUT_ROUND(4) OUT_ROUND(5)
+ OUT_ROUND(6) OUT_ROUND(7)
+ }
+ row0 = _mm_packus_epi16(out0, out1);
+ row1 = _mm_packus_epi16(out2, out3);
+ row2 = _mm_packus_epi16(out4, out5);
+ row3 = _mm_packus_epi16(out6, out7);
+
+ TRANSPOSE8(row, col) TRANSPOSE8(col, row) TRANSPOSE8(row, col)
+#else /* faster, slightly differs in rounding */
+#define OUT_ROUND(i, a, b) out##i = _mm_blendv_epi8( \
+ _mm_slli_epi32(_mm_castps_si128(_mm_add_ps(b, pd_round)), 16), \
+ _mm_castps_si128(_mm_add_ps(a, pd_round)), pd_mask);
+
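+  /* pd_round is a magic-number bias: after the addition, the descaled result
+   * plus CENTERJSAMPLE sits in the low mantissa bits of each float, so the
+   * 16-bit samples can be read from the reinterpreted integers directly. */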
+ {
+ __m128i pd_mask = _mm_set1_epi32(0xffff);
+ __m128 pd_round = _mm_set1_ps((3 << 22 | CENTERJSAMPLE) * 8);
+
+ OUT_ROUND(0, x0, x4) OUT_ROUND(1, y0, y4)
+ OUT_ROUND(2, x1, x5) OUT_ROUND(3, y1, y5)
+ OUT_ROUND(4, x2, x6) OUT_ROUND(5, y2, y6)
+ OUT_ROUND(6, x3, x7) OUT_ROUND(7, y3, y7)
+ }
+ row0 = _mm_packus_epi16(out0, out1);
+ row1 = _mm_packus_epi16(out2, out3);
+ row2 = _mm_packus_epi16(out4, out5);
+ row3 = _mm_packus_epi16(out6, out7);
+
+ TRANSPOSE8(row, out) TRANSPOSE8(out, col)
+#endif
+ VEC_STL(output_buf[0] + output_col, col0);
+ VEC_STH(output_buf[1] + output_col, col0);
+ VEC_STL(output_buf[2] + output_col, col1);
+ VEC_STH(output_buf[3] + output_col, col1);
+ VEC_STL(output_buf[4] + output_col, col2);
+ VEC_STH(output_buf[5] + output_col, col2);
+ VEC_STL(output_buf[6] + output_col, col3);
+ VEC_STH(output_buf[7] + output_col, col3);
+}
diff --git a/simd/e2k/jidctfst-e2k.c b/simd/e2k/jidctfst-e2k.c
new file mode 100644
index 0000000..18bc425
--- /dev/null
+++ b/simd/e2k/jidctfst-e2k.c
@@ -0,0 +1,187 @@
+/*
+ * Elbrus optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2021, Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* FAST INTEGER INVERSE DCT */
+
+#include "jsimd_e2k.h"
+
+
+#define F_1_082 277 /* FIX(1.082392200) */
+#define F_1_414 362 /* FIX(1.414213562) */
+#define F_1_847 473 /* FIX(1.847759065) */
+#define F_2_613 669 /* FIX(2.613125930) */
+#define F_1_613 (F_2_613 - 256) /* FIX(2.613125930) - FIX(1) */
+
+#define CONST_BITS 8
+#define PASS1_BITS 2
+#define PRE_MULTIPLY_SCALE_BITS 2
+#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+
+#define DO_IDCT(in) { \
+ /* Even part */ \
+ \
+ tmp10 = _mm_add_epi16(in##0, in##4); \
+ tmp11 = _mm_sub_epi16(in##0, in##4); \
+ tmp13 = _mm_add_epi16(in##2, in##6); \
+ \
+ tmp12 = _mm_sub_epi16(in##2, in##6); \
+ tmp12 = _mm_slli_epi16(tmp12, PRE_MULTIPLY_SCALE_BITS); \
+ tmp12 = _mm_mulhi_epi16(tmp12, pw_F1414); \
+ tmp12 = _mm_sub_epi16(tmp12, tmp13); \
+ \
+ tmp0 = _mm_add_epi16(tmp10, tmp13); \
+ tmp3 = _mm_sub_epi16(tmp10, tmp13); \
+ tmp1 = _mm_add_epi16(tmp11, tmp12); \
+ tmp2 = _mm_sub_epi16(tmp11, tmp12); \
+ \
+ /* Odd part */ \
+ \
+ z13 = _mm_add_epi16(in##5, in##3); \
+ z10 = _mm_sub_epi16(in##5, in##3); \
+ z10s = _mm_slli_epi16(z10, PRE_MULTIPLY_SCALE_BITS); \
+ z11 = _mm_add_epi16(in##1, in##7); \
+ z12s = _mm_sub_epi16(in##1, in##7); \
+ z12s = _mm_slli_epi16(z12s, PRE_MULTIPLY_SCALE_BITS); \
+ \
+ tmp11 = _mm_sub_epi16(z11, z13); \
+ tmp11 = _mm_slli_epi16(tmp11, PRE_MULTIPLY_SCALE_BITS); \
+ tmp11 = _mm_mulhi_epi16(tmp11, pw_F1414); \
+ \
+ tmp7 = _mm_add_epi16(z11, z13); \
+ \
+ /* To avoid overflow... \
+ * \
+ * (Original) \
+ * tmp12 = -2.613125930 * z10 + z5; \
+ * \
+ * (This implementation) \
+ * tmp12 = (-1.613125930 - 1) * z10 + z5; \
+ * = -1.613125930 * z10 - z10 + z5; \
+ */ \
+ \
+ z5 = _mm_add_epi16(z10s, z12s); \
+ z5 = _mm_mulhi_epi16(z5, pw_F1847); \
+ \
+ tmp10 = _mm_mulhi_epi16(z12s, pw_F1082); \
+ tmp10 = _mm_sub_epi16(tmp10, z5); \
+ tmp12 = _mm_add_epi16(_mm_mulhi_epi16(z10s, pw_MF1613), z5); \
+ tmp12 = _mm_sub_epi16(tmp12, z10); \
+ \
+ tmp6 = _mm_sub_epi16(tmp12, tmp7); \
+ tmp5 = _mm_sub_epi16(tmp11, tmp6); \
+ tmp4 = _mm_add_epi16(tmp10, tmp5); \
+ \
+ out0 = _mm_add_epi16(tmp0, tmp7); \
+ out1 = _mm_add_epi16(tmp1, tmp6); \
+ out2 = _mm_add_epi16(tmp2, tmp5); \
+ out3 = _mm_sub_epi16(tmp3, tmp4); \
+ out4 = _mm_add_epi16(tmp3, tmp4); \
+ out5 = _mm_sub_epi16(tmp2, tmp5); \
+ out6 = _mm_sub_epi16(tmp1, tmp6); \
+ out7 = _mm_sub_epi16(tmp0, tmp7); \
+}
+
+
+void jsimd_idct_ifast_e2k(void *dct_table_, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+ short *dct_table = (short *)dct_table_;
+
+ __m128i row0, row1, row2, row3, row4, row5, row6, row7,
+ col0, col1, col2, col3, col4, col5, col6, col7,
+ quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7,
+ tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
+ z5, z10, z10s, z11, z12s, z13,
+ out0, out1, out2, out3, out4, out5, out6, out7;
+
+ /* Constants */
+ __m128i pw_F1414 = _mm_set1_epi16(F_1_414 << CONST_SHIFT),
+ pw_F1847 = _mm_set1_epi16(F_1_847 << CONST_SHIFT),
+ pw_MF1613 = _mm_set1_epi16(-F_1_613 << CONST_SHIFT),
+ pw_F1082 = _mm_set1_epi16(F_1_082 << CONST_SHIFT);
+
+ /* Pass 1: process columns */
+
+ col0 = VEC_LD(coef_block + 0 * 8);
+ col1 = VEC_LD(coef_block + 1 * 8);
+ col2 = VEC_LD(coef_block + 2 * 8);
+ col3 = VEC_LD(coef_block + 3 * 8);
+ col4 = VEC_LD(coef_block + 4 * 8);
+ col5 = VEC_LD(coef_block + 5 * 8);
+ col6 = VEC_LD(coef_block + 6 * 8);
+ col7 = VEC_LD(coef_block + 7 * 8);
+
+ tmp1 = _mm_or_si128(col1, col2);
+ tmp2 = _mm_or_si128(col3, col4);
+ tmp1 = _mm_or_si128(tmp1, tmp2);
+ tmp3 = _mm_or_si128(col5, col6);
+ tmp3 = _mm_or_si128(tmp3, col7);
+ tmp1 = _mm_or_si128(tmp1, tmp3);
+
+ quant0 = VEC_LD(dct_table);
+ col0 = _mm_mullo_epi16(col0, quant0);
+
+ if (VEC_ISZERO(tmp1)) {
+ /* AC terms all zero */
+
+ IDCT_SPLAT8(col0);
+
+ } else {
+
+ quant1 = VEC_LD(dct_table + 1 * 8);
+ quant2 = VEC_LD(dct_table + 2 * 8);
+ quant3 = VEC_LD(dct_table + 3 * 8);
+ quant4 = VEC_LD(dct_table + 4 * 8);
+ quant5 = VEC_LD(dct_table + 5 * 8);
+ quant6 = VEC_LD(dct_table + 6 * 8);
+ quant7 = VEC_LD(dct_table + 7 * 8);
+
+ col1 = _mm_mullo_epi16(col1, quant1);
+ col2 = _mm_mullo_epi16(col2, quant2);
+ col3 = _mm_mullo_epi16(col3, quant3);
+ col4 = _mm_mullo_epi16(col4, quant4);
+ col5 = _mm_mullo_epi16(col5, quant5);
+ col6 = _mm_mullo_epi16(col6, quant6);
+ col7 = _mm_mullo_epi16(col7, quant7);
+
+ DO_IDCT(col);
+
+ TRANSPOSE(out, row);
+ }
+
+ /* Pass 2: process rows */
+
+ DO_IDCT(row);
+
+ out0 = _mm_srai_epi16(out0, PASS1_BITS + 3);
+ out1 = _mm_srai_epi16(out1, PASS1_BITS + 3);
+ out2 = _mm_srai_epi16(out2, PASS1_BITS + 3);
+ out3 = _mm_srai_epi16(out3, PASS1_BITS + 3);
+ out4 = _mm_srai_epi16(out4, PASS1_BITS + 3);
+ out5 = _mm_srai_epi16(out5, PASS1_BITS + 3);
+ out6 = _mm_srai_epi16(out6, PASS1_BITS + 3);
+ out7 = _mm_srai_epi16(out7, PASS1_BITS + 3);
+
+ IDCT_SAVE();
+}
diff --git a/simd/e2k/jidctint-e2k.c b/simd/e2k/jidctint-e2k.c
new file mode 100644
index 0000000..7bb79c0
--- /dev/null
+++ b/simd/e2k/jidctint-e2k.c
@@ -0,0 +1,294 @@
+/*
+ * Elbrus optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, 2020, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2021, Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* ACCURATE INTEGER INVERSE DCT */
+
+#include "jsimd_e2k.h"
+
+
+#define F_0_298 2446 /* FIX(0.298631336) */
+#define F_0_390 3196 /* FIX(0.390180644) */
+#define F_0_541 4433 /* FIX(0.541196100) */
+#define F_0_765 6270 /* FIX(0.765366865) */
+#define F_0_899 7373 /* FIX(0.899976223) */
+#define F_1_175 9633 /* FIX(1.175875602) */
+#define F_1_501 12299 /* FIX(1.501321110) */
+#define F_1_847 15137 /* FIX(1.847759065) */
+#define F_1_961 16069 /* FIX(1.961570560) */
+#define F_2_053 16819 /* FIX(2.053119869) */
+#define F_2_562 20995 /* FIX(2.562915447) */
+#define F_3_072 25172 /* FIX(3.072711026) */
+
+#define CONST_BITS 13
+#define PASS1_BITS 2
+#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
+#define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3)
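+/* The extra 3 bits in DESCALE_P2 fold the final division by 8 of the 2-D IDCT
+ * into the pass-2 descale. */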
+
+
+#define DO_IDCT(in, PASS) { \
+ /* Even part \
+ * \
+ * (Original) \
+ * z1 = (z2 + z3) * 0.541196100; \
+ * tmp2 = z1 + z3 * -1.847759065; \
+ * tmp3 = z1 + z2 * 0.765366865; \
+ * \
+ * (This implementation) \
+ * tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); \
+ * tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; \
+ */ \
+ \
+ in##26l = _mm_unpacklo_epi16(in##2, in##6); \
+ in##26h = _mm_unpackhi_epi16(in##2, in##6); \
+ \
+ tmp3l = _mm_madd_epi16(in##26l, pw_f130_f054); \
+ tmp3h = _mm_madd_epi16(in##26h, pw_f130_f054); \
+ tmp2l = _mm_madd_epi16(in##26l, pw_f054_mf130); \
+ tmp2h = _mm_madd_epi16(in##26h, pw_f054_mf130); \
+ \
+ tmp0 = _mm_add_epi16(in##0, in##4); \
+ tmp1 = _mm_sub_epi16(in##0, in##4); \
+ \
+ tmp0l = _mm_unpacklo_epi16(pw_zero, tmp0); \
+ tmp0h = _mm_unpackhi_epi16(pw_zero, tmp0); \
+ tmp0l = _mm_srai_epi32(tmp0l, 16 - CONST_BITS); \
+ tmp0h = _mm_srai_epi32(tmp0h, 16 - CONST_BITS); \
+ tmp0l = _mm_add_epi32(tmp0l, pd_descale_p##PASS); \
+ tmp0h = _mm_add_epi32(tmp0h, pd_descale_p##PASS); \
+ \
+ tmp10l = _mm_add_epi32(tmp0l, tmp3l); \
+ tmp10h = _mm_add_epi32(tmp0h, tmp3h); \
+ tmp13l = _mm_sub_epi32(tmp0l, tmp3l); \
+ tmp13h = _mm_sub_epi32(tmp0h, tmp3h); \
+ \
+ tmp1l = _mm_unpacklo_epi16(pw_zero, tmp1); \
+ tmp1h = _mm_unpackhi_epi16(pw_zero, tmp1); \
+ tmp1l = _mm_srai_epi32(tmp1l, 16 - CONST_BITS); \
+ tmp1h = _mm_srai_epi32(tmp1h, 16 - CONST_BITS); \
+ tmp1l = _mm_add_epi32(tmp1l, pd_descale_p##PASS); \
+ tmp1h = _mm_add_epi32(tmp1h, pd_descale_p##PASS); \
+ \
+ tmp11l = _mm_add_epi32(tmp1l, tmp2l); \
+ tmp11h = _mm_add_epi32(tmp1h, tmp2h); \
+ tmp12l = _mm_sub_epi32(tmp1l, tmp2l); \
+ tmp12h = _mm_sub_epi32(tmp1h, tmp2h); \
+ \
+ /* Odd part */ \
+ \
+ z3 = _mm_add_epi16(in##3, in##7); \
+ z4 = _mm_add_epi16(in##1, in##5); \
+ \
+ /* (Original) \
+ * z5 = (z3 + z4) * 1.175875602; \
+ * z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; \
+ * z3 += z5; z4 += z5; \
+ * \
+ * (This implementation) \
+ * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \
+ * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \
+ */ \
+ \
+ z34l = _mm_unpacklo_epi16(z3, z4); \
+ z34h = _mm_unpackhi_epi16(z3, z4); \
+ \
+ z3l = _mm_madd_epi16(z34l, pw_mf078_f117); \
+ z3h = _mm_madd_epi16(z34h, pw_mf078_f117); \
+ z4l = _mm_madd_epi16(z34l, pw_f117_f078); \
+ z4h = _mm_madd_epi16(z34h, pw_f117_f078); \
+ \
+ /* (Original) \
+ * z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; \
+ * tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; \
+ * tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; \
+ * z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; \
+ * tmp0 += z1 + z3; tmp1 += z2 + z4; \
+ * tmp2 += z2 + z3; tmp3 += z1 + z4; \
+ * \
+ * (This implementation) \
+ * tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; \
+ * tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; \
+ * tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); \
+ * tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); \
+ * tmp0 += z3; tmp1 += z4; \
+ * tmp2 += z3; tmp3 += z4; \
+ */ \
+ \
+ in##71l = _mm_unpacklo_epi16(in##7, in##1); \
+ in##71h = _mm_unpackhi_epi16(in##7, in##1); \
+ \
+ tmp0l = _mm_add_epi32(_mm_madd_epi16(in##71l, pw_mf060_mf089), z3l); \
+ tmp0h = _mm_add_epi32(_mm_madd_epi16(in##71h, pw_mf060_mf089), z3h); \
+ tmp3l = _mm_add_epi32(_mm_madd_epi16(in##71l, pw_mf089_f060), z4l); \
+ tmp3h = _mm_add_epi32(_mm_madd_epi16(in##71h, pw_mf089_f060), z4h); \
+ \
+ in##53l = _mm_unpacklo_epi16(in##5, in##3); \
+ in##53h = _mm_unpackhi_epi16(in##5, in##3); \
+ \
+ tmp1l = _mm_add_epi32(_mm_madd_epi16(in##53l, pw_mf050_mf256), z4l); \
+ tmp1h = _mm_add_epi32(_mm_madd_epi16(in##53h, pw_mf050_mf256), z4h); \
+ tmp2l = _mm_add_epi32(_mm_madd_epi16(in##53l, pw_mf256_f050), z3l); \
+ tmp2h = _mm_add_epi32(_mm_madd_epi16(in##53h, pw_mf256_f050), z3h); \
+ \
+ /* Final output stage */ \
+ \
+ out0l = _mm_add_epi32(tmp10l, tmp3l); \
+ out0h = _mm_add_epi32(tmp10h, tmp3h); \
+ out7l = _mm_sub_epi32(tmp10l, tmp3l); \
+ out7h = _mm_sub_epi32(tmp10h, tmp3h); \
+ \
+ out0l = _mm_srai_epi32(out0l, DESCALE_P##PASS); \
+ out0h = _mm_srai_epi32(out0h, DESCALE_P##PASS); \
+ out7l = _mm_srai_epi32(out7l, DESCALE_P##PASS); \
+ out7h = _mm_srai_epi32(out7h, DESCALE_P##PASS); \
+ \
+ out0 = _mm_packs_epi32(out0l, out0h); \
+ out7 = _mm_packs_epi32(out7l, out7h); \
+ \
+ out1l = _mm_add_epi32(tmp11l, tmp2l); \
+ out1h = _mm_add_epi32(tmp11h, tmp2h); \
+ out6l = _mm_sub_epi32(tmp11l, tmp2l); \
+ out6h = _mm_sub_epi32(tmp11h, tmp2h); \
+ \
+ out1l = _mm_srai_epi32(out1l, DESCALE_P##PASS); \
+ out1h = _mm_srai_epi32(out1h, DESCALE_P##PASS); \
+ out6l = _mm_srai_epi32(out6l, DESCALE_P##PASS); \
+ out6h = _mm_srai_epi32(out6h, DESCALE_P##PASS); \
+ \
+ out1 = _mm_packs_epi32(out1l, out1h); \
+ out6 = _mm_packs_epi32(out6l, out6h); \
+ \
+ out2l = _mm_add_epi32(tmp12l, tmp1l); \
+ out2h = _mm_add_epi32(tmp12h, tmp1h); \
+ out5l = _mm_sub_epi32(tmp12l, tmp1l); \
+ out5h = _mm_sub_epi32(tmp12h, tmp1h); \
+ \
+ out2l = _mm_srai_epi32(out2l, DESCALE_P##PASS); \
+ out2h = _mm_srai_epi32(out2h, DESCALE_P##PASS); \
+ out5l = _mm_srai_epi32(out5l, DESCALE_P##PASS); \
+ out5h = _mm_srai_epi32(out5h, DESCALE_P##PASS); \
+ \
+ out2 = _mm_packs_epi32(out2l, out2h); \
+ out5 = _mm_packs_epi32(out5l, out5h); \
+ \
+ out3l = _mm_add_epi32(tmp13l, tmp0l); \
+ out3h = _mm_add_epi32(tmp13h, tmp0h); \
+ out4l = _mm_sub_epi32(tmp13l, tmp0l); \
+ out4h = _mm_sub_epi32(tmp13h, tmp0h); \
+ \
+ out3l = _mm_srai_epi32(out3l, DESCALE_P##PASS); \
+ out3h = _mm_srai_epi32(out3h, DESCALE_P##PASS); \
+ out4l = _mm_srai_epi32(out4l, DESCALE_P##PASS); \
+ out4h = _mm_srai_epi32(out4h, DESCALE_P##PASS); \
+ \
+ out3 = _mm_packs_epi32(out3l, out3h); \
+ out4 = _mm_packs_epi32(out4l, out4h); \
+}
+
+
+void jsimd_idct_islow_e2k(void *dct_table_, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+ short *dct_table = (short *)dct_table_;
+
+ __m128i row0, row1, row2, row3, row4, row5, row6, row7,
+ col0, col1, col2, col3, col4, col5, col6, col7,
+ quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7,
+ tmp0, tmp1, tmp2, tmp3, z3, z4,
+ z34l, z34h, col71l, col71h, col26l, col26h, col53l, col53h,
+ row71l, row71h, row26l, row26h, row53l, row53h,
+ out0, out1, out2, out3, out4, out5, out6, out7;
+ __m128i tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h,
+ tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h,
+ z3l, z3h, z4l, z4h,
+ out0l, out0h, out1l, out1h, out2l, out2h, out3l, out3h, out4l, out4h,
+ out5l, out5h, out6l, out6h, out7l, out7h;
+
+ /* Constants */
+ __m128i pw_zero = _mm_setzero_si128(),
+ pw_f130_f054 = _mm_setr_epi16(__4X2(F_0_541 + F_0_765, F_0_541)),
+ pw_f054_mf130 = _mm_setr_epi16(__4X2(F_0_541, F_0_541 - F_1_847)),
+ pw_mf078_f117 = _mm_setr_epi16(__4X2(F_1_175 - F_1_961, F_1_175)),
+ pw_f117_f078 = _mm_setr_epi16(__4X2(F_1_175, F_1_175 - F_0_390)),
+ pw_mf060_mf089 = _mm_setr_epi16(__4X2(F_0_298 - F_0_899, -F_0_899)),
+ pw_mf089_f060 = _mm_setr_epi16(__4X2(-F_0_899, F_1_501 - F_0_899)),
+ pw_mf050_mf256 = _mm_setr_epi16(__4X2(F_2_053 - F_2_562, -F_2_562)),
+ pw_mf256_f050 = _mm_setr_epi16(__4X2(-F_2_562, F_3_072 - F_2_562)),
+ pd_descale_p1 = _mm_set1_epi32(1 << (DESCALE_P1 - 1)),
+ pd_descale_p2 = _mm_set1_epi32(1 << (DESCALE_P2 - 1));
+
+ /* Pass 1: process columns */
+
+ col0 = VEC_LD(coef_block + 0 * 8);
+ col1 = VEC_LD(coef_block + 1 * 8);
+ col2 = VEC_LD(coef_block + 2 * 8);
+ col3 = VEC_LD(coef_block + 3 * 8);
+ col4 = VEC_LD(coef_block + 4 * 8);
+ col5 = VEC_LD(coef_block + 5 * 8);
+ col6 = VEC_LD(coef_block + 6 * 8);
+ col7 = VEC_LD(coef_block + 7 * 8);
+
+ tmp1 = _mm_or_si128(col1, col2);
+ tmp2 = _mm_or_si128(col3, col4);
+ tmp1 = _mm_or_si128(tmp1, tmp2);
+ tmp3 = _mm_or_si128(col5, col6);
+ tmp3 = _mm_or_si128(tmp3, col7);
+ tmp1 = _mm_or_si128(tmp1, tmp3);
+
+ quant0 = VEC_LD(dct_table);
+ col0 = _mm_mullo_epi16(col0, quant0);
+
+ if (VEC_ISZERO(tmp1)) {
+ /* AC terms all zero */
+
+ col0 = _mm_slli_epi16(col0, PASS1_BITS);
+ IDCT_SPLAT8(col0);
+
+ } else {
+
+ quant1 = VEC_LD(dct_table + 1 * 8);
+ quant2 = VEC_LD(dct_table + 2 * 8);
+ quant3 = VEC_LD(dct_table + 3 * 8);
+ quant4 = VEC_LD(dct_table + 4 * 8);
+ quant5 = VEC_LD(dct_table + 5 * 8);
+ quant6 = VEC_LD(dct_table + 6 * 8);
+ quant7 = VEC_LD(dct_table + 7 * 8);
+
+ col1 = _mm_mullo_epi16(col1, quant1);
+ col2 = _mm_mullo_epi16(col2, quant2);
+ col3 = _mm_mullo_epi16(col3, quant3);
+ col4 = _mm_mullo_epi16(col4, quant4);
+ col5 = _mm_mullo_epi16(col5, quant5);
+ col6 = _mm_mullo_epi16(col6, quant6);
+ col7 = _mm_mullo_epi16(col7, quant7);
+
+ DO_IDCT(col, 1);
+
+ TRANSPOSE(out, row);
+ }
+
+ /* Pass 2: process rows */
+
+ DO_IDCT(row, 2);
+
+ IDCT_SAVE();
+}
diff --git a/simd/e2k/jquantf-e2k.c b/simd/e2k/jquantf-e2k.c
new file mode 100644
index 0000000..106e99a
--- /dev/null
+++ b/simd/e2k/jquantf-e2k.c
@@ -0,0 +1,121 @@
+/*
+ * Elbrus optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2021, Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* FLOAT QUANTIZATION AND SAMPLE CONVERSION */
+
+#include "jsimd_e2k.h"
+
+#define LOAD_ROW(row) in##row = VEC_LD8(sample_data[row] + start_col)
+#define STORE_ROW(i) \
+ in0 = _mm_unpacklo_epi16(out##i, pb_zero); \
+ in1 = _mm_unpackhi_epi16(out##i, pb_zero); \
+ in0 = _mm_sub_epi32(in0, pd_cj); \
+ in1 = _mm_sub_epi32(in1, pd_cj); \
+ _mm_storeu_ps(workspace + i * 8, _mm_cvtepi32_ps(in0)); \
+ _mm_storeu_ps(workspace + i * 8 + 4, _mm_cvtepi32_ps(in1));
+
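+/* Widen an 8x8 block of JSAMPLEs to 32-bit integers, recenter them around
+ * zero by subtracting CENTERJSAMPLE, and convert the result to FAST_FLOAT
+ * for the floating-point forward DCT. */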
+void jsimd_convsamp_float_e2k(JSAMPARRAY sample_data, JDIMENSION start_col,
+ FAST_FLOAT *workspace)
+{
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+ __m128i out0, out1, out2, out3, out4, out5, out6, out7;
+
+ /* Constants */
+ __m128i pd_cj = _mm_set1_epi32(CENTERJSAMPLE),
+ pb_zero = _mm_setzero_si128();
+
+ LOAD_ROW(0);
+ LOAD_ROW(1);
+ LOAD_ROW(2);
+ LOAD_ROW(3);
+ LOAD_ROW(4);
+ LOAD_ROW(5);
+ LOAD_ROW(6);
+ LOAD_ROW(7);
+
+ out0 = _mm_unpacklo_epi8(in0, pb_zero);
+ out1 = _mm_unpacklo_epi8(in1, pb_zero);
+ out2 = _mm_unpacklo_epi8(in2, pb_zero);
+ out3 = _mm_unpacklo_epi8(in3, pb_zero);
+ out4 = _mm_unpacklo_epi8(in4, pb_zero);
+ out5 = _mm_unpacklo_epi8(in5, pb_zero);
+ out6 = _mm_unpacklo_epi8(in6, pb_zero);
+ out7 = _mm_unpacklo_epi8(in7, pb_zero);
+
+ STORE_ROW(0)
+ STORE_ROW(1)
+ STORE_ROW(2)
+ STORE_ROW(3)
+ STORE_ROW(4)
+ STORE_ROW(5)
+ STORE_ROW(6)
+ STORE_ROW(7)
+}
+
+void jsimd_quantize_float_e2k(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+ FAST_FLOAT *workspace)
+{
+ int i = 0;
+ __m128 row0, row1, row2, row3, recip0, recip1, recip2, recip3;
+ __m128i out0, out1;
+#ifdef JSIMD_SAME_ROUNDING
+ __m128 pd_f16k5 = _mm_set1_ps(16384.5f);
+ __m128i pw_m16k = _mm_set1_epi16(-16384);
+#endif
+
+ PRAGMA_E2K("ivdep")
+ for (; i < 4; i++, workspace += 16, divisors += 16, coef_block += 16) {
+ row0 = _mm_loadu_ps(workspace + 0 * 4);
+ row1 = _mm_loadu_ps(workspace + 1 * 4);
+ row2 = _mm_loadu_ps(workspace + 2 * 4);
+ row3 = _mm_loadu_ps(workspace + 3 * 4);
+
+ recip0 = _mm_loadu_ps(divisors + 0 * 4);
+ recip1 = _mm_loadu_ps(divisors + 1 * 4);
+ recip2 = _mm_loadu_ps(divisors + 2 * 4);
+ recip3 = _mm_loadu_ps(divisors + 3 * 4);
+
+ row0 = _mm_mul_ps(row0, recip0);
+ row1 = _mm_mul_ps(row1, recip1);
+ row2 = _mm_mul_ps(row2, recip2);
+ row3 = _mm_mul_ps(row3, recip3);
+
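+    /* With JSIMD_SAME_ROUNDING, bias by 16384.5 and truncate so the result
+     * matches the scalar quantizer's (int)(x + 16384.5) - 16384 rounding;
+     * otherwise rely on the round-to-nearest of _mm_cvtps_epi32(). */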
+#ifdef JSIMD_SAME_ROUNDING
+ row0 = _mm_add_ps(row0, pd_f16k5);
+ row1 = _mm_add_ps(row1, pd_f16k5);
+ row2 = _mm_add_ps(row2, pd_f16k5);
+ row3 = _mm_add_ps(row3, pd_f16k5);
+
+ out0 = _mm_packs_epi32(_mm_cvttps_epi32(row0), _mm_cvttps_epi32(row1));
+ out1 = _mm_packs_epi32(_mm_cvttps_epi32(row2), _mm_cvttps_epi32(row3));
+
+ out0 = _mm_add_epi16(out0, pw_m16k);
+ out1 = _mm_add_epi16(out1, pw_m16k);
+#else
+ out0 = _mm_packs_epi32(_mm_cvtps_epi32(row0), _mm_cvtps_epi32(row1));
+ out1 = _mm_packs_epi32(_mm_cvtps_epi32(row2), _mm_cvtps_epi32(row3));
+#endif
+ VEC_ST(coef_block, out0);
+ VEC_ST(coef_block + 8, out1);
+ }
+}
diff --git a/simd/e2k/jquanti-e2k.c b/simd/e2k/jquanti-e2k.c
new file mode 100644
index 0000000..a3e1ff1
--- /dev/null
+++ b/simd/e2k/jquanti-e2k.c
@@ -0,0 +1,178 @@
+/*
+ * Elbrus optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2021, Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* INTEGER QUANTIZATION AND SAMPLE CONVERSION */
+
+#include "jsimd_e2k.h"
+
+#define LOAD_ROW(row) in##row = VEC_LD8(sample_data[row] + start_col)
+
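+/* Widen an 8x8 block of JSAMPLEs to 16-bit DCTELEMs and recenter them
+ * around zero by subtracting CENTERJSAMPLE. */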
+void jsimd_convsamp_e2k(JSAMPARRAY sample_data, JDIMENSION start_col,
+ DCTELEM *workspace)
+{
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+ __m128i out0, out1, out2, out3, out4, out5, out6, out7;
+
+ /* Constants */
+ __m128i pw_cj = _mm_set1_epi16(CENTERJSAMPLE),
+ pb_zero = _mm_setzero_si128();
+
+ LOAD_ROW(0);
+ LOAD_ROW(1);
+ LOAD_ROW(2);
+ LOAD_ROW(3);
+ LOAD_ROW(4);
+ LOAD_ROW(5);
+ LOAD_ROW(6);
+ LOAD_ROW(7);
+
+ out0 = _mm_unpacklo_epi8(in0, pb_zero);
+ out1 = _mm_unpacklo_epi8(in1, pb_zero);
+ out2 = _mm_unpacklo_epi8(in2, pb_zero);
+ out3 = _mm_unpacklo_epi8(in3, pb_zero);
+ out4 = _mm_unpacklo_epi8(in4, pb_zero);
+ out5 = _mm_unpacklo_epi8(in5, pb_zero);
+ out6 = _mm_unpacklo_epi8(in6, pb_zero);
+ out7 = _mm_unpacklo_epi8(in7, pb_zero);
+
+ out0 = _mm_sub_epi16(out0, pw_cj);
+ out1 = _mm_sub_epi16(out1, pw_cj);
+ out2 = _mm_sub_epi16(out2, pw_cj);
+ out3 = _mm_sub_epi16(out3, pw_cj);
+ out4 = _mm_sub_epi16(out4, pw_cj);
+ out5 = _mm_sub_epi16(out5, pw_cj);
+ out6 = _mm_sub_epi16(out6, pw_cj);
+ out7 = _mm_sub_epi16(out7, pw_cj);
+
+ VEC_ST(workspace + 0 * 8, out0);
+ VEC_ST(workspace + 1 * 8, out1);
+ VEC_ST(workspace + 2 * 8, out2);
+ VEC_ST(workspace + 3 * 8, out3);
+ VEC_ST(workspace + 4 * 8, out4);
+ VEC_ST(workspace + 5 * 8, out5);
+ VEC_ST(workspace + 6 * 8, out6);
+ VEC_ST(workspace + 7 * 8, out7);
+}
+
+
+#define WORD_BIT 16
+#define MULTIPLY(vs0, vs1, out) out = _mm_mulhi_epu16(vs0, vs1)
+
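+/* Quantize by reciprocal multiplication: |coef| plus the correction term is
+ * multiplied by the 16-bit reciprocal and then by the scale factor (both via
+ * the high half of an unsigned 16x16 multiply), after which the sign of the
+ * original coefficient is restored.  divisors[] holds the reciprocals,
+ * corrections, and scales at offsets 0, DCTSIZE2, and 2 * DCTSIZE2. */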
+void jsimd_quantize_e2k(JCOEFPTR coef_block, DCTELEM *divisors,
+ DCTELEM *workspace)
+{
+ __m128i row0, row1, row2, row3, row4, row5, row6, row7,
+ row0s, row1s, row2s, row3s, row4s, row5s, row6s, row7s,
+ corr0, corr1, corr2, corr3, corr4, corr5, corr6, corr7,
+ recip0, recip1, recip2, recip3, recip4, recip5, recip6, recip7,
+ scale0, scale1, scale2, scale3, scale4, scale5, scale6, scale7;
+
+ row0s = VEC_LD(workspace + 0 * 8);
+ row1s = VEC_LD(workspace + 1 * 8);
+ row2s = VEC_LD(workspace + 2 * 8);
+ row3s = VEC_LD(workspace + 3 * 8);
+ row4s = VEC_LD(workspace + 4 * 8);
+ row5s = VEC_LD(workspace + 5 * 8);
+ row6s = VEC_LD(workspace + 6 * 8);
+ row7s = VEC_LD(workspace + 7 * 8);
+ row0 = _mm_abs_epi16(row0s);
+ row1 = _mm_abs_epi16(row1s);
+ row2 = _mm_abs_epi16(row2s);
+ row3 = _mm_abs_epi16(row3s);
+ row4 = _mm_abs_epi16(row4s);
+ row5 = _mm_abs_epi16(row5s);
+ row6 = _mm_abs_epi16(row6s);
+ row7 = _mm_abs_epi16(row7s);
+
+ corr0 = VEC_LD(divisors + DCTSIZE2 + 0 * 8);
+ corr1 = VEC_LD(divisors + DCTSIZE2 + 1 * 8);
+ corr2 = VEC_LD(divisors + DCTSIZE2 + 2 * 8);
+ corr3 = VEC_LD(divisors + DCTSIZE2 + 3 * 8);
+ corr4 = VEC_LD(divisors + DCTSIZE2 + 4 * 8);
+ corr5 = VEC_LD(divisors + DCTSIZE2 + 5 * 8);
+ corr6 = VEC_LD(divisors + DCTSIZE2 + 6 * 8);
+ corr7 = VEC_LD(divisors + DCTSIZE2 + 7 * 8);
+
+ row0 = _mm_add_epi16(row0, corr0);
+ row1 = _mm_add_epi16(row1, corr1);
+ row2 = _mm_add_epi16(row2, corr2);
+ row3 = _mm_add_epi16(row3, corr3);
+ row4 = _mm_add_epi16(row4, corr4);
+ row5 = _mm_add_epi16(row5, corr5);
+ row6 = _mm_add_epi16(row6, corr6);
+ row7 = _mm_add_epi16(row7, corr7);
+
+ recip0 = VEC_LD(divisors + 0 * 8);
+ recip1 = VEC_LD(divisors + 1 * 8);
+ recip2 = VEC_LD(divisors + 2 * 8);
+ recip3 = VEC_LD(divisors + 3 * 8);
+ recip4 = VEC_LD(divisors + 4 * 8);
+ recip5 = VEC_LD(divisors + 5 * 8);
+ recip6 = VEC_LD(divisors + 6 * 8);
+ recip7 = VEC_LD(divisors + 7 * 8);
+
+ MULTIPLY(row0, recip0, row0);
+ MULTIPLY(row1, recip1, row1);
+ MULTIPLY(row2, recip2, row2);
+ MULTIPLY(row3, recip3, row3);
+ MULTIPLY(row4, recip4, row4);
+ MULTIPLY(row5, recip5, row5);
+ MULTIPLY(row6, recip6, row6);
+ MULTIPLY(row7, recip7, row7);
+
+ scale0 = VEC_LD(divisors + DCTSIZE2 * 2 + 0 * 8);
+ scale1 = VEC_LD(divisors + DCTSIZE2 * 2 + 1 * 8);
+ scale2 = VEC_LD(divisors + DCTSIZE2 * 2 + 2 * 8);
+ scale3 = VEC_LD(divisors + DCTSIZE2 * 2 + 3 * 8);
+ scale4 = VEC_LD(divisors + DCTSIZE2 * 2 + 4 * 8);
+ scale5 = VEC_LD(divisors + DCTSIZE2 * 2 + 5 * 8);
+ scale6 = VEC_LD(divisors + DCTSIZE2 * 2 + 6 * 8);
+ scale7 = VEC_LD(divisors + DCTSIZE2 * 2 + 7 * 8);
+
+ MULTIPLY(row0, scale0, row0);
+ MULTIPLY(row1, scale1, row1);
+ MULTIPLY(row2, scale2, row2);
+ MULTIPLY(row3, scale3, row3);
+ MULTIPLY(row4, scale4, row4);
+ MULTIPLY(row5, scale5, row5);
+ MULTIPLY(row6, scale6, row6);
+ MULTIPLY(row7, scale7, row7);
+
+ row0 = _mm_sign_epi16(row0, row0s);
+ row1 = _mm_sign_epi16(row1, row1s);
+ row2 = _mm_sign_epi16(row2, row2s);
+ row3 = _mm_sign_epi16(row3, row3s);
+ row4 = _mm_sign_epi16(row4, row4s);
+ row5 = _mm_sign_epi16(row5, row5s);
+ row6 = _mm_sign_epi16(row6, row6s);
+ row7 = _mm_sign_epi16(row7, row7s);
+
+ VEC_ST(coef_block + 0 * 8, row0);
+ VEC_ST(coef_block + 1 * 8, row1);
+ VEC_ST(coef_block + 2 * 8, row2);
+ VEC_ST(coef_block + 3 * 8, row3);
+ VEC_ST(coef_block + 4 * 8, row4);
+ VEC_ST(coef_block + 5 * 8, row5);
+ VEC_ST(coef_block + 6 * 8, row6);
+ VEC_ST(coef_block + 7 * 8, row7);
+}
diff --git a/simd/e2k/jsimd.c b/simd/e2k/jsimd.c
new file mode 100644
index 0000000..f8c0465
--- /dev/null
+++ b/simd/e2k/jsimd.c
@@ -0,0 +1,761 @@
+/*
+ * jsimd_e2k.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2009-2011, 2014-2016, 2018, D. R. Commander.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ * Copyright (C) 2021, Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on an
+ * e2k (Elbrus) architecture.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "jsimd_api_e2k.h"
+
+static unsigned int simd_support = ~0;
+static unsigned int simd_huffman = 1;
+
+/*
+ * Check what SIMD accelerations are supported.
+ *
+ * FIXME: This code is racy under a multi-threaded environment.
+ */
+LOCAL(void)
+init_simd(void)
+{
+#ifndef NO_GETENV
+ char *env = NULL;
+#endif
+
+ if (simd_support != ~0U)
+ return;
+
+ simd_support = JSIMD_SSE2;
+
+#ifndef NO_GETENV
+ /* Force different settings through environment variables */
+ env = getenv("JSIMD_FORCENONE");
+ if ((env != NULL) && (strcmp(env, "1") == 0))
+ simd_support = 0;
+ env = getenv("JSIMD_NOHUFFENC");
+ if ((env != NULL) && (strcmp(env, "1") == 0))
+ simd_huffman = 0;
+#endif
+}
+
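+/* Map the JPEG colorspace to the shuffle-table index expected by the color
+ * conversion routines; bit 4 flags a 4-byte (extended) pixel layout so the
+ * callers below can pick the rgb3/rgb4 variant and then mask the index
+ * with 15. */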
+static inline int color_space_idx(J_COLOR_SPACE color_space) {
+ switch (color_space) {
+ case JCS_EXT_RGB:
+ return 1 + (EXT_RGB_PIXELSIZE != 3) * 16;
+ case JCS_EXT_RGBX:
+ case JCS_EXT_RGBA:
+ return 2 + (EXT_RGBX_PIXELSIZE != 3) * 16;
+ case JCS_EXT_BGR:
+ return 3 + (EXT_BGR_PIXELSIZE != 3) * 16;
+ case JCS_EXT_BGRX:
+ case JCS_EXT_BGRA:
+ return 4 + (EXT_BGRX_PIXELSIZE != 3) * 16;
+ case JCS_EXT_XBGR:
+ case JCS_EXT_ABGR:
+ return 5 + (EXT_XBGR_PIXELSIZE != 3) * 16;
+ case JCS_EXT_XRGB:
+ case JCS_EXT_ARGB:
+ return 6 + (EXT_XRGB_PIXELSIZE != 3) * 16;
+ default:
+ break;
+ }
+ return 0 + (RGB_PIXELSIZE != 3) * 16;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_ycc(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+ return 0;
+
+ if (simd_support & JSIMD_SSE2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_gray(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+ return 0;
+
+ if (simd_support & JSIMD_SSE2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+ return 0;
+
+ if (simd_support & JSIMD_SSE2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb565(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
+{
+ void (*e2kfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int, int);
+ int idx = color_space_idx(cinfo->in_color_space);
+
+ e2kfct = idx < 16 ? jsimd_rgb3_ycc_convert_e2k :
+ jsimd_rgb4_ycc_convert_e2k;
+ idx &= 15;
+
+ e2kfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows, idx);
+}
+
+GLOBAL(void)
+jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+ JSAMPIMAGE output_buf, JDIMENSION output_row,
+ int num_rows)
+{
+ void (*e2kfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int, int);
+ int idx = color_space_idx(cinfo->in_color_space);
+
+ e2kfct = idx < 16 ? jsimd_rgb3_gray_convert_e2k :
+ jsimd_rgb4_gray_convert_e2k;
+ idx &= 15;
+
+ e2kfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows, idx);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
+{
+ void (*e2kfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int, int);
+ int idx = color_space_idx(cinfo->out_color_space);
+
+ e2kfct = idx < 16 ? jsimd_ycc_rgb3_convert_e2k :
+ jsimd_ycc_rgb4_convert_e2k;
+ idx &= 15;
+
+ e2kfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows, idx);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION input_row, JSAMPARRAY output_buf,
+ int num_rows)
+{
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_downsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+
+ if (simd_support & JSIMD_SSE2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_downsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+
+ if (simd_support & JSIMD_SSE2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+ jsimd_h2v2_downsample_e2k(cinfo->image_width, cinfo->max_v_samp_factor,
+ compptr->v_samp_factor,
+ compptr->width_in_blocks, input_data,
+ output_data);
+}
+
+GLOBAL(void)
+jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+ jsimd_h2v1_downsample_e2k(cinfo->image_width, cinfo->max_v_samp_factor,
+ compptr->v_samp_factor,
+ compptr->width_in_blocks, input_data,
+ output_data);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+
+ if (simd_support & JSIMD_SSE2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+
+ if (simd_support & JSIMD_SSE2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h2v2_upsample_e2k(cinfo->max_v_samp_factor, cinfo->output_width,
+ input_data, output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h2v1_upsample_e2k(cinfo->max_v_samp_factor, cinfo->output_width,
+ input_data, output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+
+ if (simd_support & JSIMD_SSE2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+
+ if (simd_support & JSIMD_SSE2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h2v2_fancy_upsample_e2k(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+ jsimd_h2v1_fancy_upsample_e2k(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data,
+ output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+
+ if (simd_support & JSIMD_SSE2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+
+ if (simd_support & JSIMD_SSE2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+ void (*e2kfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JDIMENSION, JSAMPARRAY, int);
+ int idx = color_space_idx(cinfo->out_color_space);
+
+ e2kfct = idx < 16 ? jsimd_ycc_rgb3_merged_upsample_e2k :
+ jsimd_ycc_rgb4_merged_upsample_e2k;
+ idx &= 15;
+
+ e2kfct(cinfo->output_width, input_buf, in_row_group_ctr,
+ in_row_group_ctr * 2, output_buf, idx);
+ e2kfct(cinfo->output_width, input_buf, in_row_group_ctr,
+ in_row_group_ctr * 2 + 1, output_buf + 1, idx);
+}
+
+GLOBAL(void)
+jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+ void (*e2kfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JDIMENSION, JSAMPARRAY, int);
+ int idx = color_space_idx(cinfo->out_color_space);
+
+ e2kfct = idx < 16 ? jsimd_ycc_rgb3_merged_upsample_e2k :
+ jsimd_ycc_rgb4_merged_upsample_e2k;
+ idx &= 15;
+
+ e2kfct(cinfo->output_width, input_buf, in_row_group_ctr,
+ in_row_group_ctr, output_buf, idx);
+}
+
+GLOBAL(int)
+jsimd_can_convsamp(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_SSE2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_convsamp_float(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(FAST_FLOAT) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_SSE2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+ DCTELEM *workspace)
+{
+ jsimd_convsamp_e2k(sample_data, start_col, workspace);
+}
+
+GLOBAL(void)
+jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+ FAST_FLOAT *workspace)
+{
+ jsimd_convsamp_float_e2k(sample_data, start_col, workspace);
+}
+
+GLOBAL(int)
+jsimd_can_fdct_islow(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_SSE2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_ifast(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_SSE2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_float(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(FAST_FLOAT) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_SSE2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_fdct_islow(DCTELEM *data)
+{
+ jsimd_fdct_islow_e2k(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_ifast(DCTELEM *data)
+{
+ jsimd_fdct_ifast_e2k(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_float(FAST_FLOAT *data)
+{
+ jsimd_fdct_float_e2k(data);
+}
+
+GLOBAL(int)
+jsimd_can_quantize(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_SSE2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_quantize_float(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (sizeof(FAST_FLOAT) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_SSE2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
+{
+ jsimd_quantize_e2k(coef_block, divisors, workspace);
+}
+
+GLOBAL(void)
+jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+ FAST_FLOAT *workspace)
+{
+ jsimd_quantize_float_e2k(coef_block, divisors, workspace);
+}
+
+GLOBAL(int)
+jsimd_can_idct_2x2(void)
+{
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_4x4(void)
+{
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+}
+
+GLOBAL(void)
+jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+}
+
+GLOBAL(int)
+jsimd_can_idct_islow(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(ISLOW_MULT_TYPE) != 2)
+ return 0;
+
+ if (simd_support & JSIMD_SSE2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_ifast(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(IFAST_MULT_TYPE) != 2)
+ return 0;
+ if (IFAST_SCALE_BITS != 2)
+ return 0;
+
+ if (simd_support & JSIMD_SSE2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_float(void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(FAST_FLOAT) != 4)
+ return 0;
+ if (sizeof(FLOAT_MULT_TYPE) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_SSE2)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ jsimd_idct_islow_e2k(compptr->dct_table, coef_block, output_buf,
+ output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ jsimd_idct_ifast_e2k(compptr->dct_table, coef_block, output_buf,
+ output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+ JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col)
+{
+ jsimd_idct_float_e2k(compptr->dct_table, coef_block, output_buf,
+ output_col);
+}
+
+GLOBAL(int)
+jsimd_can_huff_encode_one_block(void)
+{
+ init_simd();
+
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+
+ if ((simd_support & JSIMD_SSE2) && simd_huffman)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(JOCTET *)
+jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
+ int last_dc_val, c_derived_tbl *dctbl,
+ c_derived_tbl *actbl)
+{
+ return jsimd_huff_encode_one_block_e2k(state, buffer, block, last_dc_val,
+ dctbl, actbl);
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_first_prepare(void)
+{
+ init_simd();
+
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if ((simd_support & JSIMD_SSE2) && simd_huffman)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, UJCOEF *values, size_t *zerobits)
+{
+ jsimd_encode_mcu_AC_first_prepare_e2k(block, jpeg_natural_order_start,
+ Sl, Al, (JCOEF*)values, zerobits);
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+ init_simd();
+
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if ((simd_support & JSIMD_SSE2) && simd_huffman)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+ const int *jpeg_natural_order_start, int Sl,
+ int Al, UJCOEF *absvalues, size_t *bits)
+{
+ return jsimd_encode_mcu_AC_refine_prepare_e2k(block,
+ jpeg_natural_order_start,
+ Sl, Al, (JCOEF*)absvalues, bits);
+}
diff --git a/simd/e2k/jsimd_api_e2k.h b/simd/e2k/jsimd_api_e2k.h
new file mode 100644
index 0000000..d857203
--- /dev/null
+++ b/simd/e2k/jsimd_api_e2k.h
@@ -0,0 +1,94 @@
+/*
+ * Elbrus optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2021, Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* Function declarations */
+
+#define CONVERT_DECL(n) \
+EXTERN(void) jsimd_rgb##n##_ycc_convert_e2k(JDIMENSION img_width, \
+ JSAMPARRAY input_buf, JSAMPIMAGE output_buf, \
+ JDIMENSION output_row, int num_rows, int shuf_idx); \
+EXTERN(void) jsimd_rgb##n##_gray_convert_e2k(JDIMENSION img_width, \
+ JSAMPARRAY input_buf, JSAMPIMAGE output_buf, \
+ JDIMENSION output_row, int num_rows, int shuf_idx); \
+EXTERN(void) jsimd_ycc_rgb##n##_convert_e2k(JDIMENSION out_width, \
+ JSAMPIMAGE input_buf, JDIMENSION input_row, \
+ JSAMPARRAY output_buf, int num_rows, int shuf_idx); \
+EXTERN(void) jsimd_ycc_rgb##n##_merged_upsample_e2k(JDIMENSION out_width, \
+ JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, \
+ JDIMENSION in_row_group_ctr_y, JSAMPARRAY output_buf, int shuf_idx); \
+
+CONVERT_DECL(3)
+CONVERT_DECL(4)
+
+EXTERN(void) jsimd_h2v1_downsample_e2k
+ (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+ JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
+EXTERN(void) jsimd_h2v2_downsample_e2k
+ (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+ JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
+
+#define UPSAMPLE_DECL(name) \
+EXTERN(void) jsimd_##name##_upsample_e2k \
+ (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data, \
+ JSAMPARRAY *output_data_ptr);
+
+UPSAMPLE_DECL(h2v1)
+UPSAMPLE_DECL(h2v2)
+UPSAMPLE_DECL(h2v1_fancy)
+UPSAMPLE_DECL(h2v2_fancy)
+
+EXTERN(void) jsimd_convsamp_e2k
+ (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
+EXTERN(void) jsimd_convsamp_float_e2k
+ (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace);
+
+EXTERN(void) jsimd_fdct_islow_e2k(DCTELEM *data);
+EXTERN(void) jsimd_fdct_ifast_e2k(DCTELEM *data);
+EXTERN(void) jsimd_fdct_float_e2k(FAST_FLOAT *data);
+EXTERN(void) jsimd_quantize_e2k
+ (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
+EXTERN(void) jsimd_quantize_float_e2k
+ (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace);
+EXTERN(void) jsimd_idct_islow_e2k
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
+EXTERN(void) jsimd_idct_ifast_e2k
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
+EXTERN(void) jsimd_idct_float_e2k
+ (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+ JDIMENSION output_col);
+
+EXTERN(JOCTET *) jsimd_huff_encode_one_block_e2k
+ (void *state, JOCTET *buffer, JCOEFPTR block,
+ int last_dc_val, c_derived_tbl *dctbl, c_derived_tbl *actbl);
+
+EXTERN(void) jsimd_encode_mcu_AC_first_prepare_e2k
+ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+ JCOEF *values, size_t *zerobits);
+
+EXTERN(int) jsimd_encode_mcu_AC_refine_prepare_e2k
+ (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+ JCOEF *absvalues, size_t *bits);
diff --git a/simd/e2k/jsimd_e2k.h b/simd/e2k/jsimd_e2k.h
new file mode 100644
index 0000000..15d6262
--- /dev/null
+++ b/simd/e2k/jsimd_e2k.h
@@ -0,0 +1,207 @@
+/*
+ * Elbrus optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2021, Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "jsimd_api_e2k.h"
+#include <stdint.h>
+#include <smmintrin.h> /* SSE4.1 */
+
+
+/* Common code */
+
+#define __4X2(a, b) a, b, a, b, a, b, a, b
+#define ALWAYS_INLINE __attribute__((__always_inline__)) inline
+
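+/* On Elbrus, the two-source byte shuffle and byte blend helpers map directly
+ * to e2k builtins; elsewhere they are emulated with SSSE3/SSE4.1 intrinsics
+ * (and the bitrevd/insfd builtins in plain C), which allows building the
+ * port on x86 hosts as well. */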
+#ifdef __e2k__
+#define PRAGMA_E2K _Pragma
+#define _mm_shuffle2_pi8(a, b, c) \
+ ((__m64)__builtin_e2k_pshufb((uint64_t)(b), (uint64_t)(a), (uint64_t)(c)))
+#define _mm_shuffle2_epi8(a, b, c) \
+ ((__m128i)__builtin_e2k_qppermb((__v2di)(b), (__v2di)(a), (__v2di)(c)))
+#define _mm_blendv_pi8(a, b, c) \
+ ((__m64)__builtin_e2k_pmerge((uint64_t)(a), (uint64_t)(b), (uint64_t)(c)))
+#else
+#define PRAGMA_E2K(x)
+#define _mm_shuffle2_pi8(a, b, c) \
+ _mm_movepi64_pi64(_mm_shuffle_epi8(_mm_unpacklo_epi64( \
+ _mm_movpi64_epi64(a), _mm_movpi64_epi64(b)), _mm_movpi64_epi64(c)))
+#define _mm_shuffle2_epi8(a, b, c) \
+ _mm_blendv_epi8(_mm_shuffle_epi8(a, c), _mm_shuffle_epi8(b, c), \
+ _mm_slli_epi16(c, 3))
+#define _mm_blendv_pi8(a, b, c) \
+ _mm_movepi64_pi64(_mm_blendv_epi8(_mm_movpi64_epi64(a), \
+ _mm_movpi64_epi64(b), _mm_movpi64_epi64(c)))
+
+#define BITREV_ROUND(c, i) a = (a & c) << i | (a >> i & c);
+static ALWAYS_INLINE uint64_t __builtin_e2k_bitrevd(uint64_t a) {
+ BITREV_ROUND(0x5555555555555555ll, 1)
+ BITREV_ROUND(0x3333333333333333ll, 2)
+ BITREV_ROUND(0x0F0F0F0F0F0F0F0Fll, 4)
+ BITREV_ROUND(0x00FF00FF00FF00FFll, 8)
+ BITREV_ROUND(0x0000FFFF0000FFFFll, 16)
+ return a >> 32 | a << 32;
+}
+
+static ALWAYS_INLINE uint64_t __builtin_e2k_insfd(uint64_t a, uint64_t b, uint64_t c) {
+ int n = b & 63;
+ a = a >> n | a << (64 - n);
+ return c ^ ((a ^ c) & (~0ll << (b >> 6 & 63)));
+}
+#endif
+
+#if defined(__iset__) && __iset__ >= 5
+static ALWAYS_INLINE __m128i _mm_packhi_epi32(__m128i a, __m128i b) {
+ __m128i index = _mm_setr_epi8(
+ 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31);
+ return _mm_shuffle2_epi8(a, b, index);
+}
+
+#define VEC_ISZERO(a) !_mm_cvtsi128_si64(_mm_packs_epi16(a, a))
+#else
+static ALWAYS_INLINE __m128i _mm_packhi_epi32(__m128i a, __m128i b) {
+ union { __m128i v; __m64 d[2]; } l = { a }, h = { b }, x;
+ __m64 index = _mm_setr_pi8(2, 3, 6, 7, 10, 11, 14, 15);
+ x.d[0] = _mm_shuffle2_pi8(l.d[0], l.d[1], index);
+ x.d[1] = _mm_shuffle2_pi8(h.d[0], h.d[1], index);
+ return x.v;
+}
+
+static ALWAYS_INLINE uint64_t vec_isnonzero(__m128i a) {
+ __v2di x = (__v2di)a;
+ return x[0] | x[1];
+}
+
+#define VEC_ISZERO(a) !vec_isnonzero(a)
+#endif
+
+#define VEC_ALIGNR8(a, b) _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(b), _mm_castsi128_pd(a), 1))
+
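+/* Transpose helpers built from unpack/interleave steps: TRANSPOSE_FLOAT()
+ * transposes a 4x4 block of floats, and TRANSPOSE() performs a full 8x8
+ * 16-bit transpose used to switch between the column and row passes of the
+ * DCT/IDCT kernels. */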
+#define TRANSPOSE_FLOAT(a, b, c, d, e, f, g, h) \
+ tmp0 = _mm_unpacklo_ps(a, b); \
+ tmp1 = _mm_unpackhi_ps(a, b); \
+ tmp2 = _mm_unpacklo_ps(c, d); \
+ tmp3 = _mm_unpackhi_ps(c, d); \
+ e = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); \
+ f = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); \
+ g = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3))); \
+ h = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3)));
+
+#define TRANSPOSE8(a, b) \
+ b##0 = _mm_unpacklo_epi8(a##0, a##2); \
+ b##1 = _mm_unpackhi_epi8(a##0, a##2); \
+ b##2 = _mm_unpacklo_epi8(a##1, a##3); \
+ b##3 = _mm_unpackhi_epi8(a##1, a##3);
+
+#define TRANSPOSE16(a, b) \
+ b##0 = _mm_unpacklo_epi16(a##0, a##2); \
+ b##1 = _mm_unpackhi_epi16(a##0, a##2); \
+ b##2 = _mm_unpacklo_epi16(a##1, a##3); \
+ b##3 = _mm_unpackhi_epi16(a##1, a##3); \
+ b##4 = _mm_unpacklo_epi16(a##4, a##6); \
+ b##5 = _mm_unpackhi_epi16(a##4, a##6); \
+ b##6 = _mm_unpacklo_epi16(a##5, a##7); \
+ b##7 = _mm_unpackhi_epi16(a##5, a##7);
+
+#define TRANSPOSE(a, b) \
+ TRANSPOSE16(a, b) TRANSPOSE16(b, a) \
+ b##0 = _mm_unpacklo_epi64(a##0, a##4); \
+ b##1 = _mm_unpackhi_epi64(a##0, a##4); \
+ b##2 = _mm_unpacklo_epi64(a##1, a##5); \
+ b##3 = _mm_unpackhi_epi64(a##1, a##5); \
+ b##4 = _mm_unpacklo_epi64(a##2, a##6); \
+ b##5 = _mm_unpackhi_epi64(a##2, a##6); \
+ b##6 = _mm_unpacklo_epi64(a##3, a##7); \
+ b##7 = _mm_unpackhi_epi64(a##3, a##7);
+
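+/* IDCT_SAVE(): pack the eight 16-bit result vectors to bytes, convert from
+ * the signed range to unsigned samples by XORing with CENTERJSAMPLE,
+ * rearrange the 8x8 byte block into output order with interleave passes,
+ * and store one row at output_buf[n] + output_col for each n. */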
+#define IDCT_SAVE() { \
+ __m128i pb_cj = _mm_set1_epi8((int8_t)CENTERJSAMPLE); \
+ \
+ row0 = _mm_xor_si128(_mm_packs_epi16(out0, out1), pb_cj); \
+ row1 = _mm_xor_si128(_mm_packs_epi16(out2, out3), pb_cj); \
+ row2 = _mm_xor_si128(_mm_packs_epi16(out4, out5), pb_cj); \
+ row3 = _mm_xor_si128(_mm_packs_epi16(out6, out7), pb_cj); \
+ \
+ TRANSPOSE8(row, col) TRANSPOSE8(col, row) TRANSPOSE8(row, col) \
+ \
+ VEC_STL(output_buf[0] + output_col, col0); \
+ VEC_STH(output_buf[1] + output_col, col0); \
+ VEC_STL(output_buf[2] + output_col, col1); \
+ VEC_STH(output_buf[3] + output_col, col1); \
+ VEC_STL(output_buf[4] + output_col, col2); \
+ VEC_STH(output_buf[5] + output_col, col2); \
+ VEC_STL(output_buf[6] + output_col, col3); \
+ VEC_STH(output_buf[7] + output_col, col3); \
+}
+
+#define IDCT_SPLAT8(col0) { \
+ row3 = _mm_unpacklo_epi16(col0, col0); \
+ row7 = _mm_unpackhi_epi16(col0, col0); \
+ row1 = _mm_unpacklo_epi16(row3, row3); \
+ row3 = _mm_unpackhi_epi16(row3, row3); \
+ row5 = _mm_unpacklo_epi16(row7, row7); \
+ row7 = _mm_unpackhi_epi16(row7, row7); \
+ row0 = _mm_unpacklo_epi64(row1, row1); \
+ row1 = _mm_unpackhi_epi64(row1, row1); \
+ row2 = _mm_unpacklo_epi64(row3, row3); \
+ row3 = _mm_unpackhi_epi64(row3, row3); \
+ row4 = _mm_unpacklo_epi64(row5, row5); \
+ row5 = _mm_unpackhi_epi64(row5, row5); \
+ row6 = _mm_unpacklo_epi64(row7, row7); \
+ row7 = _mm_unpackhi_epi64(row7, row7); \
+}
+
+#ifndef min
+#define min(a, b) ((a) < (b) ? (a) : (b))
+#endif
+
+#define VEC_LD(a) _mm_loadu_si128((const __m128i*)(a))
+#define VEC_ST(a, b) _mm_storeu_si128((__m128i*)(a), b)
+#define VEC_LD8(a) _mm_loadl_epi64((const __m128i*)(a))
+#define VEC_STL(a, b) _mm_storel_epi64((__m128i*)(a), b)
+#define VEC_STH(a, b) _mm_storeh_pd((double*)(a), _mm_castsi128_pd(b))
+#define VEC_SPLAT(v, i) _mm_shuffle_epi8(v, _mm_set1_epi16((i) * 2 | ((i) * 2 + 1) << 8))
+
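+/* On older e2k instruction sets (and the non-e2k fallback), the ALIGN8_*
+ * helpers assemble unaligned 8-byte reads from aligned __m64 loads,
+ * recombining neighbouring words with a byte shuffle whose index is derived
+ * from the pointer's misalignment. */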
+#if !defined(__iset__) || __iset__ < 5
+#define NEED_ALIGN8
+#define ALIGN8_COMMON uint64_t src_shr; __m64 src_tmp0, src_tmp1;
+#define ALIGN8_VARS(src) __m64 *src##_ptr, src##_next, src##_index;
+#define ALIGN8_START(ptr, src) \
+ src_shr = (intptr_t)(ptr - 1) & 7; \
+ src##_ptr = (__m64*)((intptr_t)(ptr - 1) & -8); \
+ src##_next = src##_ptr[src_shr == 7]; \
+ src##_index = _mm_add_pi8(_mm_set1_pi8(src_shr), \
+ _mm_setr_pi8(1, 2, 3, 4, 5, 6, 7, 8));
+#define ALIGN8_READ16(v0, src, i) \
+ src_tmp1 = src##_ptr[i * 2 + 1]; \
+ src_tmp0 = _mm_shuffle2_pi8(src##_next, src_tmp1, src##_index); \
+ src##_next = src##_ptr[i * 2 + 2]; \
+ src_tmp1 = _mm_shuffle2_pi8(src_tmp1, src##_next, src##_index); \
+ v0 = _mm_setr_epi64(src_tmp0, src_tmp1);
+#endif
+
--
2.45.0