OpenJPH
Open-source implementation of JPEG2000 Part-15
Loading...
Searching...
No Matches
ojph_codestream_avx2.cpp
Go to the documentation of this file.
1//***************************************************************************/
2// This software is released under the 2-Clause BSD license, included
3// below.
4//
5// Copyright (c) 2022, Aous Naman
6// Copyright (c) 2022, Kakadu Software Pty Ltd, Australia
7// Copyright (c) 2022, The University of New South Wales, Australia
8//
9// Redistribution and use in source and binary forms, with or without
10// modification, are permitted provided that the following conditions are
11// met:
12//
13// 1. Redistributions of source code must retain the above copyright
14// notice, this list of conditions and the following disclaimer.
15//
16// 2. Redistributions in binary form must reproduce the above copyright
17// notice, this list of conditions and the following disclaimer in the
18// documentation and/or other materials provided with the distribution.
19//
20// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
21// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
23// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
26// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31//***************************************************************************/
32// This file is part of the OpenJPH software implementation.
33// File: ojph_codestream_avx2.cpp
34// Author: Aous Naman
35// Date: 15 May 2022
36//***************************************************************************/
37
38#include <climits>
39#include <immintrin.h>
40#include "ojph_defs.h"
41#include "ojph_arch.h"
42
43namespace ojph {
44 namespace local {
45
48 {
49 __m128i x0 = _mm_loadu_si128((__m128i*)address);
50 __m128i x1 = _mm_loadu_si128((__m128i*)address + 1);
51 x0 = _mm_or_si128(x0, x1);
52 x1 = _mm_shuffle_epi32(x0, 0xEE); // x1 = x0[2,3,2,3]
53 x0 = _mm_or_si128(x0, x1);
54 x1 = _mm_shuffle_epi32(x0, 0x55); // x1 = x0[1,1,1,1]
55 x0 = _mm_or_si128(x0, x1);
56 ui32 t = (ui32)_mm_extract_epi32(x0, 0);
57 return t;
58 }
59
62 {
63 __m128i x0 = _mm_loadu_si128((__m128i*)address);
64 __m128i x1 = _mm_loadu_si128((__m128i*)address + 1);
65 x0 = _mm_or_si128(x0, x1);
66 x1 = _mm_shuffle_epi32(x0, 0xEE); // x1 = x0[2,3,2,3]
67 x0 = _mm_or_si128(x0, x1);
68 ui64 t;
69#ifdef OJPH_ARCH_X86_64
70 t = (ui64)_mm_extract_epi64(x0, 0);
71#elif (defined OJPH_ARCH_I386)
72 t = (ui64)(ui32)_mm_extract_epi32(x0, 0);
73 t |= (ui64)(ui32)_mm_extract_epi32(x0, 1) << 32;
74#else
75 #error Error unsupport compiler
76#endif
77 return t;
78 }
79
81 void avx2_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
82 float delta_inv, ui32 count, ui32* max_val)
83 {
84 ojph_unused(delta_inv);
85
86 // convert to sign and magnitude and keep max_val
87 ui32 shift = 31 - K_max;
88 __m256i m0 = _mm256_set1_epi32(INT_MIN);
89 __m256i tmax = _mm256_loadu_si256((__m256i*)max_val);
90 __m256i *p = (__m256i*)sp;
91 for ( ; count >= 8; count -= 8, p += 1, dp += 8)
92 {
93 __m256i v = _mm256_loadu_si256(p);
94 __m256i sign = _mm256_and_si256(v, m0);
95 __m256i val = _mm256_abs_epi32(v);
96 val = _mm256_slli_epi32(val, (int)shift);
97 tmax = _mm256_or_si256(tmax, val);
98 val = _mm256_or_si256(val, sign);
99 _mm256_storeu_si256((__m256i*)dp, val);
100 }
101 if (count)
102 {
103 __m256i v = _mm256_loadu_si256(p);
104 __m256i sign = _mm256_and_si256(v, m0);
105 __m256i val = _mm256_abs_epi32(v);
106 val = _mm256_slli_epi32(val, (int)shift);
107
108 __m256i c = _mm256_set1_epi32((si32)count);
109 __m256i idx = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
110 __m256i mask = _mm256_cmpgt_epi32(c, idx);
111 c = _mm256_and_si256(val, mask);
112 tmax = _mm256_or_si256(tmax, c);
113
114 val = _mm256_or_si256(val, sign);
115 _mm256_storeu_si256((__m256i*)dp, val);
116 }
117 _mm256_storeu_si256((__m256i*)max_val, tmax);
118 }
119
121 void avx2_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
122 float delta_inv, ui32 count, ui32* max_val)
123 {
124 ojph_unused(K_max);
125
126 //quantize and convert to sign and magnitude and keep max_val
127 __m256 d = _mm256_set1_ps(delta_inv);
128 __m256i m0 = _mm256_set1_epi32(INT_MIN);
129 __m256i tmax = _mm256_loadu_si256((__m256i*)max_val);
130 float *p = (float*)sp;
131
132 for ( ; count >= 8; count -= 8, p += 8, dp += 8)
133 {
134 __m256 vf = _mm256_loadu_ps(p);
135 vf = _mm256_mul_ps(vf, d); // multiply
136 __m256i val = _mm256_cvtps_epi32(vf); // convert to int
137 __m256i sign = _mm256_and_si256(val, m0); // get sign
138 val = _mm256_abs_epi32(val);
139 tmax = _mm256_or_si256(tmax, val);
140 val = _mm256_or_si256(val, sign);
141 _mm256_storeu_si256((__m256i*)dp, val);
142 }
143 if (count)
144 {
145 __m256 vf = _mm256_loadu_ps(p);
146 vf = _mm256_mul_ps(vf, d); // multiply
147 __m256i val = _mm256_cvtps_epi32(vf); // convert to int
148 __m256i sign = _mm256_and_si256(val, m0); // get sign
149 val = _mm256_abs_epi32(val);
150
151 __m256i c = _mm256_set1_epi32((si32)count);
152 __m256i idx = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
153 __m256i mask = _mm256_cmpgt_epi32(c, idx);
154 c = _mm256_and_si256(val, mask);
155 tmax = _mm256_or_si256(tmax, c);
156
157 val = _mm256_or_si256(val, sign);
158 _mm256_storeu_si256((__m256i*)dp, val);
159 }
160 _mm256_storeu_si256((__m256i*)max_val, tmax);
161 }
162
164 void avx2_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max,
165 float delta, ui32 count)
166 {
167 ojph_unused(delta);
168 ui32 shift = 31 - K_max;
169 __m256i m1 = _mm256_set1_epi32(INT_MAX);
170 si32 *p = (si32*)dp;
171 for (ui32 i = 0; i < count; i += 8, sp += 8, p += 8)
172 {
173 __m256i v = _mm256_load_si256((__m256i*)sp);
174 __m256i val = _mm256_and_si256(v, m1);
175 val = _mm256_srli_epi32(val, (int)shift);
176 val = _mm256_sign_epi32(val, v);
177 _mm256_storeu_si256((__m256i*)p, val);
178 }
179 }
180
182 void avx2_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max,
183 float delta, ui32 count)
184 {
185 ojph_unused(K_max);
186 __m256i m1 = _mm256_set1_epi32(INT_MAX);
187 __m256 d = _mm256_set1_ps(delta);
188 float *p = (float*)dp;
189 for (ui32 i = 0; i < count; i += 8, sp += 8, p += 8)
190 {
191 __m256i v = _mm256_load_si256((__m256i*)sp);
192 __m256i vali = _mm256_and_si256(v, m1);
193 __m256 valf = _mm256_cvtepi32_ps(vali);
194 valf = _mm256_mul_ps(valf, d);
195 __m256i sign = _mm256_andnot_si256(m1, v);
196 valf = _mm256_or_ps(valf, _mm256_castsi256_ps(sign));
197 _mm256_storeu_ps(p, valf);
198 }
199 }
200
202 void avx2_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max,
203 float delta_inv, ui32 count, ui64* max_val)
204 {
205 ojph_unused(delta_inv);
206
207 // convert to sign and magnitude and keep max_val
208 ui32 shift = 63 - K_max;
209 __m256i m0 = _mm256_set1_epi64x(LLONG_MIN);
210 __m256i zero = _mm256_setzero_si256();
211 __m256i one = _mm256_set1_epi64x(1);
212 __m256i tmax = _mm256_loadu_si256((__m256i*)max_val);
213 __m256i *p = (__m256i*)sp;
214 for ( ; count >= 4; count -= 4, p += 1, dp += 4)
215 {
216 __m256i v = _mm256_loadu_si256(p);
217 __m256i sign = _mm256_cmpgt_epi64(zero, v);
218 __m256i val = _mm256_xor_si256(v, sign); // negate 1's complement
219 __m256i ones = _mm256_and_si256(sign, one);
220 val = _mm256_add_epi64(val, ones); // 2's complement
221 sign = _mm256_and_si256(sign, m0);
222 val = _mm256_slli_epi64(val, (int)shift);
223 tmax = _mm256_or_si256(tmax, val);
224 val = _mm256_or_si256(val, sign);
225 _mm256_storeu_si256((__m256i*)dp, val);
226 }
227 if (count)
228 {
229 __m256i v = _mm256_loadu_si256(p);
230 __m256i sign = _mm256_cmpgt_epi64(zero, v);
231 __m256i val = _mm256_xor_si256(v, sign); // negate 1's complement
232 __m256i ones = _mm256_and_si256(sign, one);
233 val = _mm256_add_epi64(val, ones); // 2's complement
234 sign = _mm256_and_si256(sign, m0);
235 val = _mm256_slli_epi64(val, (int)shift);
236
237 __m256i c = _mm256_set1_epi64x(count);
238 __m256i idx = _mm256_set_epi64x(3, 2, 1, 0);
239 __m256i mask = _mm256_cmpgt_epi64(c, idx);
240 c = _mm256_and_si256(val, mask);
241 tmax = _mm256_or_si256(tmax, c);
242
243 val = _mm256_or_si256(val, sign);
244 _mm256_storeu_si256((__m256i*)dp, val);
245 }
246 _mm256_storeu_si256((__m256i*)max_val, tmax);
247 }
248
250 void avx2_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max,
251 float delta, ui32 count)
252 {
253 ojph_unused(delta);
254
255 ui32 shift = 63 - K_max;
256 __m256i m1 = _mm256_set1_epi64x(LLONG_MAX);
257 __m256i zero = _mm256_setzero_si256();
258 __m256i one = _mm256_set1_epi64x(1);
259 si64 *p = (si64*)dp;
260 for (ui32 i = 0; i < count; i += 4, sp += 4, p += 4)
261 {
262 __m256i v = _mm256_load_si256((__m256i*)sp);
263 __m256i val = _mm256_and_si256(v, m1);
264 val = _mm256_srli_epi64(val, (int)shift);
265 __m256i sign = _mm256_cmpgt_epi64(zero, v);
266 val = _mm256_xor_si256(val, sign); // negate 1's complement
267 __m256i ones = _mm256_and_si256(sign, one);
268 val = _mm256_add_epi64(val, ones); // 2's complement
269 _mm256_storeu_si256((__m256i*)p, val);
270 }
271 }
272 }
273}
ui64 avx2_find_max_val64(ui64 *address)
void avx2_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, float delta_inv, ui32 count, ui32 *max_val)
void avx2_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max, float delta, ui32 count)
ui32 avx2_find_max_val32(ui32 *address)
void avx2_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, float delta_inv, ui32 count, ui32 *max_val)
void avx2_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max, float delta_inv, ui32 count, ui64 *max_val)
void avx2_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, float delta, ui32 count)
void avx2_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, float delta, ui32 count)
int64_t si64
Definition ojph_defs.h:57
uint64_t ui64
Definition ojph_defs.h:56
int32_t si32
Definition ojph_defs.h:55
uint32_t ui32
Definition ojph_defs.h:54
#define ojph_unused(x)
Definition ojph_defs.h:78