// Fragment: horizontal OR-reduction of four 32-bit lanes down to one scalar.
// NOTE(review): the enclosing function's signature is not visible in this
// chunk; presumably this folds an accumulated magnitude-bit vector (max_val)
// into a single ui32 -- confirm against the full file.
59 v128_t x1, x0 = wasm_v128_load(address);
// Fold the high 64 bits onto the low 64 bits (lanes {2,3} OR'ed into {0,1}).
60 x1 = wasm_i32x4_shuffle(x0, x0, 2, 3, 2, 3);
61 x0 = wasm_v128_or(x0, x1);
// Fold lane 1 onto lane 0; lane 0 now holds the OR of all four input lanes.
62 x1 = wasm_i32x4_shuffle(x0, x0, 1, 1, 1, 1);
63 x0 = wasm_v128_or(x0, x1);
// Extract the fully-reduced value (cast is split across two source lines by
// the extraction; tokens kept as-is).
64 ui32 t = (
ui32)wasm_i32x4_extract_lane(x0, 0);
// Fragment: 32-bit reversible transfer INTO a codeblock buffer.
// Converts two's-complement samples at p to sign-magnitude at dp, left-shifts
// magnitudes so the MSB of the magnitude aligns with bit (30) via
// shift = 31 - K_max, and ORs all produced magnitudes into *max_val.
// NOTE(review): the function name and leading parameters (source/dest
// pointers, K_max) are on lines missing from this chunk; delta_inv is
// presumably unused in this reversible path -- confirm in the full file.
80 float delta_inv,
ui32 count,
ui32* max_val)
// shift moves the K_max magnitude bits up to the top of the 31-bit field.
85 ui32 shift = 31 - K_max;
// m0 = 0x80000000 in every lane: isolates the sign bit.
86 v128_t m0 = wasm_i32x4_splat(INT_MIN);
87 v128_t zero = wasm_i32x4_splat(0);
88 v128_t one = wasm_i32x4_splat(1);
// tmax accumulates the OR of all magnitude bits seen so far.
89 v128_t tmax = wasm_v128_load(max_val);
// Main loop: 4 samples per iteration.
91 for ( ; count >= 4; count -= 4, p += 4, dp += 4)
93 v128_t v = wasm_v128_load(p);
// sign = all-ones in lanes where v < 0 (signed compare), else all-zeros.
94 v128_t sign = wasm_i32x4_lt(v, zero);
// Two's complement -> absolute value: xor with all-ones gives one's
// complement for negative lanes; adding 1 (only in negative lanes)
// completes the negation. Positive lanes are untouched.
95 v128_t val = wasm_v128_xor(v, sign);
96 v128_t ones = wasm_v128_and(sign, one);
97 val = wasm_i32x4_add(val, ones);
// Keep only the sign bit of the mask.
98 sign = wasm_v128_and(sign, m0);
// Align magnitude to the MSB-side of the 31-bit field.
99 val = wasm_i32x4_shl(val, shift);
// Accumulate magnitude bits BEFORE the sign bit is OR'ed in.
100 tmax = wasm_v128_or(tmax, val);
101 val = wasm_v128_or(val, sign);
102 wasm_v128_store(dp, val);
// Leftover path (0 < count < 4): same conversion as above...
106 v128_t v = wasm_v128_load(p);
107 v128_t sign = wasm_i32x4_lt(v, zero);
108 v128_t val = wasm_v128_xor(v, sign);
109 v128_t ones = wasm_v128_and(sign, one);
110 val = wasm_i32x4_add(val, ones);
111 sign = wasm_v128_and(sign, m0);
112 val = wasm_i32x4_shl(val, shift);
// ...but only lanes with index < count may contribute to tmax, so lanes
// past the end of the buffer cannot pollute the accumulated maximum.
114 v128_t c = wasm_i32x4_splat((
si32)count);
115 v128_t idx = wasm_i32x4_make(0, 1, 2, 3);
116 v128_t mask = wasm_i32x4_gt(c, idx);
117 c = wasm_v128_and(val, mask);
118 tmax = wasm_v128_or(tmax, c);
// The full 4-lane store is still performed; presumably dp's buffer is
// padded to a multiple of 4 lanes -- TODO confirm with allocation code.
120 val = wasm_v128_or(val, sign);
121 wasm_v128_store(dp, val);
// Publish the accumulated magnitude bits back to the caller.
123 wasm_v128_store(max_val, tmax);
// Fragment: 32-bit irreversible (float) transfer INTO a codeblock buffer.
// Scales float samples by delta_inv, truncates to integers (saturating),
// converts to sign-magnitude, and ORs all magnitudes into *max_val.
// NOTE(review): the function name and leading parameters are on lines
// missing from this chunk.
128 float delta_inv,
ui32 count,
ui32* max_val)
// Broadcast the quantization step reciprocal to all four lanes.
134 v128_t d = wasm_f32x4_splat(delta_inv);
135 v128_t zero = wasm_i32x4_splat(0);
136 v128_t one = wasm_i32x4_splat(1);
// tmax accumulates the OR of all produced magnitudes.
137 v128_t tmax = wasm_v128_load(max_val);
// Reinterpret the source as floats (cast split across two source lines).
138 float *p = (
float*)sp;
// Main loop: 4 samples per iteration.
139 for ( ; count >= 4; count -= 4, p += 4, dp += 4)
141 v128_t vf = wasm_v128_load(p);
142 vf = wasm_f32x4_mul(vf, d);
// Truncate toward zero with saturation (NaN -> 0, overflow clamps).
143 v128_t val = wasm_i32x4_trunc_sat_f32x4(vf);
// sign = all-ones where the truncated value is negative.
144 v128_t sign = wasm_i32x4_lt(val, zero);
// Negate negative lanes: xor (one's complement) then +1.
145 val = wasm_v128_xor(val, sign);
146 v128_t ones = wasm_v128_and(sign, one);
147 val = wasm_i32x4_add(val, ones);
// Accumulate magnitude bits before the sign bit is attached.
148 tmax = wasm_v128_or(tmax, val);
// all-ones << 31 leaves exactly the sign bit set in negative lanes.
149 sign = wasm_i32x4_shl(sign, 31);
150 val = wasm_v128_or(val, sign);
151 wasm_v128_store(dp, val);
// Leftover path (0 < count < 4): same conversion...
155 v128_t vf = wasm_v128_load(p);
156 vf = wasm_f32x4_mul(vf, d);
157 v128_t val = wasm_i32x4_trunc_sat_f32x4(vf);
158 v128_t sign = wasm_i32x4_lt(val, zero);
159 val = wasm_v128_xor(val, sign);
160 v128_t ones = wasm_v128_and(sign, one);
161 val = wasm_i32x4_add(val, ones);
// ...but only lanes with index < count may contribute to tmax.
163 v128_t c = wasm_i32x4_splat((
si32)count);
164 v128_t idx = wasm_i32x4_make(0, 1, 2, 3);
165 v128_t mask = wasm_i32x4_gt(c, idx);
166 c = wasm_v128_and(val, mask);
167 tmax = wasm_v128_or(tmax, c);
// Full 4-lane store; presumably the destination buffer is padded -- TODO
// confirm with allocation code.
169 sign = wasm_i32x4_shl(sign, 31);
170 val = wasm_v128_or(val, sign);
171 wasm_v128_store(dp, val);
// Publish accumulated magnitude bits back to the caller.
173 wasm_v128_store(max_val, tmax);
// Fragment: 32-bit reversible transfer OUT of a codeblock buffer.
// Decodes sign-magnitude samples back to two's complement: strips the sign
// bit, shifts magnitudes back down by 31 - K_max, and negates lanes whose
// sign bit was set.
// NOTE(review): the function name and leading parameters are on lines
// missing from this chunk; delta is presumably unused in this reversible
// path -- confirm in the full file.
178 float delta,
ui32 count)
181 ui32 shift = 31 - K_max;
// m1 = 0x7FFFFFFF in every lane: clears the sign bit.
182 v128_t m1 = wasm_i32x4_splat(INT_MAX);
183 v128_t zero = wasm_i32x4_splat(0);
184 v128_t one = wasm_i32x4_splat(1);
// count is rounded up to a multiple of 4 by the loop; buffers are
// presumably padded accordingly -- TODO confirm with allocation code.
186 for (
ui32 i = 0; i < count; i += 4, sp += 4, p += 4)
188 v128_t v = wasm_v128_load((v128_t*)sp);
// Drop the sign bit, then shift the magnitude back to its natural
// position. With the sign bit cleared, the (arithmetic) shr behaves
// like a logical shift.
189 v128_t val = wasm_v128_and(v, m1);
190 val = wasm_i32x4_shr(val, shift);
// sign = all-ones where the ORIGINAL word had its sign bit set
// (signed compare on the unmasked value).
191 v128_t sign = wasm_i32x4_lt(v, zero);
// Negate those lanes: one's complement via xor, then +1.
192 val = wasm_v128_xor(val, sign);
193 v128_t ones = wasm_v128_and(sign, one);
194 val = wasm_i32x4_add(val, ones);
195 wasm_v128_store(p, val);
// Fragment: 32-bit irreversible (float) transfer OUT of a codeblock buffer.
// Decodes sign-magnitude samples to floats: converts the magnitude to float,
// scales by delta, then transplants the sign bit directly into the float
// (IEEE-754 sign bit is also the MSB).
// NOTE(review): the function name and leading parameters are on lines
// missing from this chunk.
201 float delta,
ui32 count)
// m1 = 0x7FFFFFFF in every lane: clears the sign bit.
204 v128_t m1 = wasm_i32x4_splat(INT_MAX);
// Broadcast the quantization step size.
205 v128_t d = wasm_f32x4_splat(delta);
// Reinterpret the destination as floats (cast split across two lines).
206 float *p = (
float*)dp;
// count is rounded up to a multiple of 4; buffers presumably padded --
// TODO confirm with allocation code.
207 for (
ui32 i = 0; i < count; i += 4, sp += 4, p += 4)
209 v128_t v = wasm_v128_load((v128_t*)sp);
// Magnitude only (sign bit cleared), converted to float and dequantized.
210 v128_t vali = wasm_v128_and(v, m1);
211 v128_t valf = wasm_f32x4_convert_i32x4(vali);
212 valf = wasm_f32x4_mul(valf, d);
// wasm_v128_andnot(a, b) computes a & ~b, so this is v & ~INT_MAX:
// exactly the sign bit of each lane. OR-ing it onto the float negates
// the lanes that were negative (magnitude is non-negative, so its
// float sign bit is 0 before the OR).
213 v128_t sign = wasm_v128_andnot(v, m1);
214 valf = wasm_v128_or(valf, sign);
215 wasm_v128_store(p, valf);
// Fragment: 64-bit reversible transfer INTO a codeblock buffer.
// Same scheme as the 32-bit version but with two 64-bit lanes per vector:
// two's complement -> sign-magnitude, magnitudes left-aligned via
// shift = 63 - K_max, and all magnitude bits OR'ed into *max_val.
// NOTE(review): the function name and leading parameters are on lines
// missing from this chunk; delta_inv is presumably unused in this
// reversible path -- confirm in the full file.
221 float delta_inv,
ui32 count,
ui64* max_val)
226 ui32 shift = 63 - K_max;
// m0 = 0x8000000000000000 in each lane: isolates the sign bit.
227 v128_t m0 = wasm_i64x2_splat(LLONG_MIN);
228 v128_t zero = wasm_i64x2_splat(0);
229 v128_t one = wasm_i64x2_splat(1);
230 v128_t tmax = wasm_v128_load(max_val);
// Main loop: 2 samples per iteration.
232 for ( ; count >= 2; count -= 2, p += 2, dp += 2)
234 v128_t v = wasm_v128_load(p);
// sign = all-ones where v < 0 (signed 64-bit compare).
235 v128_t sign = wasm_i64x2_lt(v, zero);
// Negate negative lanes: xor (one's complement) then +1.
236 v128_t val = wasm_v128_xor(v, sign);
237 v128_t ones = wasm_v128_and(sign, one);
238 val = wasm_i64x2_add(val, ones);
239 sign = wasm_v128_and(sign, m0);
// Align magnitude to the top of the 63-bit field.
240 val = wasm_i64x2_shl(val, shift);
// Accumulate magnitude bits before the sign bit is OR'ed in.
241 tmax = wasm_v128_or(tmax, val);
242 val = wasm_v128_or(val, sign);
243 wasm_v128_store(dp, val);
// Leftover path (count == 1): same conversion...
247 v128_t v = wasm_v128_load(p);
248 v128_t sign = wasm_i64x2_lt(v, zero);
249 v128_t val = wasm_v128_xor(v, sign);
250 v128_t ones = wasm_v128_and(sign, one);
251 val = wasm_i64x2_add(val, ones);
252 sign = wasm_v128_and(sign, m0);
253 val = wasm_i64x2_shl(val, shift);
// ...but only the first 64-bit lane may contribute to tmax: the mask
// below is all-ones in the low 64 bits and zero in the high 64 bits.
255 v128_t c = wasm_i32x4_make((
si32)0xFFFFFFFF, (
si32)0xFFFFFFFF, 0, 0);
256 c = wasm_v128_and(val, c);
257 tmax = wasm_v128_or(tmax, c);
// Full 2-lane store; presumably the destination buffer is padded -- TODO
// confirm with allocation code.
259 val = wasm_v128_or(val, sign);
260 wasm_v128_store(dp, val);
// Publish accumulated magnitude bits back to the caller.
263 wasm_v128_store(max_val, tmax);
// Fragment: 64-bit reversible transfer OUT of a codeblock buffer.
// Decodes two 64-bit sign-magnitude lanes per vector back to two's
// complement, mirroring the 32-bit variant.
// NOTE(review): the function name and leading parameters are on lines
// missing from this chunk, and the fragment ends mid-loop (closing braces
// not visible); delta is presumably unused here -- confirm in full file.
268 float delta,
ui32 count)
271 ui32 shift = 63 - K_max;
// m1 = 0x7FFF...FF in each lane: clears the sign bit.
272 v128_t m1 = wasm_i64x2_splat(LLONG_MAX);
273 v128_t zero = wasm_i64x2_splat(0);
274 v128_t one = wasm_i64x2_splat(1);
// count is rounded up to a multiple of 2; buffers presumably padded --
// TODO confirm with allocation code.
276 for (
ui32 i = 0; i < count; i += 2, sp += 2, p += 2)
278 v128_t v = wasm_v128_load((v128_t*)sp);
// Drop the sign bit, then shift the magnitude back down. With the sign
// bit cleared, the (arithmetic) i64x2 shr behaves like a logical shift.
279 v128_t val = wasm_v128_and(v, m1);
280 val = wasm_i64x2_shr(val, shift);
// sign = all-ones where the ORIGINAL word had its sign bit set.
281 v128_t sign = wasm_i64x2_lt(v, zero);
// Negate those lanes: one's complement via xor, then +1.
282 val = wasm_v128_xor(val, sign);
283 v128_t ones = wasm_v128_and(sign, one);
284 val = wasm_i64x2_add(val, ones);
285 wasm_v128_store(p, val);