OpenJPH
Open-source implementation of JPEG2000 Part-15
Loading...
Searching...
No Matches
ojph_block_encoder_avx2_apple.h
Go to the documentation of this file.
1//***************************************************************************/
2// This software is released under the 2-Clause BSD license, included
3// below.
4//
5// Copyright (c) 2019, Aous Naman
6// Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
7// Copyright (c) 2019, The University of New South Wales, Australia
8// Copyright (c) 2024, Intel Corporation
9// Copyright (c) 2026, Osamu Watanabe
10//
11// Redistribution and use in source and binary forms, with or without
12// modification, are permitted provided that the following conditions are
13// met:
14//
15// 1. Redistributions of source code must retain the above copyright
16// notice, this list of conditions and the following disclaimer.
17//
18// 2. Redistributions in binary form must reproduce the above copyright
19// notice, this list of conditions and the following disclaimer in the
20// documentation and/or other materials provided with the distribution.
21//
22// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
23// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
24// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
25// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
26// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
28// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
29// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
30// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
31// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
32// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33//***************************************************************************/
34// This file is part of the OpenJPH software implementation.
35// File: ojph_block_encoder_avx2.cpp
36//***************************************************************************/
37
38#include "ojph_arch.h"
39#if defined(OJPH_ARCH_I386) || defined(OJPH_ARCH_X86_64)
40
41#include <cassert>
42#include <cstring>
43#include <cstdint>
44#include <climits>
45#include <immintrin.h>
46#include <mutex>
47
48#include "ojph_mem.h"
49#include "ojph_arch.h"
50#include "ojph_block_encoder.h"
51#include "ojph_message.h"
52
// Branch-prediction hints. Compilers other than MSVC get __builtin_expect;
// under MSVC the macros simply evaluate to their argument.
#ifndef OJPH_COMPILER_MSVC
  #define likely(x) __builtin_expect((x), 1)
  #define unlikely(x) __builtin_expect((x), 0)
#else
  #define likely(x) (x)
  #define unlikely(x) (x)
#endif
60
61namespace ojph {
62 namespace local {
63
65 // tables
67
68 //VLC encoding
69 // index is (c_q << 8) + (rho << 4) + eps
70 // data is (cwd << 8) + (cwd_len << 4) + eps
71 // table 0 is for the initial line of quads
72 static ui32 vlc_tbl0[2048];
73 static ui32 vlc_tbl1[2048];
74
75 //UVLC encoding
76 static ui32 ulvc_cwd_pre[33];
77 static int ulvc_cwd_pre_len[33];
78 static ui32 ulvc_cwd_suf[33];
79 static int ulvc_cwd_suf_len[33];
80
82 static bool vlc_init_tables()
83 {
84 struct vlc_src_table { int c_q, rho, u_off, e_k, e_1, cwd, cwd_len; };
85 vlc_src_table tbl0[] = {
86 #include "table0.h"
87 };
88 size_t tbl0_size = sizeof(tbl0) / sizeof(vlc_src_table);
89
90 si32 pattern_popcnt[16];
91 for (ui32 i = 0; i < 16; ++i)
92 pattern_popcnt[i] = (si32)population_count(i);
93
94 vlc_src_table* src_tbl = tbl0;
95 ui32 *tgt_tbl = vlc_tbl0;
96 size_t tbl_size = tbl0_size;
97 for (int i = 0; i < 2048; ++i)
98 {
99 int c_q = i >> 8, rho = (i >> 4) & 0xF, emb = i & 0xF;
100 if (((emb & rho) != emb) || (rho == 0 && c_q == 0))
101 tgt_tbl[i] = 0;
102 else
103 {
104 vlc_src_table *best_entry = NULL;
105 if (emb) // u_off = 1
106 {
107 int best_e_k = -1;
108 for (size_t j = 0; j < tbl_size; ++j)
109 {
110 if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho)
111 if (src_tbl[j].u_off == 1)
112 if ((emb & src_tbl[j].e_k) == src_tbl[j].e_1)
113 {
114 //now we need to find the smallest cwd with the highest
115 // number of bits set in e_k
116 int ones_count = pattern_popcnt[src_tbl[j].e_k];
117 if (ones_count >= best_e_k)
118 {
119 best_entry = src_tbl + j;
120 best_e_k = ones_count;
121 }
122 }
123 }
124 }
125 else // u_off = 0
126 {
127 for (size_t j = 0; j < tbl_size; ++j)
128 {
129 if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho)
130 if (src_tbl[j].u_off == 0)
131 {
132 best_entry = src_tbl + j;
133 break;
134 }
135 }
136 }
137 assert(best_entry);
138 tgt_tbl[i] = (ui16)((best_entry->cwd<<8) + (best_entry->cwd_len<<4)
139 + best_entry->e_k);
140 }
141 }
142
143 vlc_src_table tbl1[] = {
144 #include "table1.h"
145 };
146 size_t tbl1_size = sizeof(tbl1) / sizeof(vlc_src_table);
147
148 src_tbl = tbl1;
149 tgt_tbl = vlc_tbl1;
150 tbl_size = tbl1_size;
151 for (int i = 0; i < 2048; ++i)
152 {
153 int c_q = i >> 8, rho = (i >> 4) & 0xF, emb = i & 0xF;
154 if (((emb & rho) != emb) || (rho == 0 && c_q == 0))
155 tgt_tbl[i] = 0;
156 else
157 {
158 vlc_src_table *best_entry = NULL;
159 if (emb) // u_off = 1
160 {
161 int best_e_k = -1;
162 for (size_t j = 0; j < tbl_size; ++j)
163 {
164 if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho)
165 if (src_tbl[j].u_off == 1)
166 if ((emb & src_tbl[j].e_k) == src_tbl[j].e_1)
167 {
168 //now we need to find the smallest cwd with the highest
169 // number of bits set in e_k
170 int ones_count = pattern_popcnt[src_tbl[j].e_k];
171 if (ones_count >= best_e_k)
172 {
173 best_entry = src_tbl + j;
174 best_e_k = ones_count;
175 }
176 }
177 }
178 }
179 else // u_off = 0
180 {
181 for (size_t j = 0; j < tbl_size; ++j)
182 {
183 if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho)
184 if (src_tbl[j].u_off == 0)
185 {
186 best_entry = src_tbl + j;
187 break;
188 }
189 }
190 }
191 assert(best_entry);
192 tgt_tbl[i] = (ui16)((best_entry->cwd<<8) + (best_entry->cwd_len<<4)
193 + best_entry->e_k);
194 }
195 }
196
197
198 return true;
199 }
200
202 static bool uvlc_init_tables()
203 {
204 //code goes from 0 to 31, extension and 32 are not supported here
205 ulvc_cwd_pre[0] = 0; ulvc_cwd_pre[1] = 1; ulvc_cwd_pre[2] = 2;
206 ulvc_cwd_pre[3] = 4; ulvc_cwd_pre[4] = 4;
207 ulvc_cwd_pre_len[0] = 0; ulvc_cwd_pre_len[1] = 1;
208 ulvc_cwd_pre_len[2] = 2;
209 ulvc_cwd_pre_len[3] = 3; ulvc_cwd_pre_len[4] = 3;
210 ulvc_cwd_suf[0] = 0; ulvc_cwd_suf[1] = 0; ulvc_cwd_suf[2] = 0;
211 ulvc_cwd_suf[3] = 0; ulvc_cwd_suf[4] = 1;
212 ulvc_cwd_suf_len[0] = 0; ulvc_cwd_suf_len[1] = 0;
213 ulvc_cwd_suf_len[2] = 0;
214 ulvc_cwd_suf_len[3] = 1; ulvc_cwd_suf_len[4] = 1;
215 for (int i = 5; i < 33; ++i)
216 {
217 ulvc_cwd_pre[i] = 0;
218 ulvc_cwd_pre_len[i] = 3;
219 ulvc_cwd_suf[i] = (ui32)(i-5);
220 ulvc_cwd_suf_len[i] = 5;
221 }
222 return true;
223 }
224
// NOTE(review): the signature line (and opening brace) of the function that
// owns the statements below is absent from this listing -- only its body and
// closing brace are visible. TODO confirm the name and return type against
// the original source before relying on this block.
//
// The body initializes the VLC and UVLC tables and reports the result.
// Result of the table initialization, as set inside the call_once lambda.
227 static bool tables_initialized = false;
// Flag handed to std::call_once so the lambda below is executed only once.
228 static std::once_flag tables_initialized_flag;
229 std::call_once(tables_initialized_flag, []() {
// zero both VLC tables before they are filled
230 memset(vlc_tbl0, 0, 2048 * sizeof(ui32));
231 memset(vlc_tbl1, 0, 2048 * sizeof(ui32));
// uvlc_init_tables() is only called if vlc_init_tables() returned true
232 tables_initialized = vlc_init_tables();
233 tables_initialized = tables_initialized && uvlc_init_tables();
234 });
235 return tables_initialized;
236 }
237
239 //
241 struct mel_struct {
242 //storage
243 ui8* buf; //pointer to data buffer
244 ui32 pos; //position of next writing within buf
245 ui32 buf_size; //size of buffer, which we must not exceed
246
247 // all these can be replaced by bytes
248 int remaining_bits; //number of empty bits in tmp
249 int tmp; //temporary storage of coded bits
250 int run; //number of 0 run
251 int k; //state
252 int threshold; //threshold where one bit must be coded
253 };
254
256 static inline void
257 mel_init(mel_struct* melp, ui32 buffer_size, ui8* data)
258 {
259 melp->buf = data;
260 melp->pos = 0;
261 melp->buf_size = buffer_size;
262 melp->remaining_bits = 8;
263 melp->tmp = 0;
264 melp->run = 0;
265 melp->k = 0;
266 melp->threshold = 1; // this is 1 << mel_exp[melp->k];
267 }
268
270 static inline void
271 mel_emit_bit(mel_struct* melp, int v)
272 {
273 melp->tmp = (melp->tmp << 1) + v;
274 melp->remaining_bits--;
275 if (melp->remaining_bits == 0) {
276 melp->buf[melp->pos++] = (ui8)melp->tmp;
277 melp->remaining_bits = (melp->tmp == 0xFF ? 7 : 8);
278 melp->tmp = 0;
279 }
280 }
281
283 static inline void
284 mel_encode(mel_struct* melp, bool bit)
285 {
286 //MEL exponent
287 static const int mel_exp[13] = {0,0,0,1,1,1,2,2,2,3,3,4,5};
288
289 if (bit == false) {
290 ++melp->run;
291 if (melp->run >= melp->threshold) {
292 mel_emit_bit(melp, 1);
293 melp->run = 0;
294 melp->k = ojph_min(12, melp->k + 1);
295 melp->threshold = 1 << mel_exp[melp->k];
296 }
297 } else {
298 mel_emit_bit(melp, 0);
299 int t = mel_exp[melp->k];
300 while (t > 0) {
301 mel_emit_bit(melp, (melp->run >> --t) & 1);
302 }
303 melp->run = 0;
304 melp->k = ojph_max(0, melp->k - 1);
305 melp->threshold = 1 << mel_exp[melp->k];
306 }
307 }
308
310 //
312 struct vlc_struct_avx2 {
313 //storage
314 ui8* buf; //pointer to data buffer
315 ui32 pos; //position of next writing within buf
316 ui32 buf_size; //size of buffer, which we must not exceed
317
318 int used_bits; //number of occupied bits in tmp
319 ui64 tmp; //temporary storage of coded bits
320 bool last_greater_than_8F; //true if last byte us greater than 0x8F
321 };
322
324 static inline void
325 vlc_init(vlc_struct_avx2* vlcp, ui32 buffer_size, ui8* data)
326 {
327 vlcp->buf = data + buffer_size - 1; //points to last byte
328 vlcp->pos = 1; //locations will be all -pos
329 vlcp->buf_size = buffer_size;
330
331 vlcp->buf[0] = 0xFF;
332 vlcp->used_bits = 4;
333 vlcp->tmp = 0xF;
334 vlcp->last_greater_than_8F = true;
335 }
336
338 static inline void
339 vlc_encode(vlc_struct_avx2* vlcp, ui32 cwd, int cwd_len)
340 {
341 vlcp->tmp |= (ui64)cwd << vlcp->used_bits;
342 vlcp->used_bits += cwd_len;
343
344 while (vlcp->used_bits >= 8) {
345 ui8 tmp;
346
347 if (unlikely(vlcp->last_greater_than_8F)) {
348 tmp = vlcp->tmp & 0x7F;
349
350 if (likely(tmp != 0x7F)) {
351 tmp = vlcp->tmp & 0xFF;
352 *(vlcp->buf - vlcp->pos) = tmp;
353 vlcp->last_greater_than_8F = tmp > 0x8F;
354 vlcp->tmp >>= 8;
355 vlcp->used_bits -= 8;
356 } else {
357 *(vlcp->buf - vlcp->pos) = tmp;
358 vlcp->last_greater_than_8F = false;
359 vlcp->tmp >>= 7;
360 vlcp->used_bits -= 7;
361 }
362
363 } else {
364 tmp = vlcp->tmp & 0xFF;
365 *(vlcp->buf - vlcp->pos) = tmp;
366 vlcp->last_greater_than_8F = tmp > 0x8F;
367 vlcp->tmp >>= 8;
368 vlcp->used_bits -= 8;
369 }
370
371 vlcp->pos++;
372 }
373 }
374
376 //
378 static inline void
379 terminate_mel_vlc(mel_struct* melp, vlc_struct_avx2* vlcp)
380 {
381 if (melp->run > 0)
382 mel_emit_bit(melp, 1);
383
384 if (vlcp->last_greater_than_8F && (vlcp->tmp & 0x7f) == 0x7f) {
385 *(vlcp->buf - vlcp->pos) = 0x7f;
386 vlcp->pos++;
387 vlcp->tmp >>= 7;
388 vlcp->used_bits -= 7;
389 }
390
391 melp->tmp = melp->tmp << melp->remaining_bits;
392 int mel_mask = (0xFF << melp->remaining_bits) & 0xFF;
393 int vlc_mask = 0xFF >> (8 - vlcp->used_bits);
394 if ((mel_mask | vlc_mask) == 0)
395 return; //last mel byte cannot be 0xFF, since then
396 //melp->remaining_bits would be < 8
397 if (melp->pos >= melp->buf_size)
398 OJPH_ERROR(0x00020003, "mel encoder's buffer is full");
399 ui8 vlcp_tmp = (ui8)vlcp->tmp;
400 int fuse = melp->tmp | vlcp_tmp;
401 if ( ( ((fuse ^ melp->tmp) & mel_mask)
402 | ((fuse ^ vlcp_tmp) & vlc_mask) ) == 0
403 && (fuse != 0xFF) && vlcp->pos > 1)
404 {
405 melp->buf[melp->pos++] = (ui8)fuse;
406 }
407 else
408 {
409 if (vlcp->pos >= vlcp->buf_size)
410 OJPH_ERROR(0x00020004, "vlc encoder's buffer is full");
411 melp->buf[melp->pos++] = (ui8)melp->tmp; //melp->tmp cannot be 0xFF
412 *(vlcp->buf - vlcp->pos) = (ui8)vlcp_tmp;
413 vlcp->pos++;
414 }
415 }
416
418//
420 struct ms_struct {
421 //storage
422 ui8* buf; //pointer to data buffer
423 ui32 pos; //position of next writing within buf
424 ui32 buf_size; //size of buffer, which we must not exceed
425
426 int max_bits; //maximum number of bits that can be store in tmp
427 int used_bits; //number of occupied bits in tmp
428 ui32 tmp; //temporary storage of coded bits
429 };
430
432 static inline void
433 ms_init(ms_struct* msp, ui32 buffer_size, ui8* data)
434 {
435 msp->buf = data;
436 msp->pos = 0;
437 msp->buf_size = buffer_size;
438 msp->max_bits = 8;
439 msp->used_bits = 0;
440 msp->tmp = 0;
441 }
442
444 static inline void
445 ms_encode(ms_struct* msp, ui64 cwd, int cwd_len)
446 {
447 while (cwd_len > 0)
448 {
449 if (msp->pos >= msp->buf_size)
450 OJPH_ERROR(0x00020005, "magnitude sign encoder's buffer is full");
451 int t = ojph_min(msp->max_bits - msp->used_bits, cwd_len);
452 msp->tmp |= ((ui32)(cwd & ((1U << t) - 1))) << msp->used_bits;
453 msp->used_bits += t;
454 cwd >>= t;
455 cwd_len -= t;
456 if (msp->used_bits >= msp->max_bits)
457 {
458 msp->buf[msp->pos++] = (ui8)msp->tmp;
459 msp->max_bits = (msp->tmp == 0xFF) ? 7 : 8;
460 msp->tmp = 0;
461 msp->used_bits = 0;
462 }
463 }
464 }
465
// Terminates the magnitude-sign stream held in *msp.
// NOTE(review): the declarator line between "static inline void" and "{"
// (listing line 468) is missing here; judging by the body it takes an
// ms_struct* named msp -- presumably ms_terminate(ms_struct* msp). Confirm
// against the original source.
467 static inline void
469 {
// A partially filled byte is pending.
470 if (msp->used_bits)
471 {
472 int t = msp->max_bits - msp->used_bits; //unused bits
// set all of the unused bits to 1
473 msp->tmp |= (0xFF & ((1U << t) - 1)) << msp->used_bits;
474 msp->used_bits += t;
// the padded byte is written only when it is not 0xFF
475 if (msp->tmp != 0xFF)
476 {
477 if (msp->pos >= msp->buf_size)
478 OJPH_ERROR(0x00020006, "magnitude sign encoder's buffer is full");
479 msp->buf[msp->pos++] = (ui8)msp->tmp;
480 }
481 }
// Nothing pending: max_bits == 7 is what ms_encode sets after writing an
// 0xFF byte, and in that case the last written byte is dropped.
482 else if (msp->max_bits == 7)
483 msp->pos--;
484 }
485
486#define ZERO _mm256_setzero_si256()
487#define ONE _mm256_set1_epi32(1)
488
489// https://stackoverflow.com/a/58827596
490inline __m256i avx2_lzcnt_epi32(__m256i v) {
491 // prevent value from being rounded up to the next power of two
492 v = _mm256_andnot_si256(_mm256_srli_epi32(v, 8), v); // keep 8 MSB
493
494 v = _mm256_castps_si256(_mm256_cvtepi32_ps(v)); // convert an integer to float
495 v = _mm256_srli_epi32(v, 23); // shift down the exponent
496 v = _mm256_subs_epu16(_mm256_set1_epi32(158), v); // undo bias
497 v = _mm256_min_epi16(v, _mm256_set1_epi32(32)); // clamp at 32
498
499 return v;
500}
501
502inline __m256i avx2_cmpneq_epi32(__m256i v, __m256i v2) {
503 return _mm256_xor_si256(_mm256_cmpeq_epi32(v, v2), _mm256_set1_epi32((int32_t)0xffffffff));
504}
505
506static void proc_pixel(__m256i *src_vec, ui32 p,
507 __m256i *eq_vec, __m256i *s_vec,
508 __m256i &rho_vec, __m256i &e_qmax_vec)
509{
510 __m256i val_vec[4];
511 __m256i _eq_vec[4];
512 __m256i _s_vec[4];
513 __m256i _rho_vec[4];
514
515 for (ui32 i = 0; i < 4; ++i) {
516 /* val = t + t; //multiply by 2 and get rid of sign */
517 val_vec[i] = _mm256_add_epi32(src_vec[i], src_vec[i]);
518
519 /* val >>= p; // 2 \mu_p + x */
520 val_vec[i] = _mm256_srli_epi32(val_vec[i], (int)p);
521
522 /* val &= ~1u; // 2 \mu_p */
523 val_vec[i] = _mm256_and_si256(val_vec[i], _mm256_set1_epi32((int)~1u));
524
525 /* if (val) { */
526 const __m256i val_notmask = avx2_cmpneq_epi32(val_vec[i], ZERO);
527
528 /* rho[i] = 1 << i;
529 * rho is processed below.
530 */
531
532 /* e_q[i] = 32 - (int)count_leading_ZEROs(--val); //2\mu_p - 1 */
533 val_vec[i] = _mm256_sub_epi32(val_vec[i], ONE);
534 _eq_vec[i] = avx2_lzcnt_epi32(val_vec[i]);
535 _eq_vec[i] = _mm256_sub_epi32(_mm256_set1_epi32(32), _eq_vec[i]);
536
537 /* e_qmax[i] = ojph_max(e_qmax[i], e_q[j]);
538 * e_qmax is processed below
539 */
540
541 /* s[0] = --val + (t >> 31); //v_n = 2(\mu_p-1) + s_n */
542 val_vec[i] = _mm256_sub_epi32(val_vec[i], ONE);
543 _s_vec[i] = _mm256_srli_epi32(src_vec[i], 31);
544 _s_vec[i] = _mm256_add_epi32(_s_vec[i], val_vec[i]);
545
546 _eq_vec[i] = _mm256_and_si256(_eq_vec[i], val_notmask);
547 _s_vec[i] = _mm256_and_si256(_s_vec[i], val_notmask);
548 val_vec[i] = _mm256_srli_epi32(val_notmask, 31);
549 /* } */
550 }
551
552 const __m256i idx = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0);
553
554 /* Reorder from
555 * *_vec[0]:[0, 0], [0, 1], [0, 2], [0, 3], [0, 4], [0, 5], [0, 6], [0, 7]
556 * *_vec[1]:[1, 0], [1, 1], [1, 2], [1, 3], [1, 4], [1, 5],.[1, 6], [1, 7]
557 * *_vec[2]:[0, 8], [0, 9], [0,10], [0,11], [0,12], [0,13], [0,14], [0,15]
558 * *_vec[3]:[1, 8], [1, 9], [1,10], [1,11], [1,12], [1,13], [1,14], [1,15]
559 * to
560 * *_vec[0]:[0, 0], [0, 2], [0, 4], [0, 6], [0, 8], [0,10], [0,12], [0,14]
561 * *_vec[1]:[1, 0], [1, 2], [1, 4], [1, 6], [1, 8], [1,10], [1,12], [1,14]
562 * *_vec[2]:[0, 1], [0, 3], [0, 5], [0, 7], [0, 9], [0,11], [0,13], [0,15]
563 * *_vec[3]:[1, 1], [1, 3], [1, 5], [1, 7], [1, 9], [1,11], [1,13], [1,15]
564 */
565 __m256i tmp1, tmp2;
566 for (ui32 i = 0; i < 2; ++i) {
567 tmp1 = _mm256_permutevar8x32_epi32(_eq_vec[0 + i], idx);
568 tmp2 = _mm256_permutevar8x32_epi32(_eq_vec[2 + i], idx);
569 eq_vec[0 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (0 << 0) + (2 << 4));
570 eq_vec[2 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (1 << 0) + (3 << 4));
571
572 tmp1 = _mm256_permutevar8x32_epi32(_s_vec[0 + i], idx);
573 tmp2 = _mm256_permutevar8x32_epi32(_s_vec[2 + i], idx);
574 s_vec[0 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (0 << 0) + (2 << 4));
575 s_vec[2 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (1 << 0) + (3 << 4));
576
577 tmp1 = _mm256_permutevar8x32_epi32(val_vec[0 + i], idx);
578 tmp2 = _mm256_permutevar8x32_epi32(val_vec[2 + i], idx);
579 _rho_vec[0 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (0 << 0) + (2 << 4));
580 _rho_vec[2 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (1 << 0) + (3 << 4));
581 }
582
583 e_qmax_vec = _mm256_max_epi32(eq_vec[0], eq_vec[1]);
584 e_qmax_vec = _mm256_max_epi32(e_qmax_vec, eq_vec[2]);
585 e_qmax_vec = _mm256_max_epi32(e_qmax_vec, eq_vec[3]);
586 _rho_vec[1] = _mm256_slli_epi32(_rho_vec[1], 1);
587 _rho_vec[2] = _mm256_slli_epi32(_rho_vec[2], 2);
588 _rho_vec[3] = _mm256_slli_epi32(_rho_vec[3], 3);
589 rho_vec = _mm256_or_si256(_rho_vec[0], _rho_vec[1]);
590 rho_vec = _mm256_or_si256(rho_vec, _rho_vec[2]);
591 rho_vec = _mm256_or_si256(rho_vec, _rho_vec[3]);
592}
593
594/* from [0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, ...]
595 * [0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, ...]
596 * [0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, ...]
597 * [0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, ...]
598 *
599 * to [0x00, 0x10, 0x20, 0x30, 0x01, 0x11, 0x21, 0x31,
600 * 0x02, 0x12, 0x22, 0x32, 0x03, 0x13, 0x23, 0x33]
601 *
602 * [0x04, 0x14, 0x24, 0x34, 0x05, 0x15, 0x25, 0x35,
603 * 0x06, 0x16, 0x26, 0x36, 0x07, 0x17, 0x27, 0x37]
604 *
605 * [..]
606 */
607static void rotate_matrix(__m256i *matrix)
608{
609 __m256i tmp1 = _mm256_unpacklo_epi32(matrix[0], matrix[1]);
610 __m256i tmp2 = _mm256_unpacklo_epi32(matrix[2], matrix[3]);
611 __m256i tmp3 = _mm256_unpackhi_epi32(matrix[0], matrix[1]);
612 __m256i tmp4 = _mm256_unpackhi_epi32(matrix[2], matrix[3]);
613
614 matrix[0] = _mm256_unpacklo_epi64(tmp1, tmp2);
615 matrix[1] = _mm256_unpacklo_epi64(tmp3, tmp4);
616 matrix[2] = _mm256_unpackhi_epi64(tmp1, tmp2);
617 matrix[3] = _mm256_unpackhi_epi64(tmp3, tmp4);
618
619 tmp1 = _mm256_permute2x128_si256(matrix[0], matrix[2], 0x20);
620 matrix[2] = _mm256_permute2x128_si256(matrix[0], matrix[2], 0x31);
621 matrix[0] = tmp1;
622
623 tmp1 = _mm256_permute2x128_si256(matrix[1], matrix[3], 0x20);
624 matrix[3] = _mm256_permute2x128_si256(matrix[1], matrix[3], 0x31);
625 matrix[1] = tmp1;
626}
627
628static void proc_ms_encode(ms_struct *msp,
629 __m256i &tuple_vec,
630 __m256i &uq_vec,
631 __m256i &rho_vec,
632 __m256i *s_vec)
633{
634 __m256i m_vec[4];
635
636 /* Prepare parameters for ms_encode */
637 /* m = (rho[i] & 1) ? Uq[i] - ((tuple[i] & 1) >> 0) : 0; */
638 auto tmp = _mm256_and_si256(tuple_vec, ONE);
639 tmp = _mm256_sub_epi32(uq_vec, tmp);
640 auto tmp1 = _mm256_and_si256(rho_vec, ONE);
641 auto mask = avx2_cmpneq_epi32(tmp1, ZERO);
642 m_vec[0] = _mm256_and_si256(mask, tmp);
643
644 /* m = (rho[i] & 2) ? Uq[i] - ((tuple[i] & 2) >> 1) : 0; */
645 tmp = _mm256_and_si256(tuple_vec, _mm256_set1_epi32(2));
646 tmp = _mm256_srli_epi32(tmp, 1);
647 tmp = _mm256_sub_epi32(uq_vec, tmp);
648 tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(2));
649 mask = avx2_cmpneq_epi32(tmp1, ZERO);
650 m_vec[1] = _mm256_and_si256(mask, tmp);
651
652 /* m = (rho[i] & 4) ? Uq[i] - ((tuple[i] & 4) >> 2) : 0; */
653 tmp = _mm256_and_si256(tuple_vec, _mm256_set1_epi32(4));
654 tmp = _mm256_srli_epi32(tmp, 2);
655 tmp = _mm256_sub_epi32(uq_vec, tmp);
656 tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(4));
657 mask = avx2_cmpneq_epi32(tmp1, ZERO);
658 m_vec[2] = _mm256_and_si256(mask, tmp);
659
660 /* m = (rho[i] & 8) ? Uq[i] - ((tuple[i] & 8) >> 3) : 0; */
661 tmp = _mm256_and_si256(tuple_vec, _mm256_set1_epi32(8));
662 tmp = _mm256_srli_epi32(tmp, 3);
663 tmp = _mm256_sub_epi32(uq_vec, tmp);
664 tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(8));
665 mask = avx2_cmpneq_epi32(tmp1, ZERO);
666 m_vec[3] = _mm256_and_si256(mask, tmp);
667
668 rotate_matrix(m_vec);
669 /* s_vec from
670 * s_vec[0]:[0, 0], [0, 2] ... [0,14], [0, 16], [0, 18] ... [0,30]
671 * s_vec[1]:[1, 0], [1, 2] ... [1,14], [1, 16], [1, 18] ... [1,30]
672 * s_vec[2]:[0, 1], [0, 3] ... [0,15], [0, 17], [0, 19] ... [0,31]
673 * s_vec[3]:[1, 1], [1, 3] ... [1,15], [1, 17], [1, 19] ... [1,31]
674 * to
675 * s_vec[0]:[0, 0], [1, 0], [0, 1], [1, 1], [0, 2], [1, 2]...[0, 7], [1, 7]
676 * s_vec[1]:[0, 8], [1, 8], [0, 9], [1, 9], [0,10], [1,10]...[0,15], [1,15]
677 * s_vec[2]:[0,16], [1,16], [0,17], [1,17], [0,18], [1,18]...[0,23], [1,23]
678 * s_vec[3]:[0,24], [1,24], [0,25], [1,25], [0,26], [1,26]...[0,31], [1,31]
679 */
680 rotate_matrix(s_vec);
681
682 ui32 cwd[8];
683 int cwd_len[8];
684 ui64 _cwd = 0;
685 int _cwd_len = 0;
686
687 /* Each iteration process 8 bytes * 2 lines */
688 for (ui32 i = 0; i < 4; ++i) {
689 /* cwd = s[i * 4 + 0] & ((1U << m) - 1)
690 * cwd_len = m
691 */
692 _mm256_storeu_si256((__m256i *)cwd_len, m_vec[i]);
693 tmp = _mm256_sllv_epi32(ONE, m_vec[i]);
694 tmp = _mm256_sub_epi32(tmp, ONE);
695 tmp = _mm256_and_si256(tmp, s_vec[i]);
696 _mm256_storeu_si256((__m256i*)cwd, tmp);
697
698 for (ui32 j = 0; j < 4; ++j) {
699 ui32 idx = j * 2;
700 _cwd = cwd[idx];
701 _cwd_len = cwd_len[idx];
702 _cwd |= ((ui64)cwd[idx + 1]) << _cwd_len;
703 _cwd_len += cwd_len[idx + 1];
704 ms_encode(msp, _cwd, _cwd_len);
705 }
706 }
707}
708
709static __m256i cal_eps_vec(__m256i *eq_vec, __m256i &u_q_vec,
710 __m256i &e_qmax_vec)
711{
712 /* if (u_q[i] > 0) {
713 * eps[i] |= (e_q[i * 4 + 0] == e_qmax[i]);
714 * eps[i] |= (e_q[i * 4 + 1] == e_qmax[i]) << 1;
715 * eps[i] |= (e_q[i * 4 + 2] == e_qmax[i]) << 2;
716 * eps[i] |= (e_q[i * 4 + 3] == e_qmax[i]) << 3;
717 * }
718 */
719 auto u_q_mask = _mm256_cmpgt_epi32(u_q_vec, ZERO);
720
721 auto mask = _mm256_cmpeq_epi32(eq_vec[0], e_qmax_vec);
722 auto eps_vec = _mm256_srli_epi32(mask, 31);
723
724 mask = _mm256_cmpeq_epi32(eq_vec[1], e_qmax_vec);
725 auto tmp = _mm256_srli_epi32(mask, 31);
726 tmp = _mm256_slli_epi32(tmp, 1);
727 eps_vec = _mm256_or_si256(eps_vec, tmp);
728
729 mask = _mm256_cmpeq_epi32(eq_vec[2], e_qmax_vec);
730 tmp = _mm256_srli_epi32(mask, 31);
731 tmp = _mm256_slli_epi32(tmp, 2);
732 eps_vec = _mm256_or_si256(eps_vec, tmp);
733
734 mask = _mm256_cmpeq_epi32(eq_vec[3], e_qmax_vec);
735 tmp = _mm256_srli_epi32(mask, 31);
736 tmp = _mm256_slli_epi32(tmp, 3);
737 eps_vec = _mm256_or_si256(eps_vec, tmp);
738
739 return _mm256_and_si256(u_q_mask, eps_vec);
740}
741
742static void update_lep(ui32 x, __m256i &prev_e_val_vec,
743 __m256i *eq_vec, __m256i *e_val_vec,
744 const __m256i left_shift)
745{
746 /* lep[0] = ojph_max(lep[0], (ui8)e_q[1]); lep++;
747 * lep[0] = (ui8)e_q[3];
748 * Compare e_q[1] with e_q[3] of the prevous round.
749 */
750 auto tmp = _mm256_permutevar8x32_epi32(eq_vec[3], left_shift);
751 tmp = _mm256_insert_epi32(tmp, _mm_cvtsi128_si32(_mm256_castsi256_si128(prev_e_val_vec)), 0);
752 prev_e_val_vec = _mm256_insert_epi32(ZERO, _mm256_extract_epi32(eq_vec[3], 7), 0);
753 e_val_vec[x] = _mm256_max_epi32(eq_vec[1], tmp);
754}
755
756
757static void update_lcxp(ui32 x, __m256i &prev_cx_val_vec,
758 __m256i &rho_vec, __m256i *cx_val_vec,
759 const __m256i left_shift)
760{
761 /* lcxp[0] = (ui8)(lcxp[0] | (ui8)((rho[0] & 2) >> 1)); lcxp++;
762 * lcxp[0] = (ui8)((rho[0] & 8) >> 3);
763 * Or (rho[0] & 2) and (rho[0] of the previous round & 8).
764 */
765 auto tmp = _mm256_permutevar8x32_epi32(rho_vec, left_shift);
766 tmp = _mm256_insert_epi32(tmp, _mm_cvtsi128_si32(_mm256_castsi256_si128(prev_cx_val_vec)), 0);
767 prev_cx_val_vec = _mm256_insert_epi32(ZERO, _mm256_extract_epi32(rho_vec, 7), 0);
768
769 tmp = _mm256_and_si256(tmp, _mm256_set1_epi32(8));
770 tmp = _mm256_srli_epi32(tmp, 3);
771
772 auto tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(2));
773 tmp1 = _mm256_srli_epi32(tmp1, 1);
774 cx_val_vec[x] = _mm256_or_si256(tmp, tmp1);
775}
776
777static __m256i cal_tuple(__m256i &cq_vec, __m256i &rho_vec,
778 __m256i &eps_vec, ui32 *vlc_tbl)
779{
780 /* tuple[i] = vlc_tbl1[(c_q[i] << 8) + (rho[i] << 4) + eps[i]]; */
781 auto tmp = _mm256_slli_epi32(cq_vec, 8);
782 auto tmp1 = _mm256_slli_epi32(rho_vec, 4);
783 tmp = _mm256_add_epi32(tmp, tmp1);
784 tmp = _mm256_add_epi32(tmp, eps_vec);
785 return _mm256_i32gather_epi32((const int *)vlc_tbl, tmp, 4);
786}
787
788static __m256i proc_cq1(ui32 x, __m256i *cx_val_vec, __m256i &rho_vec,
789 const __m256i right_shift)
790{
791 ojph_unused(x);
792 ojph_unused(cx_val_vec);
793 ojph_unused(right_shift);
794
795 /* c_q[i + 1] = (rho[i] >> 1) | (rho[i] & 1); */
796 auto tmp = _mm256_srli_epi32(rho_vec, 1);
797 auto tmp1 = _mm256_and_si256(rho_vec, ONE);
798 return _mm256_or_si256(tmp, tmp1);
799}
800
801static __m256i proc_cq2(ui32 x, __m256i *cx_val_vec, __m256i &rho_vec,
802 const __m256i right_shift)
803{
804 // c_q[i + 1] = (lcxp[i + 1] + (lcxp[i + 2] << 2))
805 // | (((rho[i] & 4) >> 1) | ((rho[i] & 8) >> 2));
806 auto lcxp1_vec = _mm256_permutevar8x32_epi32(cx_val_vec[x], right_shift);
807 auto tmp = _mm256_permutevar8x32_epi32(lcxp1_vec, right_shift);
808
809#ifdef OJPH_ARCH_X86_64
810 tmp = _mm256_insert_epi64(tmp,
811 _mm_cvtsi128_si64(_mm256_castsi256_si128(cx_val_vec[x + 1])), 3);
812#elif (defined OJPH_ARCH_I386)
813 int lsb = _mm_cvtsi128_si32(_mm256_castsi256_si128(cx_val_vec[x + 1]));
814 tmp = _mm256_insert_epi32(tmp, lsb, 6);
815 int msb = _mm_extract_epi32(_mm256_castsi256_si128(cx_val_vec[x + 1]), 1);
816 tmp = _mm256_insert_epi32(tmp, msb, 7);
817#else
818 #error Error unsupport compiler
819#endif
820 tmp = _mm256_slli_epi32(tmp, 2);
821 auto tmp1 = _mm256_insert_epi32(lcxp1_vec,
822 _mm_cvtsi128_si32(_mm256_castsi256_si128(cx_val_vec[x + 1])), 7);
823 tmp = _mm256_add_epi32(tmp1, tmp);
824
825 tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(4));
826 tmp1 = _mm256_srli_epi32(tmp1, 1);
827 tmp = _mm256_or_si256(tmp, tmp1);
828
829 tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(8));
830 tmp1 = _mm256_srli_epi32(tmp1, 2);
831
832 return _mm256_or_si256(tmp, tmp1);
833}
834
835using fn_proc_cq = __m256i (*)(ui32, __m256i *, __m256i &, const __m256i);
836
837static void proc_mel_encode1(mel_struct *melp, __m256i &cq_vec,
838 __m256i &rho_vec, __m256i u_q_vec, ui32 ignore,
839 const __m256i right_shift)
840{
841 int32_t mel_need_encode[8];
842 int32_t mel_need_encode2[8];
843 int32_t mel_bit[8];
844 int32_t mel_bit2[8];
845 /* Prepare mel_encode params */
846 /* if (c_q[i] == 0) { */
847 _mm256_storeu_si256((__m256i *)mel_need_encode, _mm256_cmpeq_epi32(cq_vec, ZERO));
848 /* mel_encode(&mel, rho[i] != 0); */
849 _mm256_storeu_si256((__m256i*)mel_bit, _mm256_srli_epi32(avx2_cmpneq_epi32(rho_vec, ZERO), 31));
850 /* } */
851
852 /* mel_encode(&mel, ojph_min(u_q[i], u_q[i + 1]) > 2); */
853 auto tmp = _mm256_permutevar8x32_epi32(u_q_vec, right_shift);
854 auto tmp1 = _mm256_min_epi32(u_q_vec, tmp);
855 _mm256_storeu_si256((__m256i*)mel_bit2, _mm256_srli_epi32(_mm256_cmpgt_epi32(tmp1, _mm256_set1_epi32(2)), 31));
856
857 /* if (u_q[i] > 0 && u_q[i + 1] > 0) { } */
858 auto need_encode2 = _mm256_cmpgt_epi32(u_q_vec, ZERO);
859 _mm256_storeu_si256((__m256i*)mel_need_encode2, _mm256_and_si256(need_encode2, _mm256_cmpgt_epi32(tmp, ZERO)));
860
861 ui32 i_max = 8 - (ignore / 2);
862
863 for (ui32 i = 0; i < i_max; i += 2) {
864 if (mel_need_encode[i]) {
865 mel_encode(melp, mel_bit[i]);
866 }
867
868 if (i + 1 < i_max) {
869 if (mel_need_encode[i + 1]) {
870 mel_encode(melp, mel_bit[i + 1]);
871 }
872 }
873
874 if (mel_need_encode2[i]) {
875 mel_encode(melp, mel_bit2[i]);
876 }
877 }
878}
879
880static void proc_mel_encode2(mel_struct *melp, __m256i &cq_vec,
881 __m256i &rho_vec, __m256i u_q_vec, ui32 ignore,
882 const __m256i right_shift)
883{
884 ojph_unused(u_q_vec);
885 ojph_unused(right_shift);
886 int32_t mel_need_encode[8];
887 int32_t mel_bit[8];
888
889 /* Prepare mel_encode params */
890 /* if (c_q[i] == 0) { */
891 _mm256_storeu_si256((__m256i*)mel_need_encode, _mm256_cmpeq_epi32(cq_vec, ZERO));
892 /* mel_encode(&mel, rho[i] != 0); */
893 _mm256_storeu_si256((__m256i*)mel_bit, _mm256_srli_epi32(avx2_cmpneq_epi32(rho_vec, ZERO), 31));
894 /* } */
895
896 ui32 i_max = 8 - (ignore / 2);
897
898 for (ui32 i = 0; i < i_max; ++i) {
899 if (mel_need_encode[i]) {
900 mel_encode(melp, mel_bit[i]);
901 }
902 }
903}
904
905using fn_proc_mel_encode = void (*)(mel_struct *, __m256i &, __m256i &,
906 __m256i, ui32, const __m256i);
907
908static void proc_vlc_encode1(vlc_struct_avx2 *vlcp, ui32 *tuple,
909 ui32 *u_q, ui32 ignore)
910{
911 ui32 i_max = 8 - (ignore / 2);
912
913 for (ui32 i = 0; i < i_max; i += 2) {
914 /* 7 bits */
915 ui32 val = tuple[i + 0] >> 4;
916 int size = tuple[i + 0] & 7;
917
918 if (i + 1 < i_max) {
919 /* 7 bits */
920 val |= (tuple[i + 1] >> 4) << size;
921 size += tuple[i + 1] & 7;
922 }
923
924 if (u_q[i] > 2 && u_q[i + 1] > 2) {
925 /* 3 bits */
926 val |= (ulvc_cwd_pre[u_q[i] - 2]) << size;
927 size += ulvc_cwd_pre_len[u_q[i] - 2];
928
929 /* 3 bits */
930 val |= (ulvc_cwd_pre[u_q[i + 1] - 2]) << size;
931 size += ulvc_cwd_pre_len[u_q[i + 1] - 2];
932
933 /* 5 bits */
934 val |= (ulvc_cwd_suf[u_q[i] - 2]) << size;
935 size += ulvc_cwd_suf_len[u_q[i] - 2];
936
937 /* 5 bits */
938 val |= (ulvc_cwd_suf[u_q[i + 1] - 2]) << size;
939 size += ulvc_cwd_suf_len[u_q[i + 1] - 2];
940
941 } else if (u_q[i] > 2 && u_q[i + 1] > 0) {
942 /* 3 bits */
943 val |= (ulvc_cwd_pre[u_q[i]]) << size;
944 size += ulvc_cwd_pre_len[u_q[i]];
945
946 /* 1 bit */
947 val |= (u_q[i + 1] - 1) << size;
948 size += 1;
949
950 /* 5 bits */
951 val |= (ulvc_cwd_suf[u_q[i]]) << size;
952 size += ulvc_cwd_suf_len[u_q[i]];
953
954 } else {
955 /* 3 bits */
956 val |= (ulvc_cwd_pre[u_q[i]]) << size;
957 size += ulvc_cwd_pre_len[u_q[i]];
958
959 /* 3 bits */
960 val |= (ulvc_cwd_pre[u_q[i + 1]]) << size;
961 size += ulvc_cwd_pre_len[u_q[i + 1]];
962
963 /* 5 bits */
964 val |= (ulvc_cwd_suf[u_q[i]]) << size;
965 size += ulvc_cwd_suf_len[u_q[i]];
966
967 /* 5 bits */
968 val |= (ulvc_cwd_suf[u_q[i + 1]]) << size;
969 size += ulvc_cwd_suf_len[u_q[i + 1]];
970 }
971
972 vlc_encode(vlcp, val, size);
973 }
974}
975
976static void proc_vlc_encode2(vlc_struct_avx2 *vlcp, ui32 *tuple,
977 ui32 *u_q, ui32 ignore)
978{
979 ui32 i_max = 8 - (ignore / 2);
980
981 for (ui32 i = 0; i < i_max; i += 2) {
982 /* 7 bits */
983 ui32 val = tuple[i + 0] >> 4;
984 int size = tuple[i + 0] & 7;
985
986 if (i + 1 < i_max) {
987 /* 7 bits */
988 val |= (tuple[i + 1] >> 4) << size;
989 size += tuple[i + 1] & 7;
990 }
991
992 /* 3 bits */
993 val |= ulvc_cwd_pre[u_q[i]] << size;
994 size += ulvc_cwd_pre_len[u_q[i]];
995
996 /* 3 bits */
997 val |= (ulvc_cwd_pre[u_q[i + 1]]) << size;
998 size += ulvc_cwd_pre_len[u_q[i + 1]];
999
1000 /* 5 bits */
1001 val |= (ulvc_cwd_suf[u_q[i + 0]]) << size;
1002 size += ulvc_cwd_suf_len[u_q[i + 0]];
1003
1004 /* 5 bits */
1005 val |= (ulvc_cwd_suf[u_q[i + 1]]) << size;
1006 size += ulvc_cwd_suf_len[u_q[i + 1]];
1007
1008 vlc_encode(vlcp, val, size);
1009 }
1010}
1011
1012using fn_proc_vlc_encode = void (*)(vlc_struct_avx2 *, ui32 *, ui32 *, ui32);
1013
// Encodes one codeblock with the AVX2 code path and stores the coded bytes
// in a buffer obtained from `elastic`.
//
//   buf          - source samples, read as rows of ui32
//   missing_msbs - only used to derive p = 30 - missing_msbs, which is
//                  passed to proc_pixel
//   num_passes   - currently unused
//   _width       - codeblock width in samples; rounded up to a multiple of
//                  16 internally, the padding columns are loaded as zero
//   height       - number of rows; rows are consumed two at a time and a
//                  missing second row is replaced by zeros
//   stride       - distance, in ui32 elements, between consecutive rows
//   lengths      - lengths[0] receives the total number of coded bytes
//   elastic      - allocator asked for a buffer of lengths[0] bytes
//   coded        - receives that buffer; it is filled with the ms bytes,
//                  then the mel bytes, then the vlc bytes
//
// NOTE(review): e_val_vec and cx_val_vec have 65 entries and entry
// [n_loop] is written, so this code relies on _width <= 1024; no check is
// made here - confirm that callers guarantee it.
1014void ojph_encode_codeblock_avx2(ui32* buf, ui32 missing_msbs,
1015 ui32 num_passes, ui32 _width, ui32 height,
1016 ui32 stride, ui32* lengths,
1017 ojph::mem_elastic_allocator *elastic,
1018 ojph::coded_lists *& coded)
1019{
1020 ojph_unused(num_passes); //currently not used
1021
// width is _width rounded up to a multiple of 16; ignore is the number of
// padding columns added by that rounding (0..15).
1022 ui32 width = (_width + 15) & ~15u;
1023 ui32 ignore = width - _width;
1024 const int ms_size = (16384 * 16 + 14) / 15; //more than enough
1025 const int mel_vlc_size = 3072; //more than enough
1026 const int mel_size = 192;
1027 const int vlc_size = mel_vlc_size - mel_size;
1028
// mel and vlc share one array: the first mel_size bytes are handed to
// mel_init, the remaining vlc_size bytes to vlc_init.
1029 ui8 ms_buf[ms_size];
1030 ui8 mel_vlc_buf[mel_vlc_size];
1031 ui8 *mel_buf = mel_vlc_buf;
1032 ui8 *vlc_buf = mel_vlc_buf + mel_size;
1033
1034 mel_struct mel;
1035 mel_init(&mel, mel_size, mel_buf);
1036 vlc_struct_avx2 vlc;
1037 vlc_init(&vlc, vlc_size, vlc_buf);
1038 ms_struct ms;
1039 ms_init(&ms, ms_size, ms_buf);
1040
1041 const ui32 p = 30 - missing_msbs;
1042
// NOTE(review): the description below talks about bytes; in this function
// the values are held in the 32-bit lanes of e_val_vec / cx_val_vec -
// presumably the text was carried over from a byte-array version.
1043 //e_val: E values for a line (these are the highest set bit)
1044 //cx_val: is the context values
1045 //Each byte stores the info for the 2 sample. For E, it is maximum
1046 // of the two samples, while for cx, it is the OR of these two samples.
1047 //The maximum is between the pixel at the bottom left of one quad
1048 // and the bottom right of the earlier quad. The same is true for cx.
1049 //For a 1024 pixels, we need 512 bytes, the 2 extra,
1050 // one for the non-existing earlier quad, and one for beyond the
1051 // the end
// Index vector for _mm256_permutevar8x32_epi32: lane i of the result takes
// lane (i + 1) % 8 of the source.
1052 const __m256i right_shift = _mm256_set_epi32(
1053 0, 7, 6, 5, 4, 3, 2, 1
1054 );
1055
// Index vector for _mm256_permutevar8x32_epi32: lane i of the result takes
// lane (i + 7) % 8 of the source.
1056 const __m256i left_shift = _mm256_set_epi32(
1057 6, 5, 4, 3, 2, 1, 0, 7
1058 );
1059
// Number of 16-column groups per row (width is already a multiple of 16).
1060 ui32 n_loop = (width + 15) / 16;
1061
// e_val_vec[x] is read in the group loop before update_lep is called for
// that x, so the entries used by the first row pair are cleared here.
1062 __m256i e_val_vec[65];
1063 for (ui32 i = 0; i <ojph_min(64, n_loop); ++i) {
1064 e_val_vec[i] = ZERO;
1065 }
1066 __m256i prev_e_val_vec = ZERO;
1067
// NOTE(review): cx_val_vec[0 .. n_loop-1] is not cleared here; presumably
// proc_cq1 (used for the first row pair) does not read it - confirm.
1068 __m256i cx_val_vec[65];
1069 __m256i prev_cx_val_vec = ZERO;
1070
1071 ui32 prev_cq = 0;
1072
1073 __m256i eq_vec[4];
1074 __m256i s_vec[4];
1075 __m256i src_vec[4];
1076
// The first row pair uses vlc_tbl0 and the "...1" helpers; all of these are
// switched to vlc_tbl1 and the "...2" helpers at the end of the first
// iteration of the row loop.
1077 ui32 *vlc_tbl = vlc_tbl0;
1078 fn_proc_cq proc_cq = proc_cq1;
1079 fn_proc_mel_encode proc_mel_encode = proc_mel_encode1;
1080 fn_proc_vlc_encode proc_vlc_encode = proc_vlc_encode1;
1081
1082 /* 2 lines per iteration */
1083 for (ui32 y = 0; y < height; y += 2)
1084 {
// Entry [n_loop] (one past the last group) receives the values carried
// over in prev_e_val_vec / prev_cx_val_vec, which are then reset.
1085 e_val_vec[n_loop] = prev_e_val_vec;
1086 /* lcxp[0] = (ui8)((rho[0] & 8) >> 3); */
1087 __m256i tmp = _mm256_and_si256(prev_cx_val_vec, _mm256_set1_epi32(8));
1088 cx_val_vec[n_loop] = _mm256_srli_epi32(tmp, 3);
1089
1090 prev_e_val_vec = ZERO;
1091 prev_cx_val_vec = ZERO;
1092
1093 ui32 *sp = buf + y * stride;
1094
1095 /* 16 samples (ui32) per row per iteration */
1096 for (ui32 x = 0; x < n_loop; ++x) {
1097
1098 /* t = sp[i]; */
// Last group of a row whose width is not a multiple of 16: copy only the
// valid samples into a zeroed 16-entry buffer, so the padding columns read
// as zero and nothing beyond the valid samples is read from buf.
// sp is not advanced in this branch; it is only taken for the last group.
1099 if ((x == (n_loop - 1)) && (_width % 16)) {
1100 ui32 tmp_buf[16] = { 0 };
1101 memcpy(tmp_buf, sp, (_width % 16) * sizeof(ui32));
1102 src_vec[0] = _mm256_loadu_si256((__m256i*)(tmp_buf));
1103 src_vec[2] = _mm256_loadu_si256((__m256i*)(tmp_buf + 8));
1104 if (y + 1 < height) {
// Overwrites the same leading entries; the rest are still zero.
1105 memcpy(tmp_buf, sp + stride, (_width % 16) * sizeof(ui32));
1106 src_vec[1] = _mm256_loadu_si256((__m256i*)(tmp_buf));
1107 src_vec[3] = _mm256_loadu_si256((__m256i*)(tmp_buf + 8));
1108 }
1109 else {
// Odd height: the missing second row is treated as all zeros.
1110 src_vec[1] = ZERO;
1111 src_vec[3] = ZERO;
1112 }
1113 }
1114 else {
1115 src_vec[0] = _mm256_loadu_si256((__m256i*)(sp));
1116 src_vec[2] = _mm256_loadu_si256((__m256i*)(sp + 8));
1117
1118 if (y + 1 < height) {
1119 src_vec[1] = _mm256_loadu_si256((__m256i*)(sp + stride));
1120 src_vec[3] = _mm256_loadu_si256((__m256i*)(sp + 8 + stride));
1121 }
1122 else {
// Odd height: the missing second row is treated as all zeros.
1123 src_vec[1] = ZERO;
1124 src_vec[3] = ZERO;
1125 }
1126 sp += 16;
1127 }
1128
1129 /* src_vec layout:
1130 * src_vec[0]:[0, 0],[0, 1],[0, 2],[0, 3],[0, 4],[0, 5],.[0, 6],.[0, 7]
1131 * src_vec[1]:[1, 0],[1, 1],[1, 2],[1, 3],[1, 4],[1, 5],.[1, 6],.[1, 7]
1132 * src_vec[2]:[0, 8],[0, 9],[0,10],[0,11],[0,12],[0,13],.[0,14], [0,15]
1133 * src_vec[3]:[1, 8],[1, 9],[1,10],[1,11],[1,12],[1,13],.[1,14], [1,15]
1134 */
1135 __m256i rho_vec, e_qmax_vec;
1136 proc_pixel(src_vec, p, eq_vec, s_vec, rho_vec, e_qmax_vec);
1137
1138 // max_e[(i + 1) % num] = ojph_max(lep[i + 1], lep[i + 2]) - 1;
// tmp = e_val_vec[x] moved down by one lane, with lane 7 filled from lane 0
// of the next group's vector, i.e. each lane sees its right-hand neighbour.
1139 tmp = _mm256_permutevar8x32_epi32(e_val_vec[x], right_shift);
1140 tmp = _mm256_insert_epi32(tmp, _mm_cvtsi128_si32(_mm256_castsi256_si128(e_val_vec[x + 1])), 7);
1141
1142 auto max_e_vec = _mm256_max_epi32(tmp, e_val_vec[x]);
1143 max_e_vec = _mm256_sub_epi32(max_e_vec, ONE);
1144
1145 // kappa[i] = (rho[i] & (rho[i] - 1)) ? ojph_max(1, max_e[i]) : 1;
1146 tmp = _mm256_max_epi32(max_e_vec, ONE);
1147 __m256i tmp1 = _mm256_sub_epi32(rho_vec, ONE);
1148 tmp1 = _mm256_and_si256(rho_vec, tmp1);
1149
// cmp is all-ones in lanes where rho & (rho - 1) == 0. Those lanes get ONE,
// the other lanes get max(max_e, ONE); each lane is non-zero in only one of
// the two masked vectors, so the final max simply merges them.
1150 auto cmp = _mm256_cmpeq_epi32(tmp1, ZERO);
1151 auto kappa_vec1_ = _mm256_and_si256(cmp, ONE);
1152 auto kappa_vec2_ = _mm256_and_si256(_mm256_xor_si256(cmp, _mm256_set1_epi32((int32_t)0xffffffff)), tmp);
1153 const __m256i kappa_vec = _mm256_max_epi32(kappa_vec1_, kappa_vec2_);
1154
1155 /* cq[1 - 16] = cq_vec
1156 * cq[0] = prev_cq_vec[0]
1157 */
1158 tmp = proc_cq(x, cx_val_vec, rho_vec, right_shift);
1159
// cq_vec is tmp moved up by one lane with prev_cq placed in lane 0; lane 7
// of tmp is kept in prev_cq for the next group.
1160 auto cq_vec = _mm256_permutevar8x32_epi32(tmp, left_shift);
1161 cq_vec = _mm256_insert_epi32(cq_vec, prev_cq, 0);
1162 prev_cq = (ui32)_mm256_extract_epi32(tmp, 7);
1163
// These run after e_val_vec[x] / cx_val_vec have been consumed above for
// the current group.
1164 update_lep(x, prev_e_val_vec, eq_vec, e_val_vec, left_shift);
1165 update_lcxp(x, prev_cx_val_vec, rho_vec, cx_val_vec, left_shift);
1166
1167 /* Uq[i] = ojph_max(e_qmax[i], kappa[i]); */
1168 /* u_q[i] = Uq[i] - kappa[i]; */
1169 auto uq_vec = _mm256_max_epi32(kappa_vec, e_qmax_vec);
1170 auto u_q_vec = _mm256_sub_epi32(uq_vec, kappa_vec);
1171
1172 auto eps_vec = cal_eps_vec(eq_vec, u_q_vec, e_qmax_vec);
1173 __m256i tuple_vec = cal_tuple(cq_vec, rho_vec, eps_vec, vlc_tbl);
// Only the last group of a row contains padding columns.
1174 ui32 _ignore = ((n_loop - 1) == x) ? ignore : 0;
1175
1176 proc_mel_encode(&mel, cq_vec, rho_vec, u_q_vec, _ignore,
1177 right_shift);
1178
1179 proc_ms_encode(&ms, tuple_vec, uq_vec, rho_vec, s_vec);
1180
1181 // vlc_encode(&vlc, tuple[i*2+0] >> 8, (tuple[i*2+0] >> 4) & 7);
1182 // vlc_encode(&vlc, tuple[i*2+1] >> 8, (tuple[i*2+1] >> 4) & 7);
1183 ui32 u_q[8];
1184 ui32 tuple[8];
1185 /* The tuple is scaled by 4 due to:
1186 * vlc_encode(&vlc, tuple0 >> 8, (tuple0 >> 4) & 7, true);
1187 * So in the vlc_encode, the tuple will only be scaled by 2.
1188 */
// After this shift by 4, proc_vlc_encode2 reads the codeword as
// tuple[i] >> 4 and its length as tuple[i] & 7, which corresponds to the
// ">> 8" and "(>> 4) & 7" of the scalar calls quoted above.
1189 tuple_vec = _mm256_srli_epi32(tuple_vec, 4);
1190 _mm256_storeu_si256((__m256i*)tuple, tuple_vec);
1191 _mm256_storeu_si256((__m256i*)u_q, u_q_vec);
1192
1193 proc_vlc_encode(&vlc, tuple, u_q, _ignore);
1194 }
1195
// Seed prev_cq for the next row pair from the first group's context values:
// lane 0 of (cx_val_vec[0] + (cx_val_vec[0] moved down one lane << 2)),
// i.e. lane 0 plus 4 times lane 1.
1196 tmp = _mm256_permutevar8x32_epi32(cx_val_vec[0], right_shift);
1197 tmp = _mm256_slli_epi32(tmp, 2);
1198 tmp = _mm256_add_epi32(tmp, cx_val_vec[0]);
1199 prev_cq = (ui32)_mm_cvtsi128_si32(_mm256_castsi256_si128(tmp));
1200
// Every row pair after the first uses the second table and helper set.
1201 proc_cq = proc_cq2;
1202 vlc_tbl = vlc_tbl1;
1203 proc_mel_encode = proc_mel_encode2;
1204 proc_vlc_encode = proc_vlc_encode2;
1205 }
1206
1207 ms_terminate(&ms);
1208 terminate_mel_vlc(&mel, &vlc);
1209
1210 //copy to elastic
// Output layout: ms bytes, then mel bytes, then vlc bytes.
// The vlc bytes are taken from vlc.buf - vlc.pos + 1; presumably the vlc
// stream is written backwards from vlc.buf - confirm against vlc_encode.
1211 lengths[0] = mel.pos + vlc.pos + ms.pos;
1212 elastic->get_buffer(mel.pos + vlc.pos + ms.pos, coded);
1213 memcpy(coded->buf, ms.buf, ms.pos);
1214 memcpy(coded->buf + ms.pos, mel.buf, mel.pos);
1215 memcpy(coded->buf + ms.pos + mel.pos, vlc.buf - vlc.pos + 1, vlc.pos);
1216
1217 // put in the interface locator word
// num_bytes (mel + vlc length) is split over the last two bytes: the last
// byte gets num_bytes >> 4, and the low nibble of the second-to-last byte
// is replaced by num_bytes & 0xF while its high nibble is preserved.
1218 ui32 num_bytes = mel.pos + vlc.pos;
1219 coded->buf[lengths[0]-1] = (ui8)(num_bytes >> 4);
1220 coded->buf[lengths[0]-2] = coded->buf[lengths[0]-2] & 0xF0;
1221 coded->buf[lengths[0]-2] =
1222 (ui8)(coded->buf[lengths[0]-2] | (num_bytes & 0xF));
1223
// Account for the bytes just written into the obtained buffer.
1224 coded->avail_size -= lengths[0];
1225}
1226
1227} /* namespace local */
1228} /* namespace ojph */
1229
1230#endif
void get_buffer(ui32 needed_bytes, coded_lists *&p)
Definition ojph_mem.cpp:113
static bool uvlc_init_tables()
Initializes uvlc_tbl0 and uvlc_tbl1 tables.
static bool vlc_init_tables()
Initializes vlc_tbl0 and vlc_tbl1 tables, from table0.h and table1.h.
ui16 vlc_tbl1[1024]
vlc_tbl1 contains decoding information for non-initial row of quads
ui16 vlc_tbl0[1024]
vlc_tbl0 contains decoding information for initial row of quads
static void ms_terminate(ms_struct *msp)
static void vlc_encode(vlc_struct *vlcp, int cwd, int cwd_len)
static void terminate_mel_vlc(mel_struct *melp, vlc_struct *vlcp)
void ojph_encode_codeblock_avx2(ui32 *buf, ui32 missing_msbs, ui32 num_passes, ui32 width, ui32 height, ui32 stride, ui32 *lengths, ojph::mem_elastic_allocator *elastic, ojph::coded_lists *&coded)
static void mel_init(dec_mel_st *melp, ui8 *bbuf, int lcup, int scup)
Initiates a dec_mel_st structure for MEL decoding and reads some bytes in order to get the read addre...
static void ms_init(ms_struct *msp, ui32 buffer_size, ui8 *data)
static void ms_encode(ms_struct *msp, ui32 cwd, int cwd_len)
static void mel_encode(mel_struct *melp, bool bit)
static void mel_emit_bit(mel_struct *melp, int v)
bool initialize_block_encoder_tables_avx2()
static void vlc_init(vlc_struct *vlcp, ui32 buffer_size, ui8 *data)
uint64_t ui64
Definition ojph_defs.h:56
uint16_t ui16
Definition ojph_defs.h:52
static ui32 population_count(ui32 val)
Definition ojph_arch.h:168
int32_t si32
Definition ojph_defs.h:55
uint32_t ui32
Definition ojph_defs.h:54
uint8_t ui8
Definition ojph_defs.h:50
#define ojph_max(a, b)
Definition ojph_defs.h:73
#define ojph_min(a, b)
Definition ojph_defs.h:76
#define ojph_unused(x)
Definition ojph_defs.h:78
#define OJPH_ERROR(t,...)