OpenJPH
Open-source implementation of JPEG2000 Part-15
Loading...
Searching...
No Matches
ojph_block_decoder_avx2.cpp
Go to the documentation of this file.
1//***************************************************************************/
2// This software is released under the 2-Clause BSD license, included
3// below.
4//
5// Copyright (c) 2022, Aous Naman
6// Copyright (c) 2022, Kakadu Software Pty Ltd, Australia
7// Copyright (c) 2022, The University of New South Wales, Australia
8// Copyright (c) 2024, Intel Corporation
9// Copyright (c) 2026, Osamu Watanabe
10//
11// Redistribution and use in source and binary forms, with or without
12// modification, are permitted provided that the following conditions are
13// met:
14//
15// 1. Redistributions of source code must retain the above copyright
16// notice, this list of conditions and the following disclaimer.
17//
18// 2. Redistributions in binary form must reproduce the above copyright
19// notice, this list of conditions and the following disclaimer in the
20// documentation and/or other materials provided with the distribution.
21//
22// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
23// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
24// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
25// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
26// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
28// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
29// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
30// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
31// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
32// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33//***************************************************************************/
34// This file is part of the OpenJPH software implementation.
35// File: ojph_block_decoder_avx2.cpp
36//***************************************************************************/
37
38//***************************************************************************/
42
43#include "ojph_arch.h"
44#if defined(OJPH_ARCH_I386) || defined(OJPH_ARCH_X86_64)
45
46#include <string>
47#include <iostream>
48
49#include <cassert>
50#include <cstring>
51#include "ojph_block_common.h"
52#include "ojph_block_decoder.h"
53#include "ojph_message.h"
54
55#include <immintrin.h>
56
57namespace ojph {
58 namespace local {
59
60 //************************************************************************/
67 struct dec_mel_st {
68 dec_mel_st() : data(NULL), tmp(0), bits(0), size(0), unstuff(false),
69 k(0), num_runs(0), runs(0)
70 {}
71 // data decoding machinery
72 ui8* data;
73 ui64 tmp;
74 int bits;
75 int size;
76 bool unstuff;
77 int k;
78
79 // queue of decoded runs
80 int num_runs;
81 ui64 runs;
82 };
83
84 //************************************************************************/
96 static inline
97 void mel_read(dec_mel_st *melp)
98 {
99 if (melp->bits > 32) //there are enough bits in the tmp variable
100 return; // return without reading new data
101
102 ui32 val = 0xFFFFFFFF; // feed in 0xFF if buffer is exhausted
103 if (melp->size > 4) { // if there is data in the MEL segment
104 memcpy(&val, melp->data, 4); // read 32 bits from MEL data
105 melp->data += 4; // advance pointer
106 melp->size -= 4; // reduce counter
107 }
108 else if (melp->size > 0)
109 { // 4 or less
110 int i = 0;
111 while (melp->size > 1) {
112 ui32 v = *melp->data++; // read one byte at a time
113 ui32 m = ~(0xFFu << i); // mask of location
114 val = (val & m) | (v << i);// put one byte in its correct location
115 --melp->size;
116 i += 8;
117 }
118 // size equal to 1
119 ui32 v = *melp->data++; // the one before the last is different
120 v |= 0xF; // MEL and VLC segments can overlap
121 ui32 m = ~(0xFFu << i);
122 val = (val & m) | (v << i);
123 --melp->size;
124 }
125
126 // next we unstuff them before adding them to the buffer
127 int bits = 32 - melp->unstuff; // number of bits in val, subtract 1 if
128 // the previously read byte requires
129 // unstuffing
130
131 // data is unstuffed and accumulated in t
132 // bits has the number of bits in t
133 ui32 t = val & 0xFF;
134 bool unstuff = ((val & 0xFF) == 0xFF); // true if we need unstuffing
135 bits -= unstuff; // there is one less bit in t if unstuffing is needed
136 t = t << (8 - unstuff); // move up to make room for the next byte
137
138 //this is a repeat of the above
139 t |= (val>>8) & 0xFF;
140 unstuff = (((val >> 8) & 0xFF) == 0xFF);
141 bits -= unstuff;
142 t = t << (8 - unstuff);
143
144 t |= (val>>16) & 0xFF;
145 unstuff = (((val >> 16) & 0xFF) == 0xFF);
146 bits -= unstuff;
147 t = t << (8 - unstuff);
148
149 t |= (val>>24) & 0xFF;
150 melp->unstuff = (((val >> 24) & 0xFF) == 0xFF);
151
152 // move t to tmp, and push the result all the way up, so we read from
153 // the MSB
154 melp->tmp |= ((ui64)t) << (64 - bits - melp->bits);
155 melp->bits += bits; //increment the number of bits in tmp
156 }
157
158 //************************************************************************/
173 static inline
174 void mel_decode(dec_mel_st *melp)
175 {
176 static const int mel_exp[13] = { //MEL exponents
177 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 5
178 };
179
180 if (melp->bits < 6) // if there are less than 6 bits in tmp
181 mel_read(melp); // then read from the MEL bitstream
182 // 6 bits is the largest decodable MEL cwd
183
184 //repeat so long that there is enough decodable bits in tmp,
185 // and the runs store is not full (num_runs < 8)
186 while (melp->bits >= 6 && melp->num_runs < 8)
187 {
188 int eval = mel_exp[melp->k]; // number of bits associated with state
189 int run = 0;
190 if (melp->tmp & (1ull<<63)) //The next bit to decode (stored in MSB)
191 { //one is found
192 run = 1 << eval;
193 run--; // consecutive runs of 0 events - 1
194 melp->k = melp->k + 1 < 12 ? melp->k + 1 : 12;//increment, max is 12
195 melp->tmp <<= 1; // consume one bit from tmp
196 melp->bits -= 1;
197 run = run << 1; // a stretch of zeros not terminating in one
198 }
199 else
200 { //0 is found
201 run = (int)(melp->tmp >> (63 - eval)) & ((1 << eval) - 1);
202 melp->k = melp->k - 1 > 0 ? melp->k - 1 : 0; //decrement, min is 0
203 melp->tmp <<= eval + 1; //consume eval + 1 bits (max is 6)
204 melp->bits -= eval + 1;
205 run = (run << 1) + 1; // a stretch of zeros terminating with one
206 }
207 eval = melp->num_runs * 7; // 7 bits per run
208 melp->runs &= ~((ui64)0x3F << eval); // 6 bits are sufficient
209 melp->runs |= ((ui64)run) << eval; // store the value in runs
210 melp->num_runs++; // increment count
211 }
212 }
213
214 //************************************************************************/
224 static inline
225 void mel_init(dec_mel_st *melp, ui8* bbuf, int lcup, int scup)
226 {
227 melp->data = bbuf + lcup - scup; // move the pointer to the start of MEL
228 melp->bits = 0; // 0 bits in tmp
229 melp->tmp = 0; //
230 melp->unstuff = false; // no unstuffing
231 melp->size = scup - 1; // size is the length of MEL+VLC-1
232 melp->k = 0; // 0 for state
233 melp->num_runs = 0; // num_runs is 0
234 melp->runs = 0; //
235
236 //This code is borrowed; original is for a different architecture
237 //These few lines take care of the case where data is not at a multiple
238 // of 4 boundary. It reads 1,2,3 up to 4 bytes from the MEL segment
239 int num = 4 - (int)(intptr_t(melp->data) & 0x3);
240 for (int i = 0; i < num; ++i) { // this code is similar to mel_read
241 assert(melp->unstuff == false || melp->data[0] <= 0x8F);
242 ui64 d = (melp->size > 0) ? *melp->data : 0xFF;//if buffer is consumed
243 //set data to 0xFF
244 if (melp->size == 1) d |= 0xF; //if this is MEL+VLC-1, set LSBs to 0xF
245 // see the standard
246 melp->data += melp->size-- > 0; //increment if the end is not reached
247 int d_bits = 8 - melp->unstuff; //if unstuffing is needed, reduce by 1
248 melp->tmp = (melp->tmp << d_bits) | d; //store bits in tmp
249 melp->bits += d_bits; //increment tmp by number of bits
250 melp->unstuff = ((d & 0xFF) == 0xFF); //true of next byte needs
251 //unstuffing
252 }
253 melp->tmp <<= (64 - melp->bits); //push all the way up so the first bit
254 // is the MSB
255 }
256
257 //************************************************************************/
263 static inline
264 int mel_get_run(dec_mel_st *melp)
265 {
266 if (melp->num_runs == 0) //if no runs, decode more bit from MEL segment
267 mel_decode(melp);
268
269 int t = melp->runs & 0x7F; //retrieve one run
270 melp->runs >>= 7; // remove the retrieved run
271 melp->num_runs--;
272 return t; // return run
273 }
274
275
276 //************************************************************************/
302 template<int X>
303 static inline
304 ui32 destuff_frwd(const ui8* src, int size, ui8* dst, ui32 cap)
305 {
306 if (size < 0)
307 size = 0;
308 ui8* o = dst;
309 ui8* o_end = dst + cap;
310 const ui8* s = src;
311 const ui8* s_end = src + size;
312 ui64 acc = 0; // partial output byte, low nb bits are valid
313 ui32 nb = 0; // number of valid bits in acc; always < 8
314 bool prev_ff = false;
315
316 // fast path; 16 source bytes at a time when they contain no 0xFF
317 while (s + 16 <= s_end && o + 24 <= o_end)
318 {
319 __m128i v = _mm_loadu_si128((const __m128i*)s);
320 int ff = _mm_movemask_epi8(_mm_cmpeq_epi8(v, _mm_set1_epi8(-1)));
321 if (ff != 0 || prev_ff)
322 { // process these 16 bytes one at a time
323 for (int i = 0; i < 16; ++i) {
324 ui8 b = *s++;
325 acc |= (ui64)b << nb;
326 nb += prev_ff ? 7u : 8u;
327 prev_ff = (b == 0xFFu);
328 if (nb >= 8) { *o++ = (ui8)acc; acc >>= 8; nb -= 8; }
329 }
330 continue;
331 }
332 ui64 v0, v1;
333 memcpy(&v0, s, 8);
334 memcpy(&v1, s + 8, 8);
335 ui64 w0 = acc | (v0 << nb);
336 ui64 w1 = (v1 << nb) | (nb ? (v0 >> (64 - nb)) : 0);
337 memcpy(o, &w0, 8);
338 memcpy(o + 8, &w1, 8);
339 acc = nb ? (v1 >> (64 - nb)) : 0;
340 o += 16;
341 s += 16;
342 }
343 // tail; one byte at a time
344 while (s < s_end && o < o_end)
345 {
346 ui8 b = *s++;
347 acc |= (ui64)b << nb;
348 nb += prev_ff ? 7u : 8u;
349 prev_ff = (b == 0xFFu);
350 if (nb >= 8) { *o++ = (ui8)acc; acc >>= 8; nb -= 8; }
351 }
352 // fill the bits above nb with X, and pad with X bytes
353 ui32 fill = (X == 0xFF) ? (0xFFu << nb) : 0;
354 *o = (ui8)((ui32)acc | fill);
355 __m256i pad = _mm256_set1_epi8((char)X);
356 _mm256_storeu_si256((__m256i*)(o + 1), pad);
357 _mm256_storeu_si256((__m256i*)(o + 33), pad);
358 return (ui32)(o - dst) + 1;
359 }
360
361 //************************************************************************/
376 __m128i dfetch(const ui8* dbuf, ui32 limit, ui32 pos)
377 {
378 ui32 off = pos >> 3;
379 off = off < limit ? off : limit;
380 const ui8* p = dbuf + off;
381 __m128i v = _mm_loadu_si128((const __m128i*)p);
382 __m128i w = _mm_loadu_si128((const __m128i*)(p + 8));
383 int k = (int)(pos & 7);
384 __m128i r = _mm_srl_epi64(v, _mm_cvtsi32_si128(k));
385 __m128i c = _mm_sll_epi64(w, _mm_cvtsi32_si128(64 - k));
386 return _mm_or_si128(r, c);
387 }
388
389 //************************************************************************/
401 ui64 dfetch64(const ui8* dbuf, ui32 limit, ui32 pos)
402 {
403 ui32 off = pos >> 3;
404 off = off < limit ? off : limit;
405 ui64 v;
406 memcpy(&v, dbuf + off, 8);
407 return v >> (pos & 7);
408 }
409
410 //************************************************************************/
430 void drefill(ui64& val, ui32& bits, ui32& off,
431 const ui8* dbuf, ui32 limit)
432 {
433 ui64 v;
434 ui32 o = off < limit ? off : limit;
435 memcpy(&v, dbuf + o, 8);
436 val |= v << bits;
437 off += (63 - bits) >> 3;
438 bits |= 56;
439 }
440
441 //************************************************************************/
449 void dconsume(ui64& val, ui32& bits, ui32 num_bits)
450 {
451 val >>= num_bits;
452 bits -= num_bits;
453 }
454
455 //************************************************************************/
485 static inline
486 ui32 destuff_rev(const ui8* p, int size, bool unstuff,
487 ui64 acc, ui32 nb, ui8* dst, ui32 cap)
488 {
489 ui8* o = dst;
490 ui8* o_end = dst + cap;
491
492 // process the first byte with the caller-provided unstuff state;
493 // afterwards the state is implied by the byte at p[1], which the
494 // fast path checks vectorially (and which stays inside the segment)
495 if (size > 0 && o < o_end)
496 {
497 ui32 d = *p--;
498 --size;
499 acc |= (ui64)d << nb;
500 nb += 8 - ((unstuff && ((d & 0x7F) == 0x7F)) ? 1u : 0u);
501 unstuff = d > 0x8F;
502 if (nb >= 8) { *o++ = (ui8)acc; acc >>= 8; nb -= 8; }
503 }
504
505 // fast path; 16 source bytes at a time when none needs unstuffing
506 while (size >= 16 && o + 24 <= o_end)
507 {
508 __m128i v = _mm_loadu_si128((const __m128i*)(p - 15));
509 __m128i nx = _mm_loadu_si128((const __m128i*)(p - 14));
510 __m128i is7f = _mm_cmpeq_epi8(
511 _mm_and_si128(v, _mm_set1_epi8(0x7F)), _mm_set1_epi8(0x7F));
512 // le8f is 0xFF where the byte after (in memory) is <= 0x8F
513 __m128i le8f = _mm_cmpeq_epi8(
514 _mm_subs_epu8(nx, _mm_set1_epi8((char)0x8F)),
515 _mm_setzero_si128());
516 __m128i stuff = _mm_andnot_si128(le8f, is7f);
517 if (!_mm_testz_si128(stuff, stuff))
518 { // process these 16 bytes one at a time
519 for (int i = 0; i < 16; ++i) {
520 ui32 d = *p--;
521 acc |= (ui64)d << nb;
522 nb += 8 - ((unstuff && ((d & 0x7F) == 0x7F)) ? 1u : 0u);
523 unstuff = d > 0x8F;
524 if (nb >= 8) { *o++ = (ui8)acc; acc >>= 8; nb -= 8; }
525 }
526 size -= 16;
527 continue;
528 }
529 __m128i r = _mm_shuffle_epi8(v,
530 _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,
531 8, 9, 10, 11, 12, 13, 14, 15)); // reverse bytes
532#ifdef OJPH_ARCH_X86_64
533 ui64 v0 = (ui64)_mm_cvtsi128_si64(r);
534 ui64 v1 = (ui64)_mm_extract_epi64(r, 1);
535#else // 32-bit x86 lacks the 64-bit extract intrinsics
536 ui64 v0 = (ui32)_mm_cvtsi128_si32(r)
537 | ((ui64)(ui32)_mm_extract_epi32(r, 1) << 32);
538 ui64 v1 = (ui32)_mm_extract_epi32(r, 2)
539 | ((ui64)(ui32)_mm_extract_epi32(r, 3) << 32);
540#endif
541 ui64 w0 = acc | (v0 << nb);
542 ui64 w1 = (v1 << nb) | (nb ? (v0 >> (64 - nb)) : 0);
543 memcpy(o, &w0, 8);
544 memcpy(o + 8, &w1, 8);
545 acc = nb ? (v1 >> (64 - nb)) : 0;
546 o += 16;
547 p -= 16;
548 size -= 16;
549 unstuff = p[1] > 0x8F;
550 }
551 // tail; one byte at a time
552 while (size > 0 && o < o_end)
553 {
554 ui32 d = *p--;
555 --size;
556 acc |= (ui64)d << nb;
557 nb += 8 - ((unstuff && ((d & 0x7F) == 0x7F)) ? 1u : 0u);
558 unstuff = d > 0x8F;
559 if (nb >= 8) { *o++ = (ui8)acc; acc >>= 8; nb -= 8; }
560 }
561 // the bits above nb are already 0 = fill; pad with zero bytes
562 *o = (ui8)acc;
563 __m256i z = _mm256_setzero_si256();
564 _mm256_storeu_si256((__m256i*)(o + 1), z);
565 _mm256_storeu_si256((__m256i*)(o + 33), z);
566 return (ui32)(o - dst) + 1;
567 }
568
569 //************************************************************************/
585 static inline
586 ui32 destuff_vlc(const ui8* data, int lcup, int scup,
587 ui8* dst, ui32 cap)
588 {
589 const ui8* p = data + lcup - 2;
590 ui32 d = *p; // first byte, only the upper 4 bits are used
591 ui64 acc = d >> 4;
592 ui32 nb = 4 - ((acc & 7) == 7); // check standard
593 bool unstuff = (d | 0xF) > 0x8F;
594 return destuff_rev(p - 1, scup - 2, unstuff, acc, nb, dst, cap);
595 }
596
597 //************************************************************************/
612 static inline
613 ui32 destuff_mrp(const ui8* data, int lcup, int len2,
614 ui8* dst, ui32 cap)
615 {
616 return destuff_rev(data + lcup + len2 - 1, len2, true, 0, 0,
617 dst, cap);
618 }
619
620 //************************************************************************/
631 __m256i decode_two_quad32_avx2(__m256i inf_u_q, __m256i U_q,
632 const ui8* dbuf, ui32 limit, ui32& pos,
633 ui32 p, __m128i& vn) {
634 __m256i row = _mm256_setzero_si256();
635
636 // we keeps e_k, e_1, and rho in w2
637 __m256i flags = _mm256_and_si256(inf_u_q, _mm256_set_epi32(0x8880, 0x4440, 0x2220, 0x1110, 0x8880, 0x4440, 0x2220, 0x1110));
638 __m256i insig = _mm256_cmpeq_epi32(flags, _mm256_setzero_si256());
639
640 if ((uint32_t)_mm256_movemask_epi8(insig) != (uint32_t)0xFFFFFFFF) //are all insignificant?
641 {
642 flags = _mm256_mullo_epi16(flags, _mm256_set_epi16(1, 1, 2, 2, 4, 4, 8, 8, 1, 1, 2, 2, 4, 4, 8, 8));
643
644 // U_q holds U_q for this quad
645 // flags has e_k, e_1, and rho such that e_k is sitting in the
646 // 0x8000, e_1 in 0x800, and rho in 0x80
647
648 // next e_k and m_n
649 __m256i m_n;
650 __m256i w0 = _mm256_srli_epi32(flags, 15); // e_k
651 m_n = _mm256_sub_epi32(U_q, w0);
652 m_n = _mm256_andnot_si256(insig, m_n);
653
654 // find cumulative sums
655 // to find at which bit in ms_vec the sample starts
656 __m256i inc_sum = m_n; // inclusive scan
657 inc_sum = _mm256_add_epi32(inc_sum, _mm256_bslli_epi128(inc_sum, 4));
658 inc_sum = _mm256_add_epi32(inc_sum, _mm256_bslli_epi128(inc_sum, 8));
659 ui32 total_mn1 = (ui32)_mm256_extract_epi16(inc_sum, 6);
660 ui32 total_mn2 = (ui32)_mm256_extract_epi16(inc_sum, 14);
661
662 __m128i ms_vec0 = dfetch(dbuf, limit, pos);
663 __m128i ms_vec1 = dfetch(dbuf, limit, pos + total_mn1);
664 pos += total_mn1 + total_mn2;
665
666 __m256i ms_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ms_vec0), ms_vec1, 0x1);
667
668 __m256i ex_sum = _mm256_bslli_epi128(inc_sum, 4); // exclusive scan
669
670 // find the starting byte and starting bit
671 __m256i byte_idx = _mm256_srli_epi32(ex_sum, 3);
672 __m256i bit_idx = _mm256_and_si256(ex_sum, _mm256_set1_epi32(7));
673 byte_idx = _mm256_shuffle_epi8(byte_idx,
674 _mm256_set_epi32(0x0C0C0C0C, 0x08080808, 0x04040404, 0x00000000, 0x0C0C0C0C, 0x08080808, 0x04040404, 0x00000000));
675 byte_idx = _mm256_add_epi32(byte_idx, _mm256_set1_epi32(0x03020100));
676 __m256i d0 = _mm256_shuffle_epi8(ms_vec, byte_idx);
677 byte_idx = _mm256_add_epi32(byte_idx, _mm256_set1_epi32(0x01010101));
678 __m256i d1 = _mm256_shuffle_epi8(ms_vec, byte_idx);
679
680 // shift samples values to correct location
681 bit_idx = _mm256_or_si256(bit_idx, _mm256_slli_epi32(bit_idx, 16));
682
683 __m128i a = _mm_set_epi8(1, 3, 7, 15, 31, 63, 127, -1, 1, 3, 7, 15, 31, 63, 127, -1);
684 __m256i aa = _mm256_inserti128_si256(_mm256_castsi128_si256(a), a, 0x1);
685
686 __m256i bit_shift = _mm256_shuffle_epi8(aa, bit_idx);
687 bit_shift = _mm256_add_epi16(bit_shift, _mm256_set1_epi16(0x0101));
688 d0 = _mm256_mullo_epi16(d0, bit_shift);
689 d0 = _mm256_srli_epi16(d0, 8); // we should have 8 bits in the LSB
690 d1 = _mm256_mullo_epi16(d1, bit_shift);
691 d1 = _mm256_and_si256(d1, _mm256_set1_epi32((si32)0xFF00FF00)); // 8 in MSB
692 d0 = _mm256_or_si256(d0, d1);
693
694 // find location of e_k and mask
695 __m256i shift;
696 __m256i ones = _mm256_set1_epi32(1);
697 __m256i twos = _mm256_set1_epi32(2);
698 __m256i U_q_m1 = _mm256_sub_epi32(U_q, ones);
699 U_q_m1 = _mm256_and_si256(U_q_m1, _mm256_set_epi32(0, 0, 0, 0x1F, 0, 0, 0, 0x1F));
700 U_q_m1 = _mm256_shuffle_epi32(U_q_m1, 0);
701 w0 = _mm256_sub_epi32(twos, w0);
702 shift = _mm256_sllv_epi32(w0, U_q_m1); // U_q_m1 must be no more than 31
703 ms_vec = _mm256_and_si256(d0, _mm256_sub_epi32(shift, ones));
704
705 // next e_1
706 w0 = _mm256_and_si256(flags, _mm256_set1_epi32(0x800));
707 w0 = _mm256_cmpeq_epi32(w0, _mm256_setzero_si256());
708 w0 = _mm256_andnot_si256(w0, shift); // e_1 in correct position
709 ms_vec = _mm256_or_si256(ms_vec, w0); // e_1
710 w0 = _mm256_slli_epi32(ms_vec, 31); // sign
711 ms_vec = _mm256_or_si256(ms_vec, ones); // bin center
712 __m256i tvn = ms_vec;
713 ms_vec = _mm256_add_epi32(ms_vec, twos);// + 2
714 ms_vec = _mm256_slli_epi32(ms_vec, (si32)p - 1);
715 ms_vec = _mm256_or_si256(ms_vec, w0); // sign
716 row = _mm256_andnot_si256(insig, ms_vec); // significant only
717
718 ms_vec = _mm256_andnot_si256(insig, tvn); // significant only
719
720 tvn = _mm256_shuffle_epi8(ms_vec, _mm256_set_epi32(-1, 0x0F0E0D0C, 0x07060504, -1, -1, -1, 0x0F0E0D0C, 0x07060504));
721
722 vn = _mm_or_si128(vn, _mm256_castsi256_si128(tvn));
723 vn = _mm_or_si128(vn, _mm256_extracti128_si256(tvn, 0x1));
724 }
725 return row;
726 }
727
728
729 //************************************************************************/
739
741 __m256i decode_four_quad16(const __m128i inf_u_q, __m128i U_q,
742 const ui8* dbuf, ui32 limit, ui32& pos,
743 ui32 p, __m128i& vn) {
744
745 __m256i w0; // workers
746 __m256i insig; // lanes hold FF's if samples are insignificant
747 __m256i flags; // lanes hold e_k, e_1, and rho
748
749 __m256i row = _mm256_setzero_si256();
750 __m128i ddd = _mm_shuffle_epi8(inf_u_q,
751 _mm_set_epi16(0x0d0c, 0x0d0c, 0x0908, 0x908, 0x0504, 0x0504, 0x0100, 0x0100));
752 w0 = _mm256_permutevar8x32_epi32(_mm256_castsi128_si256(ddd),
753 _mm256_setr_epi32(0, 0, 1, 1, 2, 2, 3, 3));
754 // we keeps e_k, e_1, and rho in w2
755 flags = _mm256_and_si256(w0,
756 _mm256_set_epi16((si16)0x8880, 0x4440, 0x2220, 0x1110,
757 (si16)0x8880, 0x4440, 0x2220, 0x1110,
758 (si16)0x8880, 0x4440, 0x2220, 0x1110,
759 (si16)0x8880, 0x4440, 0x2220, 0x1110));
760 insig = _mm256_cmpeq_epi16(flags, _mm256_setzero_si256());
761 if ((uint32_t)_mm256_movemask_epi8(insig) != (uint32_t)0xFFFFFFFF) //are all insignificant?
762 {
763 ddd = _mm_or_si128(_mm_bslli_si128(U_q, 2), U_q);
764 __m256i U_q_avx = _mm256_permutevar8x32_epi32(_mm256_castsi128_si256(ddd),
765 _mm256_setr_epi32(0, 0, 1, 1, 2, 2, 3, 3));
766 flags = _mm256_mullo_epi16(flags, _mm256_set_epi16(1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8));
767
768 // U_q holds U_q for this quad
769 // flags has e_k, e_1, and rho such that e_k is sitting in the
770 // 0x8000, e_1 in 0x800, and rho in 0x80
771
772 // next e_k and m_n
773 __m256i m_n;
774 w0 = _mm256_srli_epi16(flags, 15); // e_k
775 m_n = _mm256_sub_epi16(U_q_avx, w0);
776 m_n = _mm256_andnot_si256(insig, m_n);
777
778 // find cumulative sums
779 // to find at which bit in ms_vec the sample starts
780 __m256i inc_sum = m_n; // inclusive scan
781 inc_sum = _mm256_add_epi16(inc_sum, _mm256_bslli_epi128(inc_sum, 2));
782 inc_sum = _mm256_add_epi16(inc_sum, _mm256_bslli_epi128(inc_sum, 4));
783 inc_sum = _mm256_add_epi16(inc_sum, _mm256_bslli_epi128(inc_sum, 8));
784 ui32 total_mn1 = (ui32)_mm256_extract_epi16(inc_sum, 7);
785 ui32 total_mn2 = (ui32)_mm256_extract_epi16(inc_sum, 15);
786 __m256i ex_sum = _mm256_bslli_epi128(inc_sum, 2); // exclusive scan
787
788 __m128i ms_vec0 = dfetch(dbuf, limit, pos);
789 __m128i ms_vec1 = dfetch(dbuf, limit, pos + total_mn1);
790 pos += total_mn1 + total_mn2;
791
792 __m256i ms_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ms_vec0), ms_vec1, 0x1);
793
794 // find the starting byte and starting bit
795 __m256i byte_idx = _mm256_srli_epi16(ex_sum, 3);
796 __m256i bit_idx = _mm256_and_si256(ex_sum, _mm256_set1_epi16(7));
797 byte_idx = _mm256_shuffle_epi8(byte_idx,
798 _mm256_set_epi16(0x0E0E, 0x0C0C, 0x0A0A, 0x0808,
799 0x0606, 0x0404, 0x0202, 0x0000, 0x0E0E, 0x0C0C, 0x0A0A, 0x0808,
800 0x0606, 0x0404, 0x0202, 0x0000));
801 byte_idx = _mm256_add_epi16(byte_idx, _mm256_set1_epi16(0x0100));
802 __m256i d0 = _mm256_shuffle_epi8(ms_vec, byte_idx);
803 byte_idx = _mm256_add_epi16(byte_idx, _mm256_set1_epi16(0x0101));
804 __m256i d1 = _mm256_shuffle_epi8(ms_vec, byte_idx);
805
806 // shift samples values to correct location
807 __m256i bit_shift = _mm256_shuffle_epi8(
808 _mm256_set_epi8(1, 3, 7, 15, 31, 63, 127, -1,
809 1, 3, 7, 15, 31, 63, 127, -1, 1, 3, 7, 15, 31, 63, 127, -1,
810 1, 3, 7, 15, 31, 63, 127, -1), bit_idx);
811 bit_shift = _mm256_add_epi16(bit_shift, _mm256_set1_epi16(0x0101));
812 d0 = _mm256_mullo_epi16(d0, bit_shift);
813 d0 = _mm256_srli_epi16(d0, 8); // we should have 8 bits in the LSB
814 d1 = _mm256_mullo_epi16(d1, bit_shift);
815 d1 = _mm256_and_si256(d1, _mm256_set1_epi16((si16)0xFF00)); // 8 in MSB
816 d0 = _mm256_or_si256(d0, d1);
817
818 // find location of e_k and mask
819 __m256i shift;
820 __m256i ones = _mm256_set1_epi16(1);
821 __m256i twos = _mm256_set1_epi16(2);
822 // shift = (2 - e_k) << (U_q - 1); AVX2 has no _mm256_sllv_epi16,
823 // so the variable shift is emulated with a pshufb power-of-two
824 // lookup and a 16-bit multiply. U_q - 1 <= 14 in this path;
825 // for U_q == 0 the lookup indices have their MSB set, so pshufb
826 // returns 0, the same shift value the former uniform 16-bit
827 // shift emulation produced (it shifted by 31)
828 __m256i kq = _mm256_sub_epi16(U_q_avx, ones);
829 __m256i idx = _mm256_or_si256(kq,
830 _mm256_slli_epi16(_mm256_sub_epi16(kq,
831 _mm256_set1_epi16(8)), 8));
832 const __m256i pow2_tbl = _mm256_setr_epi8(
833 1, 2, 4, 8, 16, 32, 64, (char)128, 0, 0, 0, 0, 0, 0, 0, 0,
834 1, 2, 4, 8, 16, 32, 64, (char)128, 0, 0, 0, 0, 0, 0, 0, 0);
835 __m256i pow2 = _mm256_shuffle_epi8(pow2_tbl, idx);
836 w0 = _mm256_sub_epi16(twos, w0);
837 shift = _mm256_mullo_epi16(w0, pow2);
838 ms_vec = _mm256_and_si256(d0, _mm256_sub_epi16(shift, ones));
839
840 // next e_1
841 w0 = _mm256_and_si256(flags, _mm256_set1_epi16(0x800));
842 w0 = _mm256_cmpeq_epi16(w0, _mm256_setzero_si256());
843 w0 = _mm256_andnot_si256(w0, shift); // e_1 in correct position
844 ms_vec = _mm256_or_si256(ms_vec, w0); // e_1
845 w0 = _mm256_slli_epi16(ms_vec, 15); // sign
846 ms_vec = _mm256_or_si256(ms_vec, ones); // bin center
847 __m256i tvn = ms_vec;
848 ms_vec = _mm256_add_epi16(ms_vec, twos);// + 2
849 ms_vec = _mm256_slli_epi16(ms_vec, (si32)p - 1);
850 ms_vec = _mm256_or_si256(ms_vec, w0); // sign
851 row = _mm256_andnot_si256(insig, ms_vec); // significant only
852
853 ms_vec = _mm256_andnot_si256(insig, tvn); // significant only
854
855 __m256i ms_vec_shuffle1 = _mm256_shuffle_epi8(ms_vec,
856 _mm256_set_epi16(-1, -1, -1, -1, 0x0706, 0x0302, -1, -1,
857 -1, -1, -1, -1, -1, -1, 0x0706, 0x0302));
858 __m256i ms_vec_shuffle2 = _mm256_shuffle_epi8(ms_vec,
859 _mm256_set_epi16(-1, -1, -1, 0x0F0E, 0x0B0A, -1, -1, -1,
860 -1, -1, -1, -1, -1, 0x0F0E, 0x0B0A, -1));
861 ms_vec = _mm256_or_si256(ms_vec_shuffle1, ms_vec_shuffle2);
862
863 vn = _mm_or_si128(vn, _mm256_castsi256_si128(ms_vec));
864 vn = _mm_or_si128(vn, _mm256_extracti128_si256(ms_vec, 0x1));
865 }
866 return row;
867 }
868
869 // https://stackoverflow.com/a/58827596
870 inline __m256i avx2_lzcnt_epi32(__m256i v) {
871 // prevent value from being rounded up to the next power of two
872 v = _mm256_andnot_si256(_mm256_srli_epi32(v, 8), v); // keep 8 MSB
873
874 v = _mm256_castps_si256(_mm256_cvtepi32_ps(v)); // convert an integer to float
875 v = _mm256_srli_epi32(v, 23); // shift down the exponent
876 v = _mm256_subs_epu16(_mm256_set1_epi32(158), v); // undo bias
877 v = _mm256_min_epi16(v, _mm256_set1_epi32(32)); // clamp at 32
878
879 return v;
880 }
881
882 //************************************************************************/
899 //************************************************************************/
909 bool decode_cb_step2_16bit(ui16* scratch, ui32* decoded_data,
910 ui8* coded_data, ui32 width, ui32 height,
911 ui32 stride, ui32 sstr, ui32 p, ui32 mmsbp2,
912 int lcup, int scup)
913 {
914 // reduce bitplane by 16 because we now have 16 bits instead of 32
915 p -= 16;
916
917 const int v_n_size = 512 + 16;
918#ifdef __MINGW64__
919 ui16 v_n_scratch[v_n_size] = {0};
920 ui32 v_n_scratch_32[v_n_size] = {0};
921#else
922 ui16 v_n_scratch[v_n_size];
923 memset(v_n_scratch + (width >> 1) + 4, 0, 8 * sizeof(ui16));
924 ui32 v_n_scratch_32[v_n_size];
925#endif
926
927 // maximum consumable MagSgn bits: 4096 samples x (mmsbp2 < 16) bits
928 const ui32 dbuf_cap = 4096 * 15 / 8;
929 ui8 dbuf[dbuf_cap + 72];
930 ui32 limit = destuff_frwd<0xFF>(coded_data, lcup - scup, dbuf, dbuf_cap);
931 ui32 pos = 0;
932
933 {
934 ui16 *sp = scratch;
935 ui16 *vp = v_n_scratch;
936 ui32 *dp = decoded_data;
937 vp[0] = 2; // for easy calculation of emax
938
939 for (ui32 x = 0; x < width; x += 8, sp += 8, vp += 4, dp += 8) {
941 __m128i inf_u_q = _mm_loadu_si128((__m128i*)sp);
942 __m128i U_q = _mm_srli_epi32(inf_u_q, 16);
943 __m128i w = _mm_cmpgt_epi32(U_q, _mm_set1_epi32((int)mmsbp2));
944 if (!_mm_testz_si128(w, w)) {
945 return false;
946 }
947
948 __m128i vn = _mm_set1_epi16(2);
949 __m256i row = decode_four_quad16(inf_u_q, U_q, dbuf, limit, pos, p, vn);
950
951 w = _mm_cvtsi32_si128(*(unsigned short const*)(vp));
952 _mm_storeu_si128((__m128i*)vp, _mm_or_si128(w, vn));
953
954 __m256i w0 = _mm256_shuffle_epi8(row, _mm256_set_epi16(0x0D0C, -1, 0x0908, -1, 0x0504, -1, 0x0100, -1, 0x0D0C, -1, 0x0908, -1, 0x0504, -1, 0x0100, -1));
955 __m256i w1 = _mm256_shuffle_epi8(row, _mm256_set_epi16(0x0F0E, -1, 0x0B0A, -1, 0x0706, -1, 0x0302, -1, 0x0F0E, -1, 0x0B0A, -1, 0x0706, -1, 0x0302, -1));
956
957 _mm256_storeu_si256((__m256i*)dp, w0);
958 _mm256_storeu_si256((__m256i*)(dp + stride), w1);
959 }
960 }
961
962 for (ui32 y = 2; y < height; y += 2) {
963 {
964 // perform 15 - count_leading_zeros(*vp) here
965 ui16 *vp = v_n_scratch;
966 ui32 *vp_32 = v_n_scratch_32;
967
968 ui16* sp = scratch + (y >> 1) * sstr;
969 const __m256i avx_mmsbp2 = _mm256_set1_epi32((int)mmsbp2);
970 const __m256i avx_31 = _mm256_set1_epi32(31);
971 const __m256i avx_f0 = _mm256_set1_epi32(0xF0);
972 const __m256i avx_1 = _mm256_set1_epi32(1);
973 const __m256i avx_0 = _mm256_setzero_si256();
974
975 for (ui32 x = 0; x <= width; x += 16, vp += 8, sp += 16, vp_32 += 8) {
976 __m128i v = _mm_loadu_si128((__m128i*)vp);
977 __m128i v_p1 = _mm_loadu_si128((__m128i*)(vp + 1));
978 v = _mm_or_si128(v, v_p1);
979
980 __m256i v_avx = _mm256_cvtepu16_epi32(v);
981 v_avx = avx2_lzcnt_epi32(v_avx);
982 v_avx = _mm256_sub_epi32(avx_31, v_avx);
983
984 __m256i inf_u_q = _mm256_loadu_si256((__m256i*)sp);
985 __m256i gamma = _mm256_and_si256(inf_u_q, avx_f0);
986 __m256i w0 = _mm256_sub_epi32(gamma, avx_1);
987 gamma = _mm256_and_si256(gamma, w0);
988 gamma = _mm256_cmpeq_epi32(gamma, avx_0);
989
990 v_avx = _mm256_andnot_si256(gamma, v_avx);
991 v_avx = _mm256_max_epi32(v_avx, avx_1);
992
993 inf_u_q = _mm256_srli_epi32(inf_u_q, 16);
994 v_avx = _mm256_add_epi32(inf_u_q, v_avx);
995
996 w0 = _mm256_cmpgt_epi32(v_avx, avx_mmsbp2);
997 if (!_mm256_testz_si256(w0, w0)) {
998 return false;
999 }
1000
1001 _mm256_storeu_si256((__m256i*)vp_32, v_avx);
1002 }
1003 }
1004
1005 ui16 *vp = v_n_scratch;
1006 ui32* vp_32 = v_n_scratch_32;
1007 ui16 *sp = scratch + (y >> 1) * sstr;
1008 ui32 *dp = decoded_data + y * stride;
1009 vp[0] = 2; // for easy calculation of emax
1010
1011 for (ui32 x = 0; x < width; x += 8, sp += 8, vp += 4, dp += 8, vp_32 += 4) {
1013 __m128i inf_u_q = _mm_loadu_si128((__m128i*)sp);
1014 __m128i U_q = _mm_loadu_si128((__m128i*)vp_32);
1015
1016 __m128i vn = _mm_set1_epi16(2);
1017 __m256i row = decode_four_quad16(inf_u_q, U_q, dbuf, limit, pos, p, vn);
1018
1019 __m128i w = _mm_cvtsi32_si128(*(unsigned short const*)(vp));
1020 _mm_storeu_si128((__m128i*)vp, _mm_or_si128(w, vn));
1021
1022 __m256i w0 = _mm256_shuffle_epi8(row, _mm256_set_epi16(0x0D0C, -1, 0x0908, -1, 0x0504, -1, 0x0100, -1, 0x0D0C, -1, 0x0908, -1, 0x0504, -1, 0x0100, -1));
1023 __m256i w1 = _mm256_shuffle_epi8(row, _mm256_set_epi16(0x0F0E, -1, 0x0B0A, -1, 0x0706, -1, 0x0302, -1, 0x0F0E, -1, 0x0B0A, -1, 0x0706, -1, 0x0302, -1));
1024
1025 _mm256_storeu_si256((__m256i*)dp, w0);
1026 _mm256_storeu_si256((__m256i*)(dp + stride), w1);
1027 }
1028 }
1029 return true;
1030 }
1031
1032 //************************************************************************/
1041 bool decode_cb_step2_32bit(ui16* scratch, ui32* decoded_data,
1042 ui8* coded_data, ui32 width, ui32 height,
1043 ui32 stride, ui32 sstr, ui32 p, ui32 mmsbp2,
1044 int lcup, int scup)
1045 {
1046 const int v_n_size = 512 + 16;
1047#ifdef __MINGW64__
1048 ui32 v_n_scratch[2 * v_n_size] = {0};
1049#else
1050 ui32 v_n_scratch[2 * v_n_size];
1051 memset(v_n_scratch + (width >> 1) + 2, 0, 14 * sizeof(ui32));
1052#endif
1053
1054 // maximum consumable MagSgn bits: 4096 samples x (mmsbp2 <= 32) bits
1055 const ui32 dbuf_cap = 4096 * 32 / 8;
1056 ui8 dbuf[dbuf_cap + 72];
1057 ui32 limit = destuff_frwd<0xFF>(coded_data, lcup - scup, dbuf, dbuf_cap);
1058 ui32 pos = 0;
1059
1060 const __m256i avx_mmsbp2 = _mm256_set1_epi32((int)mmsbp2);
1061
1062 {
1063 ui16 *sp = scratch;
1064 ui32 *vp = v_n_scratch;
1065 ui32 *dp = decoded_data;
1066 vp[0] = 2; // for easy calculation of emax
1067
1068 for (ui32 x = 0; x < width; x += 4, sp += 4, vp += 2, dp += 4)
1069 {
1070 __m128i vn = _mm_set1_epi32(2);
1071
1072 __m256i inf_u_q = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)sp));
1073 inf_u_q = _mm256_permutevar8x32_epi32(inf_u_q, _mm256_setr_epi32(0, 0, 0, 0, 1, 1, 1, 1));
1074
1075 __m256i U_q = _mm256_srli_epi32(inf_u_q, 16);
1076 __m256i w = _mm256_cmpgt_epi32(U_q, avx_mmsbp2);
1077 if (!_mm256_testz_si256(w, w)) {
1078 return false;
1079 }
1080
1081 __m256i row = decode_two_quad32_avx2(inf_u_q, U_q, dbuf, limit, pos, p, vn);
1082 row = _mm256_permutevar8x32_epi32(row, _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7));
1083 _mm_store_si128((__m128i*)dp, _mm256_castsi256_si128(row));
1084 _mm_store_si128((__m128i*)(dp + stride), _mm256_extracti128_si256(row, 0x1));
1085
1086 __m128i w0 = _mm_cvtsi32_si128(*(int const*)vp);
1087 w0 = _mm_or_si128(w0, vn);
1088 _mm_storeu_si128((__m128i*)vp, w0);
1089 }
1090 }
1091
1092 for (ui32 y = 2; y < height; y += 2)
1093 {
1094 {
1095 // perform 31 - count_leading_zeros(*vp) here
1096 ui32 *vp = v_n_scratch;
1097 ui16* sp = scratch + (y >> 1) * sstr;
1098
1099 const __m256i avx_31 = _mm256_set1_epi32(31);
1100 const __m256i avx_f0 = _mm256_set1_epi32(0xF0);
1101 const __m256i avx_1 = _mm256_set1_epi32(1);
1102 const __m256i avx_0 = _mm256_setzero_si256();
1103
1104 for (ui32 x = 0; x <= width; x += 16, vp += 8, sp += 16) {
1105 __m256i v = _mm256_loadu_si256((__m256i*)vp);
1106 __m256i v_p1 = _mm256_loadu_si256((__m256i*)(vp + 1));
1107 v = _mm256_or_si256(v, v_p1);
1108 v = avx2_lzcnt_epi32(v);
1109 v = _mm256_sub_epi32(avx_31, v);
1110
1111 __m256i inf_u_q = _mm256_loadu_si256((__m256i*)sp);
1112 __m256i gamma = _mm256_and_si256(inf_u_q, avx_f0);
1113 __m256i w0 = _mm256_sub_epi32(gamma, avx_1);
1114 gamma = _mm256_and_si256(gamma, w0);
1115 gamma = _mm256_cmpeq_epi32(gamma, avx_0);
1116
1117 v = _mm256_andnot_si256(gamma, v);
1118 v = _mm256_max_epi32(v, avx_1);
1119
1120 inf_u_q = _mm256_srli_epi32(inf_u_q, 16);
1121 v = _mm256_add_epi32(inf_u_q, v);
1122
1123 w0 = _mm256_cmpgt_epi32(v, avx_mmsbp2);
1124 if (!_mm256_testz_si256(w0, w0)) {
1125 return false;
1126 }
1127
1128 _mm256_storeu_si256((__m256i*)(vp + v_n_size), v);
1129 }
1130 }
1131
1132 ui32 *vp = v_n_scratch;
1133 ui16 *sp = scratch + (y >> 1) * sstr;
1134 ui32 *dp = decoded_data + y * stride;
1135 vp[0] = 2; // for easy calculation of emax
1136
1137 for (ui32 x = 0; x < width; x += 4, sp += 4, vp += 2, dp += 4) {
1138 //process two quads
1139 __m128i vn = _mm_set1_epi32(2);
1140
1141 __m256i inf_u_q = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)sp));
1142 inf_u_q = _mm256_permutevar8x32_epi32(inf_u_q, _mm256_setr_epi32(0, 0, 0, 0, 1, 1, 1, 1));
1143
1144 __m256i U_q = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)(vp + v_n_size)));
1145 U_q = _mm256_permutevar8x32_epi32(U_q, _mm256_setr_epi32(0, 0, 0, 0, 1, 1, 1, 1));
1146
1147 __m256i row = decode_two_quad32_avx2(inf_u_q, U_q, dbuf, limit, pos, p, vn);
1148 row = _mm256_permutevar8x32_epi32(row, _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7));
1149 _mm_store_si128((__m128i*)dp, _mm256_castsi256_si128(row));
1150 _mm_store_si128((__m128i*)(dp + stride), _mm256_extracti128_si256(row, 0x1));
1151
1152 __m128i w0 = _mm_cvtsi32_si128(*(int const*)vp);
1153 w0 = _mm_or_si128(w0, vn);
1154 _mm_storeu_si128((__m128i*)vp, w0);
1155 }
1156 }
1157 return true;
1158 }
1159
1160 //************************************************************************/
1168 void decode_cb_spp_mrp(ui16* scratch, ui32* decoded_data, ui8* coded_data,
1169 ui32 width, ui32 height, ui32 stride, ui32 sstr,
1170 ui32 p, ui32 num_passes, ui32 lengths1,
1171 ui32 lengths2, bool stripe_causal)
1172 {
1173 // We use scratch again, we can divide it into multiple regions
1174 // sigma holds all the significant samples, and it cannot
1175 // be modified after it is set. it will be used during the
1176 // Magnitude Refinement Pass
1177 ui16* const sigma = scratch;
1178
1179 ui32 mstr = (width + 3u) >> 2; // divide by 4, since each
1180 // ui16 contains 4 columns
1181 mstr = ((mstr + 2u) + 7u) & ~7u; // multiples of 8
1182
1183 // We re-arrange quad significance, where each 4 consecutive
1184 // bits represent one quad, into column significance, where,
1185 // each 4 consequtive bits represent one column of 4 rows
1186 {
1187 ui32 y;
1188
1189 const __m128i mask_3 = _mm_set1_epi32(0x30);
1190 const __m128i mask_C = _mm_set1_epi32(0xC0);
1191 const __m128i shuffle_mask = _mm_set_epi32(-1, -1, -1, 0x0C080400);
1192 for (y = 0; y < height; y += 4)
1193 {
1194 ui16* sp = scratch + (y >> 1) * sstr;
1195 ui16* dp = sigma + (y >> 2) * mstr;
1196 for (ui32 x = 0; x < width; x += 8, sp += 8, dp += 2)
1197 {
1198 __m128i s0, s1, u3, uC, t0, t1;
1199
1200 s0 = _mm_loadu_si128((__m128i*)(sp));
1201 u3 = _mm_and_si128(s0, mask_3);
1202 u3 = _mm_srli_epi32(u3, 4);
1203 uC = _mm_and_si128(s0, mask_C);
1204 uC = _mm_srli_epi32(uC, 2);
1205 t0 = _mm_or_si128(u3, uC);
1206
1207 s1 = _mm_loadu_si128((__m128i*)(sp + sstr));
1208 u3 = _mm_and_si128(s1, mask_3);
1209 u3 = _mm_srli_epi32(u3, 2);
1210 uC = _mm_and_si128(s1, mask_C);
1211 t1 = _mm_or_si128(u3, uC);
1212
1213 __m128i r = _mm_or_si128(t0, t1);
1214 r = _mm_shuffle_epi8(r, shuffle_mask);
1215
1216 ui32 t = (ui32)_mm_extract_epi32(r, 0);
1217 memcpy(dp, &t, 4);
1218 }
1219 dp[0] = 0; // set an extra entry on the right with 0
1220 }
1221 {
1222 // reset one row after the codeblock
1223 ui16* dp = sigma + (y >> 2) * mstr;
1224 __m128i zero = _mm_setzero_si128();
1225 for (ui32 x = 0; x < width; x += 32, dp += 8)
1226 _mm_storeu_si128((__m128i*)dp, zero);
1227 dp[0] = 0; // set an extra entry on the right with 0
1228 }
1229 }
1230
1231 // We perform Significance Propagation Pass here
1232 {
1233 // This stores significance information of the previous
1234 // 4 rows. Significance information in this array includes
1235 // all signicant samples in bitplane p - 1; that is,
1236 // significant samples for bitplane p (discovered during the
1237 // cleanup pass and stored in sigma) and samples that have recently
1238 // became significant (during the SPP) in bitplane p-1.
1239 // We store enough for the widest row, containing 1024 columns,
1240 // which is equivalent to 256 of ui16, since each stores 4 columns.
1241 // We add an extra 8 entries, just in case we need more
1242 ui16 prev_row_sig[256 + 8] = {0}; // 528 Bytes
1243
1244 // maximum consumable SPP bits: 4096 samples x 2 bits (one
1245 // significance bit and one sign bit per sample)
1246 const ui32 spp_cap = 4096 * 2 / 8;
1247 ui8 spp_buf[spp_cap + 72];
1248 ui32 spp_limit = destuff_frwd<0>(coded_data + lengths1,
1249 (int)lengths2, spp_buf, spp_cap);
1250 ui32 spp_pos = 0;
1251
1252 for (ui32 y = 0; y < height; y += 4)
1253 {
1254 ui32 pattern = 0xFFFFu; // a pattern needed samples
1255 if (height - y < 4) {
1256 pattern = 0x7777u;
1257 if (height - y < 3) {
1258 pattern = 0x3333u;
1259 if (height - y < 2)
1260 pattern = 0x1111u;
1261 }
1262 }
1263
1264 // prev holds sign. info. for the previous quad, together
1265 // with the rows on top of it and below it.
1266 ui32 prev = 0;
1267 ui16 *prev_sig = prev_row_sig;
1268 ui16 *cur_sig = sigma + (y >> 2) * mstr;
1269 ui32 *dpp = decoded_data + y * stride;
1270 for (ui32 x = 0; x < width; x += 4, dpp += 4, ++cur_sig, ++prev_sig)
1271 {
1272 // only rows and columns inside the stripe are included
1273 si32 s = (si32)x + 4 - (si32)width;
1274 s = ojph_max(s, 0);
1275 pattern = pattern >> (s * 4);
1276
1277 // We first find locations that need to be tested (potential
1278 // SPP members); these location will end up in mbr
1279 // In each iteration, we produce 16 bits because cwd can have
1280 // up to 16 bits of significance information, followed by the
1281 // corresponding 16 bits of sign information; therefore, it is
1282 // sufficient to fetch 32 bit data per loop.
1283
1284 // Althougth we are interested in 16 bits only, we load 32 bits.
1285 // For the 16 bits we are producing, we need the next 4 bits --
1286 // We need data for at least 5 columns out of 8.
1287 // Therefore loading 32 bits is easier than loading 16 bits
1288 // twice.
1289 ui32 ps, ns, cs;
1290 memcpy(&ps, prev_sig, 4);
1291 memcpy(&ns, cur_sig + mstr, 4);
1292 ui32 u = (ps & 0x88888888) >> 3; // the row on top
1293 if (!stripe_causal)
1294 u |= (ns & 0x11111111) << 3; // the row below
1295
1296 memcpy(&cs, cur_sig, 4);
1297 // vertical integration
1298 ui32 mbr = cs; // this sig. info.
1299 mbr |= (cs & 0x77777777) << 1; //above neighbors
1300 mbr |= (cs & 0xEEEEEEEE) >> 1; //below neighbors
1301 mbr |= u;
1302 // horizontal integration
1303 ui32 t = mbr;
1304 mbr |= t << 4; // neighbors on the left
1305 mbr |= t >> 4; // neighbors on the right
1306 mbr |= prev >> 12; // significance of previous group
1307
1308 // remove outside samples, and already significant samples
1309 mbr &= pattern;
1310 mbr &= ~cs;
1311
1312 // find samples that become significant during the SPP
1313 ui32 new_sig = mbr;
1314 if (new_sig)
1315 {
1316 ui64 cwd = dfetch64(spp_buf, spp_limit, spp_pos);
1317
1318 ui32 cnt = 0;
1319 ui32 col_mask = 0xFu;
1320 ui32 inv_sig = ~cs & pattern;
1321 for (int i = 0; i < 16; i += 4, col_mask <<= 4)
1322 {
1323 if ((col_mask & new_sig) == 0)
1324 continue;
1325
1326 //scan one column
1327 ui32 sample_mask = 0x1111u & col_mask;
1328 if (new_sig & sample_mask)
1329 {
1330 new_sig &= ~sample_mask;
1331 if (cwd & 1)
1332 {
1333 ui32 t = 0x33u << i;
1334 new_sig |= t & inv_sig;
1335 }
1336 cwd >>= 1; ++cnt;
1337 }
1338
1339 sample_mask <<= 1;
1340 if (new_sig & sample_mask)
1341 {
1342 new_sig &= ~sample_mask;
1343 if (cwd & 1)
1344 {
1345 ui32 t = 0x76u << i;
1346 new_sig |= t & inv_sig;
1347 }
1348 cwd >>= 1; ++cnt;
1349 }
1350
1351 sample_mask <<= 1;
1352 if (new_sig & sample_mask)
1353 {
1354 new_sig &= ~sample_mask;
1355 if (cwd & 1)
1356 {
1357 ui32 t = 0xECu << i;
1358 new_sig |= t & inv_sig;
1359 }
1360 cwd >>= 1; ++cnt;
1361 }
1362
1363 sample_mask <<= 1;
1364 if (new_sig & sample_mask)
1365 {
1366 new_sig &= ~sample_mask;
1367 if (cwd & 1)
1368 {
1369 ui32 t = 0xC8u << i;
1370 new_sig |= t & inv_sig;
1371 }
1372 cwd >>= 1; ++cnt;
1373 }
1374 }
1375
1376 if (new_sig)
1377 {
1378 // the sign bits sit right after the cnt consumed bits
1379 // of cwd; no reassembly is needed
1380
1381 // Spread new_sig, such that each bit is in one byte with a
1382 // value of 0 if new_sig bit is 0, and 0xFF if new_sig is 1
1383 __m128i new_sig_vec = _mm_set1_epi16((si16)new_sig);
1384 new_sig_vec = _mm_shuffle_epi8(new_sig_vec,
1385 _mm_set_epi8(1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0));
1386 new_sig_vec = _mm_and_si128(new_sig_vec,
1387 _mm_set1_epi64x((si64)0x8040201008040201));
1388 new_sig_vec = _mm_cmpeq_epi8(new_sig_vec,
1389 _mm_set1_epi64x((si64)0x8040201008040201));
1390
1391 // find cumulative sums
1392 // to find which bit in cwd we should extract
1393 __m128i inc_sum = new_sig_vec; // inclusive scan
1394 inc_sum = _mm_abs_epi8(inc_sum); // cvrt to 0 or 1
1395 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 1));
1396 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 2));
1397 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 4));
1398 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 8));
1399 cnt += (ui32)_mm_extract_epi16(inc_sum, 7) >> 8;
1400 // exclusive scan
1401 __m128i ex_sum = _mm_bslli_si128(inc_sum, 1);
1402
1403 // Spread cwd, such that each bit is in one byte
1404 // with a value of 0 or 1.
1405 __m128i cwd_vec = _mm_set1_epi16((si16)cwd);
1406 cwd_vec = _mm_shuffle_epi8(cwd_vec,
1407 _mm_set_epi8(1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0));
1408 cwd_vec = _mm_and_si128(cwd_vec,
1409 _mm_set1_epi64x((si64)0x8040201008040201));
1410 cwd_vec = _mm_cmpeq_epi8(cwd_vec,
1411 _mm_set1_epi64x((si64)0x8040201008040201));
1412 cwd_vec = _mm_abs_epi8(cwd_vec);
1413
1414 // Obtain bit from cwd_vec correspondig to ex_sum
1415 // Basically, collect needed bits from cwd_vec
1416 __m128i v = _mm_shuffle_epi8(cwd_vec, ex_sum);
1417
1418 // load data and set spp coefficients
1419 __m128i m =
1420 _mm_set_epi8(-1,-1,-1,12,-1,-1,-1,8,-1,-1,-1,4,-1,-1,-1,0);
1421 __m128i val = _mm_set1_epi32(3 << (p - 2));
1422 ui32 *dp = dpp;
1423 for (int c = 0; c < 4; ++ c) {
1424 __m128i s0, s0_ns, s0_val;
1425 // load coefficients
1426 s0 = _mm_load_si128((__m128i*)dp);
1427
1428 // epi32 is -1 only for coefficient that
1429 // are changed during the SPP
1430 s0_ns = _mm_shuffle_epi8(new_sig_vec, m);
1431 s0_ns = _mm_cmpeq_epi32(s0_ns, _mm_set1_epi32(0xFF));
1432
1433 // obtain sign for coefficients in SPP
1434 s0_val = _mm_shuffle_epi8(v, m);
1435 s0_val = _mm_slli_epi32(s0_val, 31);
1436 s0_val = _mm_or_si128(s0_val, val);
1437 s0_val = _mm_and_si128(s0_val, s0_ns);
1438
1439 // update vector
1440 s0 = _mm_or_si128(s0, s0_val);
1441 // store coefficients
1442 _mm_store_si128((__m128i*)dp, s0);
1443 // prepare for next row
1444 dp += stride;
1445 m = _mm_add_epi32(m, _mm_set1_epi32(1));
1446 }
1447 }
1448 spp_pos += cnt;
1449 }
1450
1451 new_sig |= cs;
1452 *prev_sig = (ui16)(new_sig);
1453
1454 // vertical integration for the new sig. info.
1455 t = new_sig;
1456 new_sig |= (t & 0x7777) << 1; //above neighbors
1457 new_sig |= (t & 0xEEEE) >> 1; //below neighbors
1458 // add sig. info. from the row on top and below
1459 prev = new_sig | u;
1460 // we need only the bits in 0xF000
1461 prev &= 0xF000;
1462 }
1463 }
1464 }
1465
1466 // We perform Magnitude Refinement Pass here
1467 if (num_passes > 2)
1468 {
1469 // de-stuff the MRP segment; consumption is at most 1 bit per
1470 // sample = 512 bytes, so 1024 bytes always suffice
1471 const ui32 mrp_cap = 1024;
1472 ui8 mrp_buf[mrp_cap + 72];
1473 ui32 mrp_limit = destuff_mrp(coded_data, (int)lengths1,
1474 (int)lengths2, mrp_buf, mrp_cap);
1475 ui32 mrp_pos = 0;
1476
1477 for (ui32 y = 0; y < height; y += 4)
1478 {
1479 ui16 *cur_sig = sigma + (y >> 2) * mstr;
1480 ui32 *dpp = decoded_data + y * stride;
1481 for (ui32 i = 0; i < width; i += 4, dpp += 4)
1482 {
1483 //Process one entry from sigma array at a time
1484 // Each nibble (4 bits) in the sigma array represents 4 rows,
1485 ui64 cwd = dfetch64(mrp_buf, mrp_limit, mrp_pos);
1486 ui16 sig = *cur_sig++; // 16 bit that will be processed now
1487 int total_bits = 0;
1488 if (sig) // if any of the 32 bits are set
1489 {
1490 // We work on 4 rows, with 4 samples each, since
1491 // data is 32 bit (4 bytes)
1492
1493 // spread the 16 bits in sig to 0 or 1 bytes in sig_vec
1494 __m128i sig_vec = _mm_set1_epi16((si16)sig);
1495 sig_vec = _mm_shuffle_epi8(sig_vec,
1496 _mm_set_epi8(1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0));
1497 sig_vec = _mm_and_si128(sig_vec,
1498 _mm_set1_epi64x((si64)0x8040201008040201));
1499 sig_vec = _mm_cmpeq_epi8(sig_vec,
1500 _mm_set1_epi64x((si64)0x8040201008040201));
1501 sig_vec = _mm_abs_epi8(sig_vec);
1502
1503 // find cumulative sums
1504 // to find which bit in cwd we should extract
1505 __m128i inc_sum = sig_vec; // inclusive scan
1506 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 1));
1507 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 2));
1508 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 4));
1509 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 8));
1510 total_bits = _mm_extract_epi16(inc_sum, 7) >> 8;
1511 __m128i ex_sum = _mm_bslli_si128(inc_sum, 1); // exclusive scan
1512
1513 // Spread the 16 bits in cwd to inverted 0 or 1 bytes in
1514 // cwd_vec. Then, convert these to a form suitable
1515 // for coefficient modifications; in particular, a value
1516 // of 0 is presented as binary 11, and a value of 1 is
1517 // represented as binary 01
1518 __m128i cwd_vec = _mm_set1_epi16((si16)cwd);
1519 cwd_vec = _mm_shuffle_epi8(cwd_vec,
1520 _mm_set_epi8(1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0));
1521 cwd_vec = _mm_and_si128(cwd_vec,
1522 _mm_set1_epi64x((si64)0x8040201008040201));
1523 cwd_vec = _mm_cmpeq_epi8(cwd_vec,
1524 _mm_set1_epi64x((si64)0x8040201008040201));
1525 cwd_vec = _mm_add_epi8(cwd_vec, _mm_set1_epi8(1));
1526 cwd_vec = _mm_add_epi8(cwd_vec, cwd_vec);
1527 cwd_vec = _mm_or_si128(cwd_vec, _mm_set1_epi8(1));
1528
1529 // load data and insert the mrp bit
1530 __m128i m =
1531 _mm_set_epi8(-1,-1,-1,12,-1,-1,-1,8,-1,-1,-1,4,-1,-1,-1,0);
1532 ui32 *dp = dpp;
1533 for (int c = 0; c < 4; ++c) {
1534 __m128i s0, s0_sig, s0_idx, s0_val;
1535 // load coefficients
1536 s0 = _mm_load_si128((__m128i*)dp);
1537 // find significant samples in this row
1538 s0_sig = _mm_shuffle_epi8(sig_vec, m);
1539 s0_sig = _mm_cmpeq_epi8(s0_sig, _mm_setzero_si128());
1540 // get MRP bit index, and MRP pattern
1541 s0_idx = _mm_shuffle_epi8(ex_sum, m);
1542 s0_val = _mm_shuffle_epi8(cwd_vec, s0_idx);
1543 // keep data from significant samples only
1544 s0_val = _mm_andnot_si128(s0_sig, s0_val);
1545 // move mrp bits to correct position, and employ
1546 s0_val = _mm_slli_epi32(s0_val, (si32)p - 2);
1547 s0 = _mm_xor_si128(s0, s0_val);
1548 // store coefficients
1549 _mm_store_si128((__m128i*)dp, s0);
1550 // prepare for next row
1551 dp += stride;
1552 m = _mm_add_epi32(m, _mm_set1_epi32(1));
1553 }
1554 }
1555 // consume data according to the number of bits set
1556 mrp_pos += (ui32)total_bits;
1557 }
1558 }
1559 }
1560 }
1561
1562 //************************************************************************/
1569 void decode_cb_step1_vlc(ui16* scratch, ui8* coded_data, int lcup,
1570 int scup, ui32 width, ui32 height, ui32 sstr)
1571 {
1572 // init structures
1573 dec_mel_st mel;
1574 mel_init(&mel, coded_data, lcup, scup);
1575
1576 // de-stuff the VLC segment; its size is at most scup - 1 < 4095
1577 // bytes (scup is a 12-bit value), and consumption per quad pair is
1578 // at most 7 + 7 + 30 bits, so 4096 bytes always suffice
1579 const ui32 vlc_cap = 4096;
1580 ui8 vlc_buf[vlc_cap + 72];
1581 ui32 vlc_limit = destuff_vlc(coded_data, lcup, scup,
1582 vlc_buf, vlc_cap);
1583 ui32 vlc_off = 0;
1584 ui32 vlc_bits = 0;
1585
1586 int run = mel_get_run(&mel); // decode runs of events from MEL bitstrm
1587 // data represented as runs of 0 events
1588 // See mel_decode description
1589
1590 ui64 vlc_val = 0;
1591 ui32 c_q = 0;
1592 ui16 *sp = scratch;
1593 //initial quad row
1594 for (ui32 x = 0; x < width; sp += 4)
1595 {
1596 // decode VLC
1598
1599 // first quad
1600 drefill(vlc_val, vlc_bits, vlc_off, vlc_buf, vlc_limit);
1601
1602 //decode VLC using the context c_q and the head of VLC bitstream
1603 ui16 t0 = vlc_tbl0[ c_q + (vlc_val & 0x7F) ];
1604
1605 // if context is zero, use one MEL event
1606 if (c_q == 0) //zero context
1607 {
1608 run -= 2; //subtract 2, since events number if multiplied by 2
1609
1610 // Is the run terminated in 1? if so, use decoded VLC code,
1611 // otherwise, discard decoded data, since we will decoded again
1612 // using a different context
1613 t0 = (run == -1) ? t0 : 0;
1614
1615 // is run -1 or -2? this means a run has been consumed
1616 if (run < 0)
1617 run = mel_get_run(&mel); // get another run
1618 }
1619 //run -= (c_q == 0) ? 2 : 0;
1620 //t0 = (c_q != 0 || run == -1) ? t0 : 0;
1621 //if (run < 0)
1622 // run = mel_get_run(&mel); // get another run
1623 sp[0] = t0;
1624 x += 2;
1625
1626 // prepare context for the next quad; eqn. 1 in ITU T.814
1627 c_q = ((t0 & 0x10U) << 3) | ((t0 & 0xE0U) << 2);
1628
1629 //remove data from vlc stream (0 bits are removed if vlc is not used)
1630 dconsume(vlc_val, vlc_bits, t0 & 0x7u);
1631
1632 //second quad
1633 ui16 t1 = 0;
1634
1635 //decode VLC using the context c_q and the head of VLC bitstream
1636 t1 = vlc_tbl0[c_q + (vlc_val & 0x7F)];
1637
1638 // if context is zero, use one MEL event
1639 if (c_q == 0 && x < width) //zero context
1640 {
1641 run -= 2; //subtract 2, since events number if multiplied by 2
1642
1643 // if event is 0, discard decoded t1
1644 t1 = (run == -1) ? t1 : 0;
1645
1646 if (run < 0) // have we consumed all events in a run
1647 run = mel_get_run(&mel); // if yes, then get another run
1648 }
1649 t1 = x < width ? t1 : 0;
1650 //run -= (c_q == 0 && x < width) ? 2 : 0;
1651 //t1 = (c_q != 0 || run == -1) ? t1 : 0;
1652 //if (run < 0)
1653 // run = mel_get_run(&mel); // get another run
1654 sp[2] = t1;
1655 x += 2;
1656
1657 //prepare context for the next quad, eqn. 1 in ITU T.814
1658 c_q = ((t1 & 0x10U) << 3) | ((t1 & 0xE0U) << 2);
1659
1660 //remove data from vlc stream, if qinf is not used, cwdlen is 0
1661 dconsume(vlc_val, vlc_bits, t1 & 0x7u);
1662
1663 // decode u
1665 // uvlc_mode is made up of u_offset bits from the quad pair
1666 ui32 uvlc_mode = ((t0 & 0x8U) << 3) | ((t1 & 0x8U) << 4);
1667 if (uvlc_mode == 0xc0)// if both u_offset are set, get an event from
1668 { // the MEL run of events
1669 run -= 2; //subtract 2, since events number if multiplied by 2
1670
1671 uvlc_mode += (run == -1) ? 0x40 : 0; // increment uvlc_mode by
1672 // is 0x40
1673
1674 if (run < 0)//if run is consumed (run is -1 or -2), get another run
1675 run = mel_get_run(&mel);
1676 }
1677 //run -= (uvlc_mode == 0xc0) ? 2 : 0;
1678 //uvlc_mode += (uvlc_mode == 0xc0 && run == -1) ? 0x40 : 0;
1679 //if (run < 0)
1680 // run = mel_get_run(&mel); // get another run
1681
1682 //decode uvlc_mode to get u for both quads
1683 ui32 uvlc_entry = uvlc_tbl0[uvlc_mode + (vlc_val & 0x3F)];
1684 //remove total prefix length
1685 dconsume(vlc_val, vlc_bits, uvlc_entry & 0x7u);
1686 uvlc_entry >>= 3;
1687 //extract suffixes for quad 0 and 1
1688 ui32 len = uvlc_entry & 0xF; //suffix length for 2 quads
1689 ui32 tmp = (ui32)vlc_val & ((1u << len) - 1); //suffix value, 2 quads
1690 dconsume(vlc_val, vlc_bits, len);
1691 uvlc_entry >>= 4;
1692 // quad 0 length
1693 len = uvlc_entry & 0x7; // quad 0 suffix length
1694 uvlc_entry >>= 3;
1695 ui16 u_q = (ui16)(1 + (uvlc_entry&7) + (tmp&~(0xFFU<<len))); //kap. 1
1696 sp[1] = u_q;
1697 u_q = (ui16)(1 + (uvlc_entry >> 3) + (tmp >> len)); //kappa == 1
1698 sp[3] = u_q;
1699 }
1700 sp[0] = sp[1] = 0;
1701
1702 //non initial quad rows
1703 for (ui32 y = 2; y < height; y += 2)
1704 {
1705 c_q = 0; // context
1706 ui16 *sp = scratch + (y >> 1) * sstr; // this row of quads
1707
1708 for (ui32 x = 0; x < width; sp += 4)
1709 {
1710 // decode VLC
1712
1713 // sigma_q (n, ne, nf)
1714 c_q |= ((sp[0 - (si32)sstr] & 0xA0U) << 2);
1715 c_q |= ((sp[2 - (si32)sstr] & 0x20U) << 4);
1716
1717 // first quad
1718 drefill(vlc_val, vlc_bits, vlc_off, vlc_buf, vlc_limit);
1719
1720 //decode VLC using the context c_q and the head of VLC bitstream
1721 ui16 t0 = vlc_tbl1[ c_q + (vlc_val & 0x7F) ];
1722
1723 // if context is zero, use one MEL event
1724 if (c_q == 0) //zero context
1725 {
1726 run -= 2; //subtract 2, since events number is multiplied by 2
1727
1728 // Is the run terminated in 1? if so, use decoded VLC code,
1729 // otherwise, discard decoded data, since we will decoded again
1730 // using a different context
1731 t0 = (run == -1) ? t0 : 0;
1732
1733 // is run -1 or -2? this means a run has been consumed
1734 if (run < 0)
1735 run = mel_get_run(&mel); // get another run
1736 }
1737 //run -= (c_q == 0) ? 2 : 0;
1738 //t0 = (c_q != 0 || run == -1) ? t0 : 0;
1739 //if (run < 0)
1740 // run = mel_get_run(&mel); // get another run
1741 sp[0] = t0;
1742 x += 2;
1743
1744 // prepare context for the next quad; eqn. 2 in ITU T.814
1745 // sigma_q (w, sw)
1746 c_q = ((t0 & 0x40U) << 2) | ((t0 & 0x80U) << 1);
1747 // sigma_q (nw)
1748 c_q |= sp[0 - (si32)sstr] & 0x80;
1749 // sigma_q (n, ne, nf)
1750 c_q |= ((sp[2 - (si32)sstr] & 0xA0U) << 2);
1751 c_q |= ((sp[4 - (si32)sstr] & 0x20U) << 4);
1752
1753 //remove data from vlc stream (0 bits are removed if vlc is unused)
1754 dconsume(vlc_val, vlc_bits, t0 & 0x7u);
1755
1756 //second quad
1757 ui16 t1 = 0;
1758
1759 //decode VLC using the context c_q and the head of VLC bitstream
1760 t1 = vlc_tbl1[ c_q + (vlc_val & 0x7F)];
1761
1762 // if context is zero, use one MEL event
1763 if (c_q == 0 && x < width) //zero context
1764 {
1765 run -= 2; //subtract 2, since events number if multiplied by 2
1766
1767 // if event is 0, discard decoded t1
1768 t1 = (run == -1) ? t1 : 0;
1769
1770 if (run < 0) // have we consumed all events in a run
1771 run = mel_get_run(&mel); // if yes, then get another run
1772 }
1773 t1 = x < width ? t1 : 0;
1774 //run -= (c_q == 0 && x < width) ? 2 : 0;
1775 //t1 = (c_q != 0 || run == -1) ? t1 : 0;
1776 //if (run < 0)
1777 // run = mel_get_run(&mel); // get another run
1778 sp[2] = t1;
1779 x += 2;
1780
1781 // partial c_q, will be completed when we process the next quad
1782 // sigma_q (w, sw)
1783 c_q = ((t1 & 0x40U) << 2) | ((t1 & 0x80U) << 1);
1784 // sigma_q (nw)
1785 c_q |= sp[2 - (si32)sstr] & 0x80;
1786
1787 //remove data from vlc stream, if qinf is not used, cwdlen is 0
1788 dconsume(vlc_val, vlc_bits, t1 & 0x7u);
1789
1790 // decode u using wide UVLC table
1792 ui32 uvlc_mode = (((t0 >> 3) & 1) | (((t1 >> 3) & 1) << 1));
1793 ui32 uvlc_entry =
1794 uvlc_tbl1_wide[(uvlc_mode << 10) | (vlc_val & 0x3FF)];
1795 ui32 total_bits = uvlc_entry & 0x1F;
1796 if (total_bits < 0x1F) {
1797 sp[1] = (ui16)((uvlc_entry >> 5) & 0xFF);
1798 sp[3] = (ui16)((uvlc_entry >> 13) & 0xFF);
1799 dconsume(vlc_val, vlc_bits, total_bits);
1800 } else {
1801 uvlc_mode = ((t0 & 0x8U) << 3) | ((t1 & 0x8U) << 4);
1802 uvlc_entry = uvlc_tbl1[uvlc_mode + (vlc_val & 0x3F)];
1803 dconsume(vlc_val, vlc_bits, uvlc_entry & 0x7u);
1804 uvlc_entry >>= 3;
1805 ui32 len = uvlc_entry & 0xF;
1806 ui32 tmp = (ui32)vlc_val & ((1u << len) - 1);
1807 dconsume(vlc_val, vlc_bits, len);
1808 uvlc_entry >>= 4;
1809 len = uvlc_entry & 0x7;
1810 uvlc_entry >>= 3;
1811 sp[1] = (ui16)((uvlc_entry & 7) + (tmp & ~(0xFFU << len)));
1812 sp[3] = (ui16)((uvlc_entry >> 3) + (tmp >> len));
1813 }
1814 }
1815 sp[0] = sp[1] = 0;
1816 }
1817 }
1818
1819 bool ojph_decode_codeblock_avx2(ui8* coded_data, ui32* decoded_data,
1820 ui32 missing_msbs, ui32 num_passes,
1821 ui32 lengths1, ui32 lengths2,
1822 ui32 width, ui32 height, ui32 stride,
1823 bool stripe_causal)
1824 {
1825 static bool insufficient_precision = false;
1826 static bool modify_code = false;
1827 static bool truncate_spp_mrp = false;
1828
1829 if (num_passes > 1 && lengths2 == 0)
1830 {
1831 OJPH_WARN(0x00010001, "A malformed codeblock that has more than "
1832 "one coding pass, but zero length for "
1833 "2nd and potential 3rd pass.");
1834 num_passes = 1;
1835 }
1836
1837 if (num_passes > 3)
1838 {
1839 OJPH_WARN(0x00010002, "We do not support more than 3 coding passes; "
1840 "This codeblocks has %d passes.",
1841 num_passes);
1842 return false;
1843 }
1844
1845 if (missing_msbs > 30) // p < 0
1846 {
1847 if (insufficient_precision == false)
1848 {
1849 insufficient_precision = true;
1850 OJPH_WARN(0x00010003, "32 bits are not enough to decode this "
1851 "codeblock. This message will not be "
1852 "displayed again.");
1853 }
1854 return false;
1855 }
1856 else if (missing_msbs == 30) // p == 0
1857 { // not enough precision to decode and set the bin center to 1
1858 if (modify_code == false) {
1859 modify_code = true;
1860 OJPH_WARN(0x00010004, "Not enough precision to decode the cleanup "
1861 "pass. The code can be modified to support "
1862 "this case. This message will not be "
1863 "displayed again.");
1864 }
1865 return false; // 32 bits are not enough to decode this
1866 }
1867 else if (missing_msbs == 29) // if p is 1, then num_passes must be 1
1868 {
1869 if (num_passes > 1) {
1870 num_passes = 1;
1871 if (truncate_spp_mrp == false) {
1872 truncate_spp_mrp = true;
1873 OJPH_WARN(0x00010005, "Not enough precision to decode the SgnProp "
1874 "nor MagRef passes; both will be skipped. "
1875 "This message will not be displayed "
1876 "again.");
1877 }
1878 }
1879 }
1880 ui32 p = 30 - missing_msbs; // The least significant bitplane for CUP
1881 // There is a way to handle the case of p == 0, but a different path
1882 // is required
1883
1884 if (lengths1 < 2)
1885 {
1886 OJPH_WARN(0x00010006, "Wrong codeblock length.");
1887 return false;
1888 }
1889
1890 // read scup and fix the bytes there
1891 int lcup, scup;
1892 lcup = (int)lengths1; // length of CUP
1893 //scup is the length of MEL + VLC
1894 scup = (((int)coded_data[lcup-1]) << 4) + (coded_data[lcup-2] & 0xF);
1895 if (scup < 2 || scup > lcup || scup > 4079) //something is wrong
1896 return false;
1897
1898 // The temporary storage scratch holds two types of data in an
1899 // interleaved fashion. The interleaving allows us to use one
1900 // memory pointer.
1901 // We have one entry for a decoded VLC code, and one entry for UVLC.
1902 // Entries are 16 bits each, corresponding to one quad,
1903 // but since we want to use XMM registers of the SSE family
1904 // of SIMD; we allocated 16 bytes or more per quad row; that is,
1905 // the width is no smaller than 16 bytes (or 8 entries), and the
1906 // height is 512 quads
1907 // Each VLC entry contains, in the following order, starting
1908 // from MSB
1909 // e_k (4bits), e_1 (4bits), rho (4bits), useless for step 2 (4bits)
1910 // Each entry in UVLC contains u_q
1911 // One extra row to handle the case of SPP propagating downwards
1912 // when codeblock width is 4
1913 // We need an extra two entries (one inf and one u_q) beyond
1914 // the last column.
1915 // If the block width is 4 (2 quads), then we use sstr of 8
1916 // (enough for 4 quads). If width is 8 (4 quads) we use
1917 // sstr is 16 (enough for 8 quads). For a width of 16 (8
1918 // quads), we use 24 (enough for 12 quads).
1919 ui32 sstr = ((width + 2u) + 7u) & ~7u; // multiples of 8
1920
1921#ifdef __MINGW64__
1922 ui16 scratch[8 * 513] = {0};
1923#else
1924 ui16 scratch[8 * 513];
1925 ui32 quad_rows = (height + 1u) >> 1;
1926 size_t scratch_zero = (size_t)(quad_rows + 1) * sstr;
1927 if (scratch_zero > 8 * 513) scratch_zero = 8 * 513;
1928 memset(scratch, 0, scratch_zero * sizeof(ui16));
1929#endif
1930
1931 assert((stride & 0x3) == 0);
1932
1933 ui32 mmsbp2 = missing_msbs + 2;
1934
1935 // The cleanup pass is decoded in two steps; in step one,
1936 // the VLC and MEL segments are decoded, generating a record that
1937 // has 2 bytes per quad. The 2 bytes contain, u, rho, e^1 & e^k.
1938 // This information should be sufficient for the next step.
1939 // In step 2, we decode the MagSgn segment.
1940
1941 // step 1: decode VLC and MEL segments into scratch
1942 decode_cb_step1_vlc(scratch, coded_data, lcup, scup, width, height, sstr);
1943
1944 // step2 we decode magsgn
1945 // mmsbp2 equals K_max + 1 (we decode up to K_max bits + 1 sign bit)
1946 // The 32 bit path decode 16 bits data, for which one would think
1947 // 16 bits are enough, because we want to put in the center of the
1948 // bin.
1949 // If you have mmsbp2 equals 16 bit, and reversible coding, and
1950 // no bitplanes are missing, then we can decoding using the 16 bit
1951 // path, but we are not doing this here.
1952 if (mmsbp2 >= 16)
1953 {
1954 if (!decode_cb_step2_32bit(scratch, decoded_data, coded_data,
1955 width, height, stride, sstr, p, mmsbp2,
1956 lcup, scup))
1957 return false;
1958 }
1959 else {
1960 if (!decode_cb_step2_16bit(scratch, decoded_data, coded_data,
1961 width, height, stride, sstr, p, mmsbp2,
1962 lcup, scup))
1963 return false;
1964 }
1965
1966 if (num_passes > 1)
1967 decode_cb_spp_mrp(scratch, decoded_data, coded_data, width, height,
1968 stride, sstr, p, num_passes, lengths1, lengths2,
1969 stripe_causal);
1970
1971 return true;
1972 }
1973 }
1974}
1975
1976#endif
ui32 uvlc_tbl1_wide[4096]
uvlc_tbl1_wide: wider UVLC table for non-initial rows. Index = mode(2 bits) * 1024 + vlc_data(10 bits...
ui16 uvlc_tbl0[256+64]
uvlc_tbl0 contains decoding information for initial row of quads
ui16 uvlc_tbl1[256]
uvlc_tbl1 contains decoding information for non-initial row of quads
ui16 vlc_tbl1[1024]
vlc_tbl1 contains decoding information for non-initial row of quads
ui16 vlc_tbl0[1024]
vlc_tbl0 contains decoding information for initial row of quads
static void mel_read(dec_mel_st *melp)
Reads and unstuffs the MEL bitstream.
static int mel_get_run(dec_mel_st *melp)
Retrieves one run from dec_mel_st; if there are no runs stored MEL segment is decoded.
static void mel_init(dec_mel_st *melp, ui8 *bbuf, int lcup, int scup)
Initiates a dec_mel_st structure for MEL decoding and reads some bytes in order to get the read addre...
bool ojph_decode_codeblock_avx2(ui8 *coded_data, ui32 *decoded_data, ui32 missing_msbs, ui32 num_passes, ui32 lengths1, ui32 lengths2, ui32 width, ui32 height, ui32 stride, bool stripe_causal)
static void mel_decode(dec_mel_st *melp)
Decodes unstuffed MEL segment bits stored in tmp to runs.
int64_t si64
Definition ojph_defs.h:57
uint64_t ui64
Definition ojph_defs.h:56
uint16_t ui16
Definition ojph_defs.h:52
int32_t si32
Definition ojph_defs.h:55
int16_t si16
Definition ojph_defs.h:53
uint32_t ui32
Definition ojph_defs.h:54
uint8_t ui8
Definition ojph_defs.h:50
#define OJPH_NO_INLINE
Definition ojph_arch.h:75
#define OJPH_FORCE_INLINE
Definition ojph_arch.h:74
#define ojph_max(a, b)
Definition ojph_defs.h:73
#define OJPH_WARN(t,...)
MEL state structure for reading and decoding the MEL bitstream.
bool unstuff
true if the next bit needs to be unstuffed
int num_runs
number of decoded runs left in runs (maximum 8)
int size
number of bytes in MEL code
ui8 * data
the address of data (or bitstream)
int k
state of MEL decoder
int bits
number of bits stored in tmp
ui64 tmp
temporary buffer for read data
ui64 runs
runs of decoded MEL codewords (7 bits/run)