44#if defined(OJPH_ARCH_I386) || defined(OJPH_ARCH_X86_64)
102 ui32 val = 0xFFFFFFFF;
103 if (melp->size > 4) {
104 memcpy(&val, melp->data, 4);
108 else if (melp->size > 0)
111 while (melp->size > 1) {
112 ui32 v = *melp->data++;
113 ui32 m = ~(0xFFu << i);
114 val = (val & m) | (v << i);
119 ui32 v = *melp->data++;
121 ui32 m = ~(0xFFu << i);
122 val = (val & m) | (v << i);
127 int bits = 32 - melp->unstuff;
134 bool unstuff = ((val & 0xFF) == 0xFF);
136 t = t << (8 - unstuff);
139 t |= (val>>8) & 0xFF;
140 unstuff = (((val >> 8) & 0xFF) == 0xFF);
142 t = t << (8 - unstuff);
144 t |= (val>>16) & 0xFF;
145 unstuff = (((val >> 16) & 0xFF) == 0xFF);
147 t = t << (8 - unstuff);
149 t |= (val>>24) & 0xFF;
150 melp->unstuff = (((val >> 24) & 0xFF) == 0xFF);
154 melp->tmp |= ((
ui64)t) << (64 - bits - melp->bits);
176 static const int mel_exp[13] = {
177 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 5
186 while (melp->bits >= 6 && melp->num_runs < 8)
188 int eval = mel_exp[melp->k];
190 if (melp->tmp & (1ull<<63))
194 melp->k = melp->k + 1 < 12 ? melp->k + 1 : 12;
201 run = (int)(melp->tmp >> (63 - eval)) & ((1 << eval) - 1);
202 melp->k = melp->k - 1 > 0 ? melp->k - 1 : 0;
203 melp->tmp <<= eval + 1;
204 melp->bits -= eval + 1;
205 run = (run << 1) + 1;
207 eval = melp->num_runs * 7;
208 melp->runs &= ~((
ui64)0x3F << eval);
209 melp->runs |= ((
ui64)run) << eval;
227 melp->data = bbuf + lcup - scup;
230 melp->unstuff =
false;
231 melp->size = scup - 1;
239 int num = 4 - (int)(intptr_t(melp->data) & 0x3);
240 for (
int i = 0; i < num; ++i) {
241 assert(melp->unstuff ==
false || melp->data[0] <= 0x8F);
242 ui64 d = (melp->size > 0) ? *melp->data : 0xFF;
244 if (melp->size == 1) d |= 0xF;
246 melp->data += melp->size-- > 0;
247 int d_bits = 8 - melp->unstuff;
248 melp->tmp = (melp->tmp << d_bits) | d;
249 melp->bits += d_bits;
250 melp->unstuff = ((d & 0xFF) == 0xFF);
253 melp->tmp <<= (64 - melp->bits);
266 if (melp->num_runs == 0)
269 int t = melp->runs & 0x7F;
309 ui8* o_end = dst + cap;
311 const ui8* s_end = src + size;
314 bool prev_ff =
false;
317 while (s + 16 <= s_end && o + 24 <= o_end)
319 __m128i v = _mm_loadu_si128((
const __m128i*)s);
320 int ff = _mm_movemask_epi8(_mm_cmpeq_epi8(v, _mm_set1_epi8(-1)));
321 if (ff != 0 || prev_ff)
323 for (
int i = 0; i < 16; ++i) {
325 acc |= (
ui64)b << nb;
326 nb += prev_ff ? 7u : 8u;
327 prev_ff = (b == 0xFFu);
328 if (nb >= 8) { *o++ = (
ui8)acc; acc >>= 8; nb -= 8; }
334 memcpy(&v1, s + 8, 8);
335 ui64 w0 = acc | (v0 << nb);
336 ui64 w1 = (v1 << nb) | (nb ? (v0 >> (64 - nb)) : 0);
338 memcpy(o + 8, &w1, 8);
339 acc = nb ? (v1 >> (64 - nb)) : 0;
344 while (s < s_end && o < o_end)
347 acc |= (
ui64)b << nb;
348 nb += prev_ff ? 7u : 8u;
349 prev_ff = (b == 0xFFu);
350 if (nb >= 8) { *o++ = (
ui8)acc; acc >>= 8; nb -= 8; }
353 ui32 fill = (X == 0xFF) ? (0xFFu << nb) : 0;
355 __m256i pad = _mm256_set1_epi8((
char)X);
356 _mm256_storeu_si256((__m256i*)(o + 1), pad);
357 _mm256_storeu_si256((__m256i*)(o + 33), pad);
358 return (
ui32)(o - dst) + 1;
376 __m128i dfetch(
const ui8* dbuf,
ui32 limit,
ui32 pos)
379 off = off < limit ? off : limit;
380 const ui8* p = dbuf + off;
381 __m128i v = _mm_loadu_si128((
const __m128i*)p);
382 __m128i w = _mm_loadu_si128((
const __m128i*)(p + 8));
383 int k = (int)(pos & 7);
384 __m128i r = _mm_srl_epi64(v, _mm_cvtsi32_si128(k));
385 __m128i c = _mm_sll_epi64(w, _mm_cvtsi32_si128(64 - k));
386 return _mm_or_si128(r, c);
404 off = off < limit ? off : limit;
406 memcpy(&v, dbuf + off, 8);
407 return v >> (pos & 7);
434 ui32 o = off < limit ? off : limit;
435 memcpy(&v, dbuf + o, 8);
437 off += (63 - bits) >> 3;
486 ui32 destuff_rev(
const ui8* p,
int size,
bool unstuff,
490 ui8* o_end = dst + cap;
495 if (size > 0 && o < o_end)
499 acc |= (
ui64)d << nb;
500 nb += 8 - ((unstuff && ((d & 0x7F) == 0x7F)) ? 1u : 0u);
502 if (nb >= 8) { *o++ = (
ui8)acc; acc >>= 8; nb -= 8; }
506 while (size >= 16 && o + 24 <= o_end)
508 __m128i v = _mm_loadu_si128((
const __m128i*)(p - 15));
509 __m128i nx = _mm_loadu_si128((
const __m128i*)(p - 14));
510 __m128i is7f = _mm_cmpeq_epi8(
511 _mm_and_si128(v, _mm_set1_epi8(0x7F)), _mm_set1_epi8(0x7F));
513 __m128i le8f = _mm_cmpeq_epi8(
514 _mm_subs_epu8(nx, _mm_set1_epi8((
char)0x8F)),
515 _mm_setzero_si128());
516 __m128i stuff = _mm_andnot_si128(le8f, is7f);
517 if (!_mm_testz_si128(stuff, stuff))
519 for (
int i = 0; i < 16; ++i) {
521 acc |= (
ui64)d << nb;
522 nb += 8 - ((unstuff && ((d & 0x7F) == 0x7F)) ? 1u : 0u);
524 if (nb >= 8) { *o++ = (
ui8)acc; acc >>= 8; nb -= 8; }
529 __m128i r = _mm_shuffle_epi8(v,
530 _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,
531 8, 9, 10, 11, 12, 13, 14, 15));
532#ifdef OJPH_ARCH_X86_64
533 ui64 v0 = (
ui64)_mm_cvtsi128_si64(r);
534 ui64 v1 = (
ui64)_mm_extract_epi64(r, 1);
536 ui64 v0 = (
ui32)_mm_cvtsi128_si32(r)
537 | ((
ui64)(
ui32)_mm_extract_epi32(r, 1) << 32);
538 ui64 v1 = (
ui32)_mm_extract_epi32(r, 2)
539 | ((
ui64)(
ui32)_mm_extract_epi32(r, 3) << 32);
541 ui64 w0 = acc | (v0 << nb);
542 ui64 w1 = (v1 << nb) | (nb ? (v0 >> (64 - nb)) : 0);
544 memcpy(o + 8, &w1, 8);
545 acc = nb ? (v1 >> (64 - nb)) : 0;
549 unstuff = p[1] > 0x8F;
552 while (size > 0 && o < o_end)
556 acc |= (
ui64)d << nb;
557 nb += 8 - ((unstuff && ((d & 0x7F) == 0x7F)) ? 1u : 0u);
559 if (nb >= 8) { *o++ = (
ui8)acc; acc >>= 8; nb -= 8; }
563 __m256i z = _mm256_setzero_si256();
564 _mm256_storeu_si256((__m256i*)(o + 1), z);
565 _mm256_storeu_si256((__m256i*)(o + 33), z);
566 return (
ui32)(o - dst) + 1;
586 ui32 destuff_vlc(
const ui8* data,
int lcup,
int scup,
589 const ui8* p = data + lcup - 2;
592 ui32 nb = 4 - ((acc & 7) == 7);
593 bool unstuff = (d | 0xF) > 0x8F;
594 return destuff_rev(p - 1, scup - 2, unstuff, acc, nb, dst, cap);
613 ui32 destuff_mrp(
const ui8* data,
int lcup,
int len2,
616 return destuff_rev(data + lcup + len2 - 1, len2,
true, 0, 0,
631 __m256i decode_two_quad32_avx2(__m256i inf_u_q, __m256i U_q,
633 ui32 p, __m128i& vn) {
634 __m256i row = _mm256_setzero_si256();
637 __m256i flags = _mm256_and_si256(inf_u_q, _mm256_set_epi32(0x8880, 0x4440, 0x2220, 0x1110, 0x8880, 0x4440, 0x2220, 0x1110));
638 __m256i insig = _mm256_cmpeq_epi32(flags, _mm256_setzero_si256());
640 if ((uint32_t)_mm256_movemask_epi8(insig) != (uint32_t)0xFFFFFFFF)
642 flags = _mm256_mullo_epi16(flags, _mm256_set_epi16(1, 1, 2, 2, 4, 4, 8, 8, 1, 1, 2, 2, 4, 4, 8, 8));
650 __m256i w0 = _mm256_srli_epi32(flags, 15);
651 m_n = _mm256_sub_epi32(U_q, w0);
652 m_n = _mm256_andnot_si256(insig, m_n);
656 __m256i inc_sum = m_n;
657 inc_sum = _mm256_add_epi32(inc_sum, _mm256_bslli_epi128(inc_sum, 4));
658 inc_sum = _mm256_add_epi32(inc_sum, _mm256_bslli_epi128(inc_sum, 8));
659 ui32 total_mn1 = (
ui32)_mm256_extract_epi16(inc_sum, 6);
660 ui32 total_mn2 = (
ui32)_mm256_extract_epi16(inc_sum, 14);
662 __m128i ms_vec0 = dfetch(dbuf, limit, pos);
663 __m128i ms_vec1 = dfetch(dbuf, limit, pos + total_mn1);
664 pos += total_mn1 + total_mn2;
666 __m256i ms_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ms_vec0), ms_vec1, 0x1);
668 __m256i ex_sum = _mm256_bslli_epi128(inc_sum, 4);
671 __m256i byte_idx = _mm256_srli_epi32(ex_sum, 3);
672 __m256i bit_idx = _mm256_and_si256(ex_sum, _mm256_set1_epi32(7));
673 byte_idx = _mm256_shuffle_epi8(byte_idx,
674 _mm256_set_epi32(0x0C0C0C0C, 0x08080808, 0x04040404, 0x00000000, 0x0C0C0C0C, 0x08080808, 0x04040404, 0x00000000));
675 byte_idx = _mm256_add_epi32(byte_idx, _mm256_set1_epi32(0x03020100));
676 __m256i d0 = _mm256_shuffle_epi8(ms_vec, byte_idx);
677 byte_idx = _mm256_add_epi32(byte_idx, _mm256_set1_epi32(0x01010101));
678 __m256i d1 = _mm256_shuffle_epi8(ms_vec, byte_idx);
681 bit_idx = _mm256_or_si256(bit_idx, _mm256_slli_epi32(bit_idx, 16));
683 __m128i a = _mm_set_epi8(1, 3, 7, 15, 31, 63, 127, -1, 1, 3, 7, 15, 31, 63, 127, -1);
684 __m256i aa = _mm256_inserti128_si256(_mm256_castsi128_si256(a), a, 0x1);
686 __m256i bit_shift = _mm256_shuffle_epi8(aa, bit_idx);
687 bit_shift = _mm256_add_epi16(bit_shift, _mm256_set1_epi16(0x0101));
688 d0 = _mm256_mullo_epi16(d0, bit_shift);
689 d0 = _mm256_srli_epi16(d0, 8);
690 d1 = _mm256_mullo_epi16(d1, bit_shift);
691 d1 = _mm256_and_si256(d1, _mm256_set1_epi32((
si32)0xFF00FF00));
692 d0 = _mm256_or_si256(d0, d1);
696 __m256i ones = _mm256_set1_epi32(1);
697 __m256i twos = _mm256_set1_epi32(2);
698 __m256i U_q_m1 = _mm256_sub_epi32(U_q, ones);
699 U_q_m1 = _mm256_and_si256(U_q_m1, _mm256_set_epi32(0, 0, 0, 0x1F, 0, 0, 0, 0x1F));
700 U_q_m1 = _mm256_shuffle_epi32(U_q_m1, 0);
701 w0 = _mm256_sub_epi32(twos, w0);
702 shift = _mm256_sllv_epi32(w0, U_q_m1);
703 ms_vec = _mm256_and_si256(d0, _mm256_sub_epi32(shift, ones));
706 w0 = _mm256_and_si256(flags, _mm256_set1_epi32(0x800));
707 w0 = _mm256_cmpeq_epi32(w0, _mm256_setzero_si256());
708 w0 = _mm256_andnot_si256(w0, shift);
709 ms_vec = _mm256_or_si256(ms_vec, w0);
710 w0 = _mm256_slli_epi32(ms_vec, 31);
711 ms_vec = _mm256_or_si256(ms_vec, ones);
712 __m256i tvn = ms_vec;
713 ms_vec = _mm256_add_epi32(ms_vec, twos);
714 ms_vec = _mm256_slli_epi32(ms_vec, (
si32)p - 1);
715 ms_vec = _mm256_or_si256(ms_vec, w0);
716 row = _mm256_andnot_si256(insig, ms_vec);
718 ms_vec = _mm256_andnot_si256(insig, tvn);
720 tvn = _mm256_shuffle_epi8(ms_vec, _mm256_set_epi32(-1, 0x0F0E0D0C, 0x07060504, -1, -1, -1, 0x0F0E0D0C, 0x07060504));
722 vn = _mm_or_si128(vn, _mm256_castsi256_si128(tvn));
723 vn = _mm_or_si128(vn, _mm256_extracti128_si256(tvn, 0x1));
741 __m256i decode_four_quad16(
const __m128i inf_u_q, __m128i U_q,
743 ui32 p, __m128i& vn) {
749 __m256i row = _mm256_setzero_si256();
750 __m128i ddd = _mm_shuffle_epi8(inf_u_q,
751 _mm_set_epi16(0x0d0c, 0x0d0c, 0x0908, 0x908, 0x0504, 0x0504, 0x0100, 0x0100));
752 w0 = _mm256_permutevar8x32_epi32(_mm256_castsi128_si256(ddd),
753 _mm256_setr_epi32(0, 0, 1, 1, 2, 2, 3, 3));
755 flags = _mm256_and_si256(w0,
756 _mm256_set_epi16((
si16)0x8880, 0x4440, 0x2220, 0x1110,
757 (
si16)0x8880, 0x4440, 0x2220, 0x1110,
758 (
si16)0x8880, 0x4440, 0x2220, 0x1110,
759 (
si16)0x8880, 0x4440, 0x2220, 0x1110));
760 insig = _mm256_cmpeq_epi16(flags, _mm256_setzero_si256());
761 if ((uint32_t)_mm256_movemask_epi8(insig) != (uint32_t)0xFFFFFFFF)
763 ddd = _mm_or_si128(_mm_bslli_si128(U_q, 2), U_q);
764 __m256i U_q_avx = _mm256_permutevar8x32_epi32(_mm256_castsi128_si256(ddd),
765 _mm256_setr_epi32(0, 0, 1, 1, 2, 2, 3, 3));
766 flags = _mm256_mullo_epi16(flags, _mm256_set_epi16(1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8));
774 w0 = _mm256_srli_epi16(flags, 15);
775 m_n = _mm256_sub_epi16(U_q_avx, w0);
776 m_n = _mm256_andnot_si256(insig, m_n);
780 __m256i inc_sum = m_n;
781 inc_sum = _mm256_add_epi16(inc_sum, _mm256_bslli_epi128(inc_sum, 2));
782 inc_sum = _mm256_add_epi16(inc_sum, _mm256_bslli_epi128(inc_sum, 4));
783 inc_sum = _mm256_add_epi16(inc_sum, _mm256_bslli_epi128(inc_sum, 8));
784 ui32 total_mn1 = (
ui32)_mm256_extract_epi16(inc_sum, 7);
785 ui32 total_mn2 = (
ui32)_mm256_extract_epi16(inc_sum, 15);
786 __m256i ex_sum = _mm256_bslli_epi128(inc_sum, 2);
788 __m128i ms_vec0 = dfetch(dbuf, limit, pos);
789 __m128i ms_vec1 = dfetch(dbuf, limit, pos + total_mn1);
790 pos += total_mn1 + total_mn2;
792 __m256i ms_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ms_vec0), ms_vec1, 0x1);
795 __m256i byte_idx = _mm256_srli_epi16(ex_sum, 3);
796 __m256i bit_idx = _mm256_and_si256(ex_sum, _mm256_set1_epi16(7));
797 byte_idx = _mm256_shuffle_epi8(byte_idx,
798 _mm256_set_epi16(0x0E0E, 0x0C0C, 0x0A0A, 0x0808,
799 0x0606, 0x0404, 0x0202, 0x0000, 0x0E0E, 0x0C0C, 0x0A0A, 0x0808,
800 0x0606, 0x0404, 0x0202, 0x0000));
801 byte_idx = _mm256_add_epi16(byte_idx, _mm256_set1_epi16(0x0100));
802 __m256i d0 = _mm256_shuffle_epi8(ms_vec, byte_idx);
803 byte_idx = _mm256_add_epi16(byte_idx, _mm256_set1_epi16(0x0101));
804 __m256i d1 = _mm256_shuffle_epi8(ms_vec, byte_idx);
807 __m256i bit_shift = _mm256_shuffle_epi8(
808 _mm256_set_epi8(1, 3, 7, 15, 31, 63, 127, -1,
809 1, 3, 7, 15, 31, 63, 127, -1, 1, 3, 7, 15, 31, 63, 127, -1,
810 1, 3, 7, 15, 31, 63, 127, -1), bit_idx);
811 bit_shift = _mm256_add_epi16(bit_shift, _mm256_set1_epi16(0x0101));
812 d0 = _mm256_mullo_epi16(d0, bit_shift);
813 d0 = _mm256_srli_epi16(d0, 8);
814 d1 = _mm256_mullo_epi16(d1, bit_shift);
815 d1 = _mm256_and_si256(d1, _mm256_set1_epi16((
si16)0xFF00));
816 d0 = _mm256_or_si256(d0, d1);
820 __m256i ones = _mm256_set1_epi16(1);
821 __m256i twos = _mm256_set1_epi16(2);
828 __m256i kq = _mm256_sub_epi16(U_q_avx, ones);
829 __m256i idx = _mm256_or_si256(kq,
830 _mm256_slli_epi16(_mm256_sub_epi16(kq,
831 _mm256_set1_epi16(8)), 8));
832 const __m256i pow2_tbl = _mm256_setr_epi8(
833 1, 2, 4, 8, 16, 32, 64, (
char)128, 0, 0, 0, 0, 0, 0, 0, 0,
834 1, 2, 4, 8, 16, 32, 64, (
char)128, 0, 0, 0, 0, 0, 0, 0, 0);
835 __m256i pow2 = _mm256_shuffle_epi8(pow2_tbl, idx);
836 w0 = _mm256_sub_epi16(twos, w0);
837 shift = _mm256_mullo_epi16(w0, pow2);
838 ms_vec = _mm256_and_si256(d0, _mm256_sub_epi16(shift, ones));
841 w0 = _mm256_and_si256(flags, _mm256_set1_epi16(0x800));
842 w0 = _mm256_cmpeq_epi16(w0, _mm256_setzero_si256());
843 w0 = _mm256_andnot_si256(w0, shift);
844 ms_vec = _mm256_or_si256(ms_vec, w0);
845 w0 = _mm256_slli_epi16(ms_vec, 15);
846 ms_vec = _mm256_or_si256(ms_vec, ones);
847 __m256i tvn = ms_vec;
848 ms_vec = _mm256_add_epi16(ms_vec, twos);
849 ms_vec = _mm256_slli_epi16(ms_vec, (
si32)p - 1);
850 ms_vec = _mm256_or_si256(ms_vec, w0);
851 row = _mm256_andnot_si256(insig, ms_vec);
853 ms_vec = _mm256_andnot_si256(insig, tvn);
855 __m256i ms_vec_shuffle1 = _mm256_shuffle_epi8(ms_vec,
856 _mm256_set_epi16(-1, -1, -1, -1, 0x0706, 0x0302, -1, -1,
857 -1, -1, -1, -1, -1, -1, 0x0706, 0x0302));
858 __m256i ms_vec_shuffle2 = _mm256_shuffle_epi8(ms_vec,
859 _mm256_set_epi16(-1, -1, -1, 0x0F0E, 0x0B0A, -1, -1, -1,
860 -1, -1, -1, -1, -1, 0x0F0E, 0x0B0A, -1));
861 ms_vec = _mm256_or_si256(ms_vec_shuffle1, ms_vec_shuffle2);
863 vn = _mm_or_si128(vn, _mm256_castsi256_si128(ms_vec));
864 vn = _mm_or_si128(vn, _mm256_extracti128_si256(ms_vec, 0x1));
870 inline __m256i avx2_lzcnt_epi32(__m256i v) {
872 v = _mm256_andnot_si256(_mm256_srli_epi32(v, 8), v);
874 v = _mm256_castps_si256(_mm256_cvtepi32_ps(v));
875 v = _mm256_srli_epi32(v, 23);
876 v = _mm256_subs_epu16(_mm256_set1_epi32(158), v);
877 v = _mm256_min_epi16(v, _mm256_set1_epi32(32));
909 bool decode_cb_step2_16bit(
ui16* scratch,
ui32* decoded_data,
917 const int v_n_size = 512 + 16;
919 ui16 v_n_scratch[v_n_size] = {0};
920 ui32 v_n_scratch_32[v_n_size] = {0};
922 ui16 v_n_scratch[v_n_size];
923 memset(v_n_scratch + (width >> 1) + 4, 0, 8 *
sizeof(
ui16));
924 ui32 v_n_scratch_32[v_n_size];
928 const ui32 dbuf_cap = 4096 * 15 / 8;
929 ui8 dbuf[dbuf_cap + 72];
930 ui32 limit = destuff_frwd<0xFF>(coded_data, lcup - scup, dbuf, dbuf_cap);
935 ui16 *vp = v_n_scratch;
936 ui32 *dp = decoded_data;
939 for (
ui32 x = 0; x < width; x += 8, sp += 8, vp += 4, dp += 8) {
941 __m128i inf_u_q = _mm_loadu_si128((__m128i*)sp);
942 __m128i U_q = _mm_srli_epi32(inf_u_q, 16);
943 __m128i w = _mm_cmpgt_epi32(U_q, _mm_set1_epi32((
int)mmsbp2));
944 if (!_mm_testz_si128(w, w)) {
948 __m128i vn = _mm_set1_epi16(2);
949 __m256i row = decode_four_quad16(inf_u_q, U_q, dbuf, limit, pos, p, vn);
951 w = _mm_cvtsi32_si128(*(
unsigned short const*)(vp));
952 _mm_storeu_si128((__m128i*)vp, _mm_or_si128(w, vn));
954 __m256i w0 = _mm256_shuffle_epi8(row, _mm256_set_epi16(0x0D0C, -1, 0x0908, -1, 0x0504, -1, 0x0100, -1, 0x0D0C, -1, 0x0908, -1, 0x0504, -1, 0x0100, -1));
955 __m256i w1 = _mm256_shuffle_epi8(row, _mm256_set_epi16(0x0F0E, -1, 0x0B0A, -1, 0x0706, -1, 0x0302, -1, 0x0F0E, -1, 0x0B0A, -1, 0x0706, -1, 0x0302, -1));
957 _mm256_storeu_si256((__m256i*)dp, w0);
958 _mm256_storeu_si256((__m256i*)(dp + stride), w1);
962 for (
ui32 y = 2; y < height; y += 2) {
965 ui16 *vp = v_n_scratch;
966 ui32 *vp_32 = v_n_scratch_32;
968 ui16* sp = scratch + (y >> 1) * sstr;
969 const __m256i avx_mmsbp2 = _mm256_set1_epi32((
int)mmsbp2);
970 const __m256i avx_31 = _mm256_set1_epi32(31);
971 const __m256i avx_f0 = _mm256_set1_epi32(0xF0);
972 const __m256i avx_1 = _mm256_set1_epi32(1);
973 const __m256i avx_0 = _mm256_setzero_si256();
975 for (
ui32 x = 0; x <= width; x += 16, vp += 8, sp += 16, vp_32 += 8) {
976 __m128i v = _mm_loadu_si128((__m128i*)vp);
977 __m128i v_p1 = _mm_loadu_si128((__m128i*)(vp + 1));
978 v = _mm_or_si128(v, v_p1);
980 __m256i v_avx = _mm256_cvtepu16_epi32(v);
981 v_avx = avx2_lzcnt_epi32(v_avx);
982 v_avx = _mm256_sub_epi32(avx_31, v_avx);
984 __m256i inf_u_q = _mm256_loadu_si256((__m256i*)sp);
985 __m256i gamma = _mm256_and_si256(inf_u_q, avx_f0);
986 __m256i w0 = _mm256_sub_epi32(gamma, avx_1);
987 gamma = _mm256_and_si256(gamma, w0);
988 gamma = _mm256_cmpeq_epi32(gamma, avx_0);
990 v_avx = _mm256_andnot_si256(gamma, v_avx);
991 v_avx = _mm256_max_epi32(v_avx, avx_1);
993 inf_u_q = _mm256_srli_epi32(inf_u_q, 16);
994 v_avx = _mm256_add_epi32(inf_u_q, v_avx);
996 w0 = _mm256_cmpgt_epi32(v_avx, avx_mmsbp2);
997 if (!_mm256_testz_si256(w0, w0)) {
1001 _mm256_storeu_si256((__m256i*)vp_32, v_avx);
1005 ui16 *vp = v_n_scratch;
1006 ui32* vp_32 = v_n_scratch_32;
1007 ui16 *sp = scratch + (y >> 1) * sstr;
1008 ui32 *dp = decoded_data + y * stride;
1011 for (
ui32 x = 0; x < width; x += 8, sp += 8, vp += 4, dp += 8, vp_32 += 4) {
1013 __m128i inf_u_q = _mm_loadu_si128((__m128i*)sp);
1014 __m128i U_q = _mm_loadu_si128((__m128i*)vp_32);
1016 __m128i vn = _mm_set1_epi16(2);
1017 __m256i row = decode_four_quad16(inf_u_q, U_q, dbuf, limit, pos, p, vn);
1019 __m128i w = _mm_cvtsi32_si128(*(
unsigned short const*)(vp));
1020 _mm_storeu_si128((__m128i*)vp, _mm_or_si128(w, vn));
1022 __m256i w0 = _mm256_shuffle_epi8(row, _mm256_set_epi16(0x0D0C, -1, 0x0908, -1, 0x0504, -1, 0x0100, -1, 0x0D0C, -1, 0x0908, -1, 0x0504, -1, 0x0100, -1));
1023 __m256i w1 = _mm256_shuffle_epi8(row, _mm256_set_epi16(0x0F0E, -1, 0x0B0A, -1, 0x0706, -1, 0x0302, -1, 0x0F0E, -1, 0x0B0A, -1, 0x0706, -1, 0x0302, -1));
1025 _mm256_storeu_si256((__m256i*)dp, w0);
1026 _mm256_storeu_si256((__m256i*)(dp + stride), w1);
1041 bool decode_cb_step2_32bit(
ui16* scratch,
ui32* decoded_data,
1046 const int v_n_size = 512 + 16;
1048 ui32 v_n_scratch[2 * v_n_size] = {0};
1050 ui32 v_n_scratch[2 * v_n_size];
1051 memset(v_n_scratch + (width >> 1) + 2, 0, 14 *
sizeof(
ui32));
1055 const ui32 dbuf_cap = 4096 * 32 / 8;
1056 ui8 dbuf[dbuf_cap + 72];
1057 ui32 limit = destuff_frwd<0xFF>(coded_data, lcup - scup, dbuf, dbuf_cap);
1060 const __m256i avx_mmsbp2 = _mm256_set1_epi32((
int)mmsbp2);
1064 ui32 *vp = v_n_scratch;
1065 ui32 *dp = decoded_data;
1068 for (
ui32 x = 0; x < width; x += 4, sp += 4, vp += 2, dp += 4)
1070 __m128i vn = _mm_set1_epi32(2);
1072 __m256i inf_u_q = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)sp));
1073 inf_u_q = _mm256_permutevar8x32_epi32(inf_u_q, _mm256_setr_epi32(0, 0, 0, 0, 1, 1, 1, 1));
1075 __m256i U_q = _mm256_srli_epi32(inf_u_q, 16);
1076 __m256i w = _mm256_cmpgt_epi32(U_q, avx_mmsbp2);
1077 if (!_mm256_testz_si256(w, w)) {
1081 __m256i row = decode_two_quad32_avx2(inf_u_q, U_q, dbuf, limit, pos, p, vn);
1082 row = _mm256_permutevar8x32_epi32(row, _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7));
1083 _mm_store_si128((__m128i*)dp, _mm256_castsi256_si128(row));
1084 _mm_store_si128((__m128i*)(dp + stride), _mm256_extracti128_si256(row, 0x1));
1086 __m128i w0 = _mm_cvtsi32_si128(*(
int const*)vp);
1087 w0 = _mm_or_si128(w0, vn);
1088 _mm_storeu_si128((__m128i*)vp, w0);
1092 for (
ui32 y = 2; y < height; y += 2)
1096 ui32 *vp = v_n_scratch;
1097 ui16* sp = scratch + (y >> 1) * sstr;
1099 const __m256i avx_31 = _mm256_set1_epi32(31);
1100 const __m256i avx_f0 = _mm256_set1_epi32(0xF0);
1101 const __m256i avx_1 = _mm256_set1_epi32(1);
1102 const __m256i avx_0 = _mm256_setzero_si256();
1104 for (
ui32 x = 0; x <= width; x += 16, vp += 8, sp += 16) {
1105 __m256i v = _mm256_loadu_si256((__m256i*)vp);
1106 __m256i v_p1 = _mm256_loadu_si256((__m256i*)(vp + 1));
1107 v = _mm256_or_si256(v, v_p1);
1108 v = avx2_lzcnt_epi32(v);
1109 v = _mm256_sub_epi32(avx_31, v);
1111 __m256i inf_u_q = _mm256_loadu_si256((__m256i*)sp);
1112 __m256i gamma = _mm256_and_si256(inf_u_q, avx_f0);
1113 __m256i w0 = _mm256_sub_epi32(gamma, avx_1);
1114 gamma = _mm256_and_si256(gamma, w0);
1115 gamma = _mm256_cmpeq_epi32(gamma, avx_0);
1117 v = _mm256_andnot_si256(gamma, v);
1118 v = _mm256_max_epi32(v, avx_1);
1120 inf_u_q = _mm256_srli_epi32(inf_u_q, 16);
1121 v = _mm256_add_epi32(inf_u_q, v);
1123 w0 = _mm256_cmpgt_epi32(v, avx_mmsbp2);
1124 if (!_mm256_testz_si256(w0, w0)) {
1128 _mm256_storeu_si256((__m256i*)(vp + v_n_size), v);
1132 ui32 *vp = v_n_scratch;
1133 ui16 *sp = scratch + (y >> 1) * sstr;
1134 ui32 *dp = decoded_data + y * stride;
1137 for (
ui32 x = 0; x < width; x += 4, sp += 4, vp += 2, dp += 4) {
1139 __m128i vn = _mm_set1_epi32(2);
1141 __m256i inf_u_q = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)sp));
1142 inf_u_q = _mm256_permutevar8x32_epi32(inf_u_q, _mm256_setr_epi32(0, 0, 0, 0, 1, 1, 1, 1));
1144 __m256i U_q = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)(vp + v_n_size)));
1145 U_q = _mm256_permutevar8x32_epi32(U_q, _mm256_setr_epi32(0, 0, 0, 0, 1, 1, 1, 1));
1147 __m256i row = decode_two_quad32_avx2(inf_u_q, U_q, dbuf, limit, pos, p, vn);
1148 row = _mm256_permutevar8x32_epi32(row, _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7));
1149 _mm_store_si128((__m128i*)dp, _mm256_castsi256_si128(row));
1150 _mm_store_si128((__m128i*)(dp + stride), _mm256_extracti128_si256(row, 0x1));
1152 __m128i w0 = _mm_cvtsi32_si128(*(
int const*)vp);
1153 w0 = _mm_or_si128(w0, vn);
1154 _mm_storeu_si128((__m128i*)vp, w0);
1168 void decode_cb_spp_mrp(
ui16* scratch,
ui32* decoded_data,
ui8* coded_data,
1171 ui32 lengths2,
bool stripe_causal)
1177 ui16*
const sigma = scratch;
1179 ui32 mstr = (width + 3u) >> 2;
1181 mstr = ((mstr + 2u) + 7u) & ~7u;
1189 const __m128i mask_3 = _mm_set1_epi32(0x30);
1190 const __m128i mask_C = _mm_set1_epi32(0xC0);
1191 const __m128i shuffle_mask = _mm_set_epi32(-1, -1, -1, 0x0C080400);
1192 for (y = 0; y < height; y += 4)
1194 ui16* sp = scratch + (y >> 1) * sstr;
1195 ui16* dp = sigma + (y >> 2) * mstr;
1196 for (
ui32 x = 0; x < width; x += 8, sp += 8, dp += 2)
1198 __m128i s0, s1, u3, uC, t0, t1;
1200 s0 = _mm_loadu_si128((__m128i*)(sp));
1201 u3 = _mm_and_si128(s0, mask_3);
1202 u3 = _mm_srli_epi32(u3, 4);
1203 uC = _mm_and_si128(s0, mask_C);
1204 uC = _mm_srli_epi32(uC, 2);
1205 t0 = _mm_or_si128(u3, uC);
1207 s1 = _mm_loadu_si128((__m128i*)(sp + sstr));
1208 u3 = _mm_and_si128(s1, mask_3);
1209 u3 = _mm_srli_epi32(u3, 2);
1210 uC = _mm_and_si128(s1, mask_C);
1211 t1 = _mm_or_si128(u3, uC);
1213 __m128i r = _mm_or_si128(t0, t1);
1214 r = _mm_shuffle_epi8(r, shuffle_mask);
1216 ui32 t = (
ui32)_mm_extract_epi32(r, 0);
1223 ui16* dp = sigma + (y >> 2) * mstr;
1224 __m128i zero = _mm_setzero_si128();
1225 for (
ui32 x = 0; x < width; x += 32, dp += 8)
1226 _mm_storeu_si128((__m128i*)dp, zero);
1242 ui16 prev_row_sig[256 + 8] = {0};
1246 const ui32 spp_cap = 4096 * 2 / 8;
1247 ui8 spp_buf[spp_cap + 72];
1248 ui32 spp_limit = destuff_frwd<0>(coded_data + lengths1,
1249 (
int)lengths2, spp_buf, spp_cap);
1252 for (
ui32 y = 0; y < height; y += 4)
1254 ui32 pattern = 0xFFFFu;
1255 if (height - y < 4) {
1257 if (height - y < 3) {
1267 ui16 *prev_sig = prev_row_sig;
1268 ui16 *cur_sig = sigma + (y >> 2) * mstr;
1269 ui32 *dpp = decoded_data + y * stride;
1270 for (
ui32 x = 0; x < width; x += 4, dpp += 4, ++cur_sig, ++prev_sig)
1275 pattern = pattern >> (s * 4);
1290 memcpy(&ps, prev_sig, 4);
1291 memcpy(&ns, cur_sig + mstr, 4);
1292 ui32 u = (ps & 0x88888888) >> 3;
1294 u |= (ns & 0x11111111) << 3;
1296 memcpy(&cs, cur_sig, 4);
1299 mbr |= (cs & 0x77777777) << 1;
1300 mbr |= (cs & 0xEEEEEEEE) >> 1;
1316 ui64 cwd = dfetch64(spp_buf, spp_limit, spp_pos);
1319 ui32 col_mask = 0xFu;
1320 ui32 inv_sig = ~cs & pattern;
1321 for (
int i = 0; i < 16; i += 4, col_mask <<= 4)
1323 if ((col_mask & new_sig) == 0)
1327 ui32 sample_mask = 0x1111u & col_mask;
1328 if (new_sig & sample_mask)
1330 new_sig &= ~sample_mask;
1333 ui32 t = 0x33u << i;
1334 new_sig |= t & inv_sig;
1340 if (new_sig & sample_mask)
1342 new_sig &= ~sample_mask;
1345 ui32 t = 0x76u << i;
1346 new_sig |= t & inv_sig;
1352 if (new_sig & sample_mask)
1354 new_sig &= ~sample_mask;
1357 ui32 t = 0xECu << i;
1358 new_sig |= t & inv_sig;
1364 if (new_sig & sample_mask)
1366 new_sig &= ~sample_mask;
1369 ui32 t = 0xC8u << i;
1370 new_sig |= t & inv_sig;
1383 __m128i new_sig_vec = _mm_set1_epi16((
si16)new_sig);
1384 new_sig_vec = _mm_shuffle_epi8(new_sig_vec,
1385 _mm_set_epi8(1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0));
1386 new_sig_vec = _mm_and_si128(new_sig_vec,
1387 _mm_set1_epi64x((
si64)0x8040201008040201));
1388 new_sig_vec = _mm_cmpeq_epi8(new_sig_vec,
1389 _mm_set1_epi64x((
si64)0x8040201008040201));
1393 __m128i inc_sum = new_sig_vec;
1394 inc_sum = _mm_abs_epi8(inc_sum);
1395 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 1));
1396 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 2));
1397 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 4));
1398 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 8));
1399 cnt += (
ui32)_mm_extract_epi16(inc_sum, 7) >> 8;
1401 __m128i ex_sum = _mm_bslli_si128(inc_sum, 1);
1405 __m128i cwd_vec = _mm_set1_epi16((
si16)cwd);
1406 cwd_vec = _mm_shuffle_epi8(cwd_vec,
1407 _mm_set_epi8(1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0));
1408 cwd_vec = _mm_and_si128(cwd_vec,
1409 _mm_set1_epi64x((
si64)0x8040201008040201));
1410 cwd_vec = _mm_cmpeq_epi8(cwd_vec,
1411 _mm_set1_epi64x((
si64)0x8040201008040201));
1412 cwd_vec = _mm_abs_epi8(cwd_vec);
1416 __m128i v = _mm_shuffle_epi8(cwd_vec, ex_sum);
1420 _mm_set_epi8(-1,-1,-1,12,-1,-1,-1,8,-1,-1,-1,4,-1,-1,-1,0);
1421 __m128i val = _mm_set1_epi32(3 << (p - 2));
1423 for (
int c = 0; c < 4; ++ c) {
1424 __m128i s0, s0_ns, s0_val;
1426 s0 = _mm_load_si128((__m128i*)dp);
1430 s0_ns = _mm_shuffle_epi8(new_sig_vec, m);
1431 s0_ns = _mm_cmpeq_epi32(s0_ns, _mm_set1_epi32(0xFF));
1434 s0_val = _mm_shuffle_epi8(v, m);
1435 s0_val = _mm_slli_epi32(s0_val, 31);
1436 s0_val = _mm_or_si128(s0_val, val);
1437 s0_val = _mm_and_si128(s0_val, s0_ns);
1440 s0 = _mm_or_si128(s0, s0_val);
1442 _mm_store_si128((__m128i*)dp, s0);
1445 m = _mm_add_epi32(m, _mm_set1_epi32(1));
1452 *prev_sig = (
ui16)(new_sig);
1456 new_sig |= (t & 0x7777) << 1;
1457 new_sig |= (t & 0xEEEE) >> 1;
1471 const ui32 mrp_cap = 1024;
1472 ui8 mrp_buf[mrp_cap + 72];
1473 ui32 mrp_limit = destuff_mrp(coded_data, (
int)lengths1,
1474 (
int)lengths2, mrp_buf, mrp_cap);
1477 for (
ui32 y = 0; y < height; y += 4)
1479 ui16 *cur_sig = sigma + (y >> 2) * mstr;
1480 ui32 *dpp = decoded_data + y * stride;
1481 for (
ui32 i = 0; i < width; i += 4, dpp += 4)
1485 ui64 cwd = dfetch64(mrp_buf, mrp_limit, mrp_pos);
1486 ui16 sig = *cur_sig++;
1494 __m128i sig_vec = _mm_set1_epi16((
si16)sig);
1495 sig_vec = _mm_shuffle_epi8(sig_vec,
1496 _mm_set_epi8(1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0));
1497 sig_vec = _mm_and_si128(sig_vec,
1498 _mm_set1_epi64x((
si64)0x8040201008040201));
1499 sig_vec = _mm_cmpeq_epi8(sig_vec,
1500 _mm_set1_epi64x((
si64)0x8040201008040201));
1501 sig_vec = _mm_abs_epi8(sig_vec);
1505 __m128i inc_sum = sig_vec;
1506 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 1));
1507 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 2));
1508 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 4));
1509 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 8));
1510 total_bits = _mm_extract_epi16(inc_sum, 7) >> 8;
1511 __m128i ex_sum = _mm_bslli_si128(inc_sum, 1);
1518 __m128i cwd_vec = _mm_set1_epi16((
si16)cwd);
1519 cwd_vec = _mm_shuffle_epi8(cwd_vec,
1520 _mm_set_epi8(1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0));
1521 cwd_vec = _mm_and_si128(cwd_vec,
1522 _mm_set1_epi64x((
si64)0x8040201008040201));
1523 cwd_vec = _mm_cmpeq_epi8(cwd_vec,
1524 _mm_set1_epi64x((
si64)0x8040201008040201));
1525 cwd_vec = _mm_add_epi8(cwd_vec, _mm_set1_epi8(1));
1526 cwd_vec = _mm_add_epi8(cwd_vec, cwd_vec);
1527 cwd_vec = _mm_or_si128(cwd_vec, _mm_set1_epi8(1));
1531 _mm_set_epi8(-1,-1,-1,12,-1,-1,-1,8,-1,-1,-1,4,-1,-1,-1,0);
1533 for (
int c = 0; c < 4; ++c) {
1534 __m128i s0, s0_sig, s0_idx, s0_val;
1536 s0 = _mm_load_si128((__m128i*)dp);
1538 s0_sig = _mm_shuffle_epi8(sig_vec, m);
1539 s0_sig = _mm_cmpeq_epi8(s0_sig, _mm_setzero_si128());
1541 s0_idx = _mm_shuffle_epi8(ex_sum, m);
1542 s0_val = _mm_shuffle_epi8(cwd_vec, s0_idx);
1544 s0_val = _mm_andnot_si128(s0_sig, s0_val);
1546 s0_val = _mm_slli_epi32(s0_val, (
si32)p - 2);
1547 s0 = _mm_xor_si128(s0, s0_val);
1549 _mm_store_si128((__m128i*)dp, s0);
1552 m = _mm_add_epi32(m, _mm_set1_epi32(1));
1556 mrp_pos += (
ui32)total_bits;
1569 void decode_cb_step1_vlc(
ui16* scratch,
ui8* coded_data,
int lcup,
1574 mel_init(&mel, coded_data, lcup, scup);
1579 const ui32 vlc_cap = 4096;
1580 ui8 vlc_buf[vlc_cap + 72];
1581 ui32 vlc_limit = destuff_vlc(coded_data, lcup, scup,
1594 for (
ui32 x = 0; x < width; sp += 4)
1600 drefill(vlc_val, vlc_bits, vlc_off, vlc_buf, vlc_limit);
1613 t0 = (run == -1) ? t0 : 0;
1627 c_q = ((t0 & 0x10U) << 3) | ((t0 & 0xE0U) << 2);
1630 dconsume(vlc_val, vlc_bits, t0 & 0x7u);
1636 t1 =
vlc_tbl0[c_q + (vlc_val & 0x7F)];
1639 if (c_q == 0 && x < width)
1644 t1 = (run == -1) ? t1 : 0;
1649 t1 = x < width ? t1 : 0;
1658 c_q = ((t1 & 0x10U) << 3) | ((t1 & 0xE0U) << 2);
1661 dconsume(vlc_val, vlc_bits, t1 & 0x7u);
1666 ui32 uvlc_mode = ((t0 & 0x8U) << 3) | ((t1 & 0x8U) << 4);
1667 if (uvlc_mode == 0xc0)
1671 uvlc_mode += (run == -1) ? 0x40 : 0;
1685 dconsume(vlc_val, vlc_bits, uvlc_entry & 0x7u);
1688 ui32 len = uvlc_entry & 0xF;
1689 ui32 tmp = (
ui32)vlc_val & ((1u << len) - 1);
1690 dconsume(vlc_val, vlc_bits, len);
1693 len = uvlc_entry & 0x7;
1695 ui16 u_q = (
ui16)(1 + (uvlc_entry&7) + (tmp&~(0xFFU<<len)));
1697 u_q = (
ui16)(1 + (uvlc_entry >> 3) + (tmp >> len));
1703 for (
ui32 y = 2; y < height; y += 2)
1706 ui16 *sp = scratch + (y >> 1) * sstr;
1708 for (
ui32 x = 0; x < width; sp += 4)
1714 c_q |= ((sp[0 - (
si32)sstr] & 0xA0U) << 2);
1715 c_q |= ((sp[2 - (
si32)sstr] & 0x20U) << 4);
1718 drefill(vlc_val, vlc_bits, vlc_off, vlc_buf, vlc_limit);
1731 t0 = (run == -1) ? t0 : 0;
1746 c_q = ((t0 & 0x40U) << 2) | ((t0 & 0x80U) << 1);
1748 c_q |= sp[0 - (
si32)sstr] & 0x80;
1750 c_q |= ((sp[2 - (
si32)sstr] & 0xA0U) << 2);
1751 c_q |= ((sp[4 - (
si32)sstr] & 0x20U) << 4);
1754 dconsume(vlc_val, vlc_bits, t0 & 0x7u);
1760 t1 =
vlc_tbl1[ c_q + (vlc_val & 0x7F)];
1763 if (c_q == 0 && x < width)
1768 t1 = (run == -1) ? t1 : 0;
1773 t1 = x < width ? t1 : 0;
1783 c_q = ((t1 & 0x40U) << 2) | ((t1 & 0x80U) << 1);
1785 c_q |= sp[2 - (
si32)sstr] & 0x80;
1788 dconsume(vlc_val, vlc_bits, t1 & 0x7u);
1792 ui32 uvlc_mode = (((t0 >> 3) & 1) | (((t1 >> 3) & 1) << 1));
1795 ui32 total_bits = uvlc_entry & 0x1F;
1796 if (total_bits < 0x1F) {
1797 sp[1] = (
ui16)((uvlc_entry >> 5) & 0xFF);
1798 sp[3] = (
ui16)((uvlc_entry >> 13) & 0xFF);
1799 dconsume(vlc_val, vlc_bits, total_bits);
1801 uvlc_mode = ((t0 & 0x8U) << 3) | ((t1 & 0x8U) << 4);
1802 uvlc_entry =
uvlc_tbl1[uvlc_mode + (vlc_val & 0x3F)];
1803 dconsume(vlc_val, vlc_bits, uvlc_entry & 0x7u);
1805 ui32 len = uvlc_entry & 0xF;
1806 ui32 tmp = (
ui32)vlc_val & ((1u << len) - 1);
1807 dconsume(vlc_val, vlc_bits, len);
1809 len = uvlc_entry & 0x7;
1811 sp[1] = (
ui16)((uvlc_entry & 7) + (tmp & ~(0xFFU << len)));
1812 sp[3] = (
ui16)((uvlc_entry >> 3) + (tmp >> len));
1820 ui32 missing_msbs,
ui32 num_passes,
1825 static bool insufficient_precision =
false;
1826 static bool modify_code =
false;
1827 static bool truncate_spp_mrp =
false;
1829 if (num_passes > 1 && lengths2 == 0)
1831 OJPH_WARN(0x00010001,
"A malformed codeblock that has more than "
1832 "one coding pass, but zero length for "
1833 "2nd and potential 3rd pass.");
1839 OJPH_WARN(0x00010002,
"We do not support more than 3 coding passes; "
1840 "This codeblocks has %d passes.",
1845 if (missing_msbs > 30)
1847 if (insufficient_precision ==
false)
1849 insufficient_precision =
true;
1850 OJPH_WARN(0x00010003,
"32 bits are not enough to decode this "
1851 "codeblock. This message will not be "
1852 "displayed again.");
1856 else if (missing_msbs == 30)
1858 if (modify_code ==
false) {
1860 OJPH_WARN(0x00010004,
"Not enough precision to decode the cleanup "
1861 "pass. The code can be modified to support "
1862 "this case. This message will not be "
1863 "displayed again.");
1867 else if (missing_msbs == 29)
1869 if (num_passes > 1) {
1871 if (truncate_spp_mrp ==
false) {
1872 truncate_spp_mrp =
true;
1873 OJPH_WARN(0x00010005,
"Not enough precision to decode the SgnProp "
1874 "nor MagRef passes; both will be skipped. "
1875 "This message will not be displayed "
1880 ui32 p = 30 - missing_msbs;
1886 OJPH_WARN(0x00010006,
"Wrong codeblock length.");
1892 lcup = (int)lengths1;
1894 scup = (((int)coded_data[lcup-1]) << 4) + (coded_data[lcup-2] & 0xF);
1895 if (scup < 2 || scup > lcup || scup > 4079)
1919 ui32 sstr = ((width + 2u) + 7u) & ~7u;
1922 ui16 scratch[8 * 513] = {0};
1924 ui16 scratch[8 * 513];
1925 ui32 quad_rows = (height + 1u) >> 1;
1926 size_t scratch_zero = (size_t)(quad_rows + 1) * sstr;
1927 if (scratch_zero > 8 * 513) scratch_zero = 8 * 513;
1928 memset(scratch, 0, scratch_zero *
sizeof(
ui16));
1931 assert((stride & 0x3) == 0);
1933 ui32 mmsbp2 = missing_msbs + 2;
1942 decode_cb_step1_vlc(scratch, coded_data, lcup, scup, width, height, sstr);
1954 if (!decode_cb_step2_32bit(scratch, decoded_data, coded_data,
1955 width, height, stride, sstr, p, mmsbp2,
1960 if (!decode_cb_step2_16bit(scratch, decoded_data, coded_data,
1961 width, height, stride, sstr, p, mmsbp2,
1967 decode_cb_spp_mrp(scratch, decoded_data, coded_data, width, height,
1968 stride, sstr, p, num_passes, lengths1, lengths2,
ui32 uvlc_tbl1_wide[4096]
uvlc_tbl1_wide: wider UVLC table for non-initial rows. Index = mode(2 bits) * 1024 + vlc_data(10 bits...
ui16 uvlc_tbl0[256+64]
uvlc_tbl0 contains decoding information for initial row of quads
ui16 uvlc_tbl1[256]
uvlc_tbl1 contains decoding information for non-initial row of quads
ui16 vlc_tbl1[1024]
vlc_tbl1 contains decoding information for non-initial row of quads
ui16 vlc_tbl0[1024]
vlc_tbl0 contains decoding information for initial row of quads
static void mel_read(dec_mel_st *melp)
Reads and unstuffs the MEL bitstream.
static int mel_get_run(dec_mel_st *melp)
Retrieves one run from dec_mel_st; if there are no runs stored MEL segment is decoded.
static void mel_init(dec_mel_st *melp, ui8 *bbuf, int lcup, int scup)
Initiates a dec_mel_st structure for MEL decoding and reads some bytes in order to get the read addre...
bool ojph_decode_codeblock_avx2(ui8 *coded_data, ui32 *decoded_data, ui32 missing_msbs, ui32 num_passes, ui32 lengths1, ui32 lengths2, ui32 width, ui32 height, ui32 stride, bool stripe_causal)
static void mel_decode(dec_mel_st *melp)
Decodes unstuffed MEL segment bits stored in tmp to runs.
#define OJPH_FORCE_INLINE
MEL state structure for reading and decoding the MEL bitstream.
bool unstuff
true if the next bit needs to be unstuffed
int num_runs
number of decoded runs left in runs (maximum 8)
int size
number of bytes in MEL code
ui8 * data
the address of data (or bitstream)
int k
state of MEL decoder
int bits
number of bits stored in tmp
ui64 tmp
temporary buffer for read data
ui64 runs
runs of decoded MEL codewords (7 bits/run)