GNU Linux-libre 4.19.264-gnu1
[releases.git] / drivers / media / platform / vicodec / vicodec-codec.c
1 // SPDX-License-Identifier: GPL-2.0+
2 /*
3  * Copyright 2016 Tom aan de Wiel
4  * Copyright 2018 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
5  *
6  * 8x8 Fast Walsh Hadamard Transform in sequency order based on the paper:
7  *
8  * A Recursive Algorithm for Sequency-Ordered Fast Walsh Transforms,
9  * R.D. Brown, 1977
10  */
11
12 #include <linux/string.h>
13 #include "vicodec-codec.h"
14
15 #define ALL_ZEROS 15
16 #define DEADZONE_WIDTH 20
17
/*
 * Zigzag scan order: maps sequential stream index i to the row-major
 * position inside the 8x8 block, walking the anti-diagonals from the
 * top-left corner. Low-frequency coefficients come first so that the
 * trailing zeros cluster at the end, which helps the run-length coder.
 */
static const uint8_t zigzag[64] = {
	0,
	1,  8,
	2,  9, 16,
	3, 10, 17, 24,
	4, 11, 18, 25, 32,
	5, 12, 19, 26, 33, 40,
	6, 13, 20, 27, 34, 41, 48,
	7, 14, 21, 28, 35, 42, 49, 56,
	15, 22, 29, 36, 43, 50, 57,
	23, 30, 37, 44, 51, 58,
	31, 38, 45, 52, 59,
	39, 46, 53, 60,
	47, 54, 61,
	55, 62,
	63,
};
35
36
/*
 * Run-length encode one quantized 8x8 coefficient block.
 *
 * @in:        64 coefficients in row-major order
 * @output:    destination stream of big-endian 16-bit words
 * @blocktype: IBLOCK or PBLOCK; a PBLOCK gets PFRAME_BIT set in the
 *             header word
 *
 * Each emitted data word packs a zero-run length (0-14) in the low
 * 4 bits and the following non-zero coefficient in the upper 12 bits.
 * A run of more than 14 trailing zeros is replaced by one ALL_ZEROS
 * marker word. Returns the number of 16-bit words written (header
 * included).
 */
static int rlc(const s16 *in, __be16 *output, int blocktype)
{
	s16 block[8 * 8];
	s16 *wp = block;
	int i = 0;
	int x, y;
	int ret = 0;

	/* read in block from framebuffer */
	int lastzero_run = 0;
	int to_encode;

	for (y = 0; y < 8; y++) {
		for (x = 0; x < 8; x++) {
			*wp = in[x + y * 8];
			wp++;
		}
	}

	/* keep track of amount of trailing zeros */
	for (i = 63; i >= 0 && !block[zigzag[i]]; i--)
		lastzero_run++;

	/* header word: duplicate count is filled in later by the caller */
	*output++ = (blocktype == PBLOCK ? htons(PFRAME_BIT) : 0);
	ret++;

	/* long trailing runs are not encoded coefficient by coefficient */
	to_encode = 8 * 8 - (lastzero_run > 14 ? lastzero_run : 0);

	i = 0;
	while (i < to_encode) {
		int cnt = 0;
		int tmp;

		/* count leading zeros */
		while ((tmp = block[zigzag[i]]) == 0 && cnt < 14) {
			cnt++;
			i++;
			if (i == to_encode) {
				/* undo the count for the final zero word */
				cnt--;
				break;
			}
		}
		/* 4 bits for run, 12 for coefficient (quantization by 4) */
		*output++ = htons((cnt | tmp << 4));
		i++;
		ret++;
	}
	if (lastzero_run > 14) {
		/* tell the decoder to zero-fill the remainder */
		*output = htons(ALL_ZEROS | 0);
		ret++;
	}

	return ret;
}
91
/*
 * This function will worst-case increase rlc_in by 65*2 bytes:
 * one s16 value for the header and 8 * 8 coefficients of type s16.
 */
/*
 * De-run-length one 8x8 block from the compressed stream into
 * row-major order (i.e. the inverse of rlc() plus de-zigzagging).
 *
 * @rlc_in:   in/out pointer into the compressed stream; advanced past
 *            the words consumed for this block
 * @dwht_out: 64 coefficients in row-major order
 *
 * Returns the block's header word (PFRAME_BIT plus duplicate count).
 */
static s16 derlc(const __be16 **rlc_in, s16 *dwht_out)
{
	/* header */
	const __be16 *input = *rlc_in;
	s16 ret = ntohs(*input++);
	int dec_count = 0;
	s16 block[8 * 8 + 16];
	s16 *wp = block;
	int i;

	/*
	 * Now de-compress, it expands one byte to up to 15 bytes
	 * (or fills the remainder of the 64 bytes with zeroes if it
	 * is the last byte to expand).
	 *
	 * So block has to be 8 * 8 + 16 bytes, the '+ 16' is to
	 * allow for overflow if the incoming data was malformed.
	 */
	while (dec_count < 8 * 8) {
		s16 in = ntohs(*input++);
		int length = in & 0xf;	/* zero-run length */
		int coeff = in >> 4;	/* 12-bit signed coefficient */

		/* fill remainder with zeros */
		if (length == 15) {
			for (i = 0; i < 64 - dec_count; i++)
				*wp++ = 0;
			break;
		}

		for (i = 0; i < length; i++)
			*wp++ = 0;
		*wp++ = coeff;
		dec_count += length + 1;
	}

	wp = block;

	/* undo the zigzag scan order */
	for (i = 0; i < 64; i++) {
		int pos = zigzag[i];
		int y = pos / 8;
		int x = pos % 8;

		dwht_out[x + y * 8] = *wp++;
	}
	*rlc_in = input;
	return ret;
}
144
/*
 * Per-coefficient quantization shifts for intra blocks, in row-major
 * 8x8 order: coefficients are shifted right by this amount when
 * quantizing and left by it when dequantizing, so higher-frequency
 * (bottom-right) coefficients lose the most precision.
 */
static const int quant_table[] = {
	2, 2, 2, 2, 2, 2,  2,  2,
	2, 2, 2, 2, 2, 2,  2,  2,
	2, 2, 2, 2, 2, 2,  2,  3,
	2, 2, 2, 2, 2, 2,  3,  6,
	2, 2, 2, 2, 2, 3,  6,  6,
	2, 2, 2, 2, 3, 6,  6,  6,
	2, 2, 2, 3, 6, 6,  6,  6,
	2, 2, 3, 6, 6, 6,  6,  8,
};
155
/*
 * Quantization shifts for inter (P) blocks; coarser than quant_table
 * since P-block deltas have twice the value range of I-blocks.
 */
static const int quant_table_p[] = {
	3, 3, 3, 3, 3, 3,  3,  3,
	3, 3, 3, 3, 3, 3,  3,  3,
	3, 3, 3, 3, 3, 3,  3,  3,
	3, 3, 3, 3, 3, 3,  3,  6,
	3, 3, 3, 3, 3, 3,  6,  6,
	3, 3, 3, 3, 3, 6,  6,  9,
	3, 3, 3, 3, 6, 6,  9,  9,
	3, 3, 3, 6, 6, 9,  9,  10,
};
166
167 static void quantize_intra(s16 *coeff, s16 *de_coeff)
168 {
169         const int *quant = quant_table;
170         int i, j;
171
172         for (j = 0; j < 8; j++) {
173                 for (i = 0; i < 8; i++, quant++, coeff++, de_coeff++) {
174                         *coeff >>= *quant;
175                         if (*coeff >= -DEADZONE_WIDTH &&
176                             *coeff <= DEADZONE_WIDTH)
177                                 *coeff = *de_coeff = 0;
178                         else
179                                 *de_coeff = *coeff << *quant;
180                 }
181         }
182 }
183
184 static void dequantize_intra(s16 *coeff)
185 {
186         const int *quant = quant_table;
187         int i, j;
188
189         for (j = 0; j < 8; j++)
190                 for (i = 0; i < 8; i++, quant++, coeff++)
191                         *coeff <<= *quant;
192 }
193
194 static void quantize_inter(s16 *coeff, s16 *de_coeff)
195 {
196         const int *quant = quant_table_p;
197         int i, j;
198
199         for (j = 0; j < 8; j++) {
200                 for (i = 0; i < 8; i++, quant++, coeff++, de_coeff++) {
201                         *coeff >>= *quant;
202                         if (*coeff >= -DEADZONE_WIDTH &&
203                             *coeff <= DEADZONE_WIDTH)
204                                 *coeff = *de_coeff = 0;
205                         else
206                                 *de_coeff = *coeff << *quant;
207                 }
208         }
209 }
210
211 static void dequantize_inter(s16 *coeff)
212 {
213         const int *quant = quant_table_p;
214         int i, j;
215
216         for (j = 0; j < 8; j++)
217                 for (i = 0; i < 8; i++, quant++, coeff++)
218                         *coeff <<= *quant;
219 }
220
/*
 * Forward 8x8 Fast Walsh-Hadamard Transform of an unsigned 8-bit block.
 *
 * @block:        top-left sample of the 8x8 input block
 * @output_block: receives the 64 s16 coefficients, row-major
 * @stride:       distance in samples between the start of two
 *                consecutive input rows
 * @input_step:   distance between horizontally adjacent samples
 *                (2 for an interleaved chroma plane)
 * @intra:        if true, 256 is subtracted from each pairwise sum in
 *                stage 1, which centres the samples around 128
 *
 * Two passes of three butterfly stages each: first across the rows of
 * the input, then in place down the columns of the output.
 */
static void fwht(const u8 *block, s16 *output_block, unsigned int stride,
		 unsigned int input_step, bool intra)
{
	/* we'll need more than 8 bits for the transformed coefficients */
	s32 workspace1[8], workspace2[8];
	const u8 *tmp = block;
	s16 *out = output_block;
	int add = intra ? 256 : 0;
	unsigned int i;

	/* stage 1 */
	stride *= input_step;

	/* pass 1: rows */
	for (i = 0; i < 8; i++, tmp += stride, out += 8) {
		if (input_step == 1) {
			workspace1[0]  = tmp[0] + tmp[1] - add;
			workspace1[1]  = tmp[0] - tmp[1];

			workspace1[2]  = tmp[2] + tmp[3] - add;
			workspace1[3]  = tmp[2] - tmp[3];

			workspace1[4]  = tmp[4] + tmp[5] - add;
			workspace1[5]  = tmp[4] - tmp[5];

			workspace1[6]  = tmp[6] + tmp[7] - add;
			workspace1[7]  = tmp[6] - tmp[7];
		} else {
			/* input_step == 2: skip the interleaved samples */
			workspace1[0]  = tmp[0] + tmp[2] - add;
			workspace1[1]  = tmp[0] - tmp[2];

			workspace1[2]  = tmp[4] + tmp[6] - add;
			workspace1[3]  = tmp[4] - tmp[6];

			workspace1[4]  = tmp[8] + tmp[10] - add;
			workspace1[5]  = tmp[8] - tmp[10];

			workspace1[6]  = tmp[12] + tmp[14] - add;
			workspace1[7]  = tmp[12] - tmp[14];
		}

		/* stage 2 */
		workspace2[0] = workspace1[0] + workspace1[2];
		workspace2[1] = workspace1[0] - workspace1[2];
		workspace2[2] = workspace1[1] - workspace1[3];
		workspace2[3] = workspace1[1] + workspace1[3];

		workspace2[4] = workspace1[4] + workspace1[6];
		workspace2[5] = workspace1[4] - workspace1[6];
		workspace2[6] = workspace1[5] - workspace1[7];
		workspace2[7] = workspace1[5] + workspace1[7];

		/* stage 3 */
		out[0] = workspace2[0] + workspace2[4];
		out[1] = workspace2[0] - workspace2[4];
		out[2] = workspace2[1] - workspace2[5];
		out[3] = workspace2[1] + workspace2[5];
		out[4] = workspace2[2] + workspace2[6];
		out[5] = workspace2[2] - workspace2[6];
		out[6] = workspace2[3] - workspace2[7];
		out[7] = workspace2[3] + workspace2[7];
	}

	out = output_block;

	/* pass 2: columns, transforming output_block in place */
	for (i = 0; i < 8; i++, out++) {
		/* stage 1 */
		workspace1[0]  = out[0] + out[1 * 8];
		workspace1[1]  = out[0] - out[1 * 8];

		workspace1[2]  = out[2 * 8] + out[3 * 8];
		workspace1[3]  = out[2 * 8] - out[3 * 8];

		workspace1[4]  = out[4 * 8] + out[5 * 8];
		workspace1[5]  = out[4 * 8] - out[5 * 8];

		workspace1[6]  = out[6 * 8] + out[7 * 8];
		workspace1[7]  = out[6 * 8] - out[7 * 8];

		/* stage 2 */
		workspace2[0] = workspace1[0] + workspace1[2];
		workspace2[1] = workspace1[0] - workspace1[2];
		workspace2[2] = workspace1[1] - workspace1[3];
		workspace2[3] = workspace1[1] + workspace1[3];

		workspace2[4] = workspace1[4] + workspace1[6];
		workspace2[5] = workspace1[4] - workspace1[6];
		workspace2[6] = workspace1[5] - workspace1[7];
		workspace2[7] = workspace1[5] + workspace1[7];
		/* stage 3 */
		out[0 * 8] = workspace2[0] + workspace2[4];
		out[1 * 8] = workspace2[0] - workspace2[4];
		out[2 * 8] = workspace2[1] - workspace2[5];
		out[3 * 8] = workspace2[1] + workspace2[5];
		out[4 * 8] = workspace2[2] + workspace2[6];
		out[5 * 8] = workspace2[2] - workspace2[6];
		out[6 * 8] = workspace2[3] - workspace2[7];
		out[7 * 8] = workspace2[3] + workspace2[7];
	}
}
320
/*
 * Not the nicest way of doing it, but P-blocks get twice the range of
 * that of the I-blocks. Therefore we need a type bigger than 8 bits.
 * Furthermore values can be negative... This is just a version that
 * works with 16 signed data
 */
/*
 * Same two-pass butterfly structure as fwht(), but reading s16 input
 * (used for P-block deltas). @stride is in s16 units.
 *
 * NOTE(review): the @intra argument is currently unused here — confirm
 * whether it was meant to apply a bias as in fwht().
 */
static void fwht16(const s16 *block, s16 *output_block, int stride, int intra)
{
	/* we'll need more than 8 bits for the transformed coefficients */
	s32 workspace1[8], workspace2[8];
	const s16 *tmp = block;
	s16 *out = output_block;
	int i;

	/* pass 1: rows */
	for (i = 0; i < 8; i++, tmp += stride, out += 8) {
		/* stage 1 */
		workspace1[0]  = tmp[0] + tmp[1];
		workspace1[1]  = tmp[0] - tmp[1];

		workspace1[2]  = tmp[2] + tmp[3];
		workspace1[3]  = tmp[2] - tmp[3];

		workspace1[4]  = tmp[4] + tmp[5];
		workspace1[5]  = tmp[4] - tmp[5];

		workspace1[6]  = tmp[6] + tmp[7];
		workspace1[7]  = tmp[6] - tmp[7];

		/* stage 2 */
		workspace2[0] = workspace1[0] + workspace1[2];
		workspace2[1] = workspace1[0] - workspace1[2];
		workspace2[2] = workspace1[1] - workspace1[3];
		workspace2[3] = workspace1[1] + workspace1[3];

		workspace2[4] = workspace1[4] + workspace1[6];
		workspace2[5] = workspace1[4] - workspace1[6];
		workspace2[6] = workspace1[5] - workspace1[7];
		workspace2[7] = workspace1[5] + workspace1[7];

		/* stage 3 */
		out[0] = workspace2[0] + workspace2[4];
		out[1] = workspace2[0] - workspace2[4];
		out[2] = workspace2[1] - workspace2[5];
		out[3] = workspace2[1] + workspace2[5];
		out[4] = workspace2[2] + workspace2[6];
		out[5] = workspace2[2] - workspace2[6];
		out[6] = workspace2[3] - workspace2[7];
		out[7] = workspace2[3] + workspace2[7];
	}

	out = output_block;

	/* pass 2: columns, in place */
	for (i = 0; i < 8; i++, out++) {
		/* stage 1 */
		workspace1[0]  = out[0] + out[1*8];
		workspace1[1]  = out[0] - out[1*8];

		workspace1[2]  = out[2*8] + out[3*8];
		workspace1[3]  = out[2*8] - out[3*8];

		workspace1[4]  = out[4*8] + out[5*8];
		workspace1[5]  = out[4*8] - out[5*8];

		workspace1[6]  = out[6*8] + out[7*8];
		workspace1[7]  = out[6*8] - out[7*8];

		/* stage 2 */
		workspace2[0] = workspace1[0] + workspace1[2];
		workspace2[1] = workspace1[0] - workspace1[2];
		workspace2[2] = workspace1[1] - workspace1[3];
		workspace2[3] = workspace1[1] + workspace1[3];

		workspace2[4] = workspace1[4] + workspace1[6];
		workspace2[5] = workspace1[4] - workspace1[6];
		workspace2[6] = workspace1[5] - workspace1[7];
		workspace2[7] = workspace1[5] + workspace1[7];

		/* stage 3 */
		out[0*8] = workspace2[0] + workspace2[4];
		out[1*8] = workspace2[0] - workspace2[4];
		out[2*8] = workspace2[1] - workspace2[5];
		out[3*8] = workspace2[1] + workspace2[5];
		out[4*8] = workspace2[2] + workspace2[6];
		out[5*8] = workspace2[2] - workspace2[6];
		out[6*8] = workspace2[3] - workspace2[7];
		out[7*8] = workspace2[3] + workspace2[7];
	}
}
409
410 static void ifwht(const s16 *block, s16 *output_block, int intra)
411 {
412         /*
413          * we'll need more than 8 bits for the transformed coefficients
414          * use native unit of cpu
415          */
416         int workspace1[8], workspace2[8];
417         int inter = intra ? 0 : 1;
418         const s16 *tmp = block;
419         s16 *out = output_block;
420         int i;
421
422         for (i = 0; i < 8; i++, tmp += 8, out += 8) {
423                 /* stage 1 */
424                 workspace1[0]  = tmp[0] + tmp[1];
425                 workspace1[1]  = tmp[0] - tmp[1];
426
427                 workspace1[2]  = tmp[2] + tmp[3];
428                 workspace1[3]  = tmp[2] - tmp[3];
429
430                 workspace1[4]  = tmp[4] + tmp[5];
431                 workspace1[5]  = tmp[4] - tmp[5];
432
433                 workspace1[6]  = tmp[6] + tmp[7];
434                 workspace1[7]  = tmp[6] - tmp[7];
435
436                 /* stage 2 */
437                 workspace2[0] = workspace1[0] + workspace1[2];
438                 workspace2[1] = workspace1[0] - workspace1[2];
439                 workspace2[2] = workspace1[1] - workspace1[3];
440                 workspace2[3] = workspace1[1] + workspace1[3];
441
442                 workspace2[4] = workspace1[4] + workspace1[6];
443                 workspace2[5] = workspace1[4] - workspace1[6];
444                 workspace2[6] = workspace1[5] - workspace1[7];
445                 workspace2[7] = workspace1[5] + workspace1[7];
446
447                 /* stage 3 */
448                 out[0] = workspace2[0] + workspace2[4];
449                 out[1] = workspace2[0] - workspace2[4];
450                 out[2] = workspace2[1] - workspace2[5];
451                 out[3] = workspace2[1] + workspace2[5];
452                 out[4] = workspace2[2] + workspace2[6];
453                 out[5] = workspace2[2] - workspace2[6];
454                 out[6] = workspace2[3] - workspace2[7];
455                 out[7] = workspace2[3] + workspace2[7];
456         }
457
458         out = output_block;
459
460         for (i = 0; i < 8; i++, out++) {
461                 /* stage 1 */
462                 workspace1[0]  = out[0] + out[1 * 8];
463                 workspace1[1]  = out[0] - out[1 * 8];
464
465                 workspace1[2]  = out[2 * 8] + out[3 * 8];
466                 workspace1[3]  = out[2 * 8] - out[3 * 8];
467
468                 workspace1[4]  = out[4 * 8] + out[5 * 8];
469                 workspace1[5]  = out[4 * 8] - out[5 * 8];
470
471                 workspace1[6]  = out[6 * 8] + out[7 * 8];
472                 workspace1[7]  = out[6 * 8] - out[7 * 8];
473
474                 /* stage 2 */
475                 workspace2[0] = workspace1[0] + workspace1[2];
476                 workspace2[1] = workspace1[0] - workspace1[2];
477                 workspace2[2] = workspace1[1] - workspace1[3];
478                 workspace2[3] = workspace1[1] + workspace1[3];
479
480                 workspace2[4] = workspace1[4] + workspace1[6];
481                 workspace2[5] = workspace1[4] - workspace1[6];
482                 workspace2[6] = workspace1[5] - workspace1[7];
483                 workspace2[7] = workspace1[5] + workspace1[7];
484
485                 /* stage 3 */
486                 if (inter) {
487                         int d;
488
489                         out[0 * 8] = workspace2[0] + workspace2[4];
490                         out[1 * 8] = workspace2[0] - workspace2[4];
491                         out[2 * 8] = workspace2[1] - workspace2[5];
492                         out[3 * 8] = workspace2[1] + workspace2[5];
493                         out[4 * 8] = workspace2[2] + workspace2[6];
494                         out[5 * 8] = workspace2[2] - workspace2[6];
495                         out[6 * 8] = workspace2[3] - workspace2[7];
496                         out[7 * 8] = workspace2[3] + workspace2[7];
497
498                         for (d = 0; d < 8; d++)
499                                 out[8 * d] >>= 6;
500                 } else {
501                         int d;
502
503                         out[0 * 8] = workspace2[0] + workspace2[4];
504                         out[1 * 8] = workspace2[0] - workspace2[4];
505                         out[2 * 8] = workspace2[1] - workspace2[5];
506                         out[3 * 8] = workspace2[1] + workspace2[5];
507                         out[4 * 8] = workspace2[2] + workspace2[6];
508                         out[5 * 8] = workspace2[2] - workspace2[6];
509                         out[6 * 8] = workspace2[3] - workspace2[7];
510                         out[7 * 8] = workspace2[3] + workspace2[7];
511
512                         for (d = 0; d < 8; d++) {
513                                 out[8 * d] >>= 6;
514                                 out[8 * d] += 128;
515                         }
516                 }
517         }
518 }
519
520 static void fill_encoder_block(const u8 *input, s16 *dst,
521                                unsigned int stride, unsigned int input_step)
522 {
523         int i, j;
524
525         for (i = 0; i < 8; i++) {
526                 for (j = 0; j < 8; j++, input += input_step)
527                         *dst++ = *input;
528                 input += (stride - 8) * input_step;
529         }
530 }
531
532 static int var_intra(const s16 *input)
533 {
534         int32_t mean = 0;
535         int32_t ret = 0;
536         const s16 *tmp = input;
537         int i;
538
539         for (i = 0; i < 8 * 8; i++, tmp++)
540                 mean += *tmp;
541         mean /= 64;
542         tmp = input;
543         for (i = 0; i < 8 * 8; i++, tmp++)
544                 ret += (*tmp - mean) < 0 ? -(*tmp - mean) : (*tmp - mean);
545         return ret;
546 }
547
548 static int var_inter(const s16 *old, const s16 *new)
549 {
550         int32_t ret = 0;
551         int i;
552
553         for (i = 0; i < 8 * 8; i++, old++, new++)
554                 ret += (*old - *new) < 0 ? -(*old - *new) : (*old - *new);
555         return ret;
556 }
557
558 static int decide_blocktype(const u8 *cur, const u8 *reference,
559                             s16 *deltablock, unsigned int stride,
560                             unsigned int input_step)
561 {
562         s16 tmp[64];
563         s16 old[64];
564         s16 *work = tmp;
565         unsigned int k, l;
566         int vari;
567         int vard;
568
569         fill_encoder_block(cur, tmp, stride, input_step);
570         fill_encoder_block(reference, old, 8, 1);
571         vari = var_intra(tmp);
572
573         for (k = 0; k < 8; k++) {
574                 for (l = 0; l < 8; l++) {
575                         *deltablock = *work - *reference;
576                         deltablock++;
577                         work++;
578                         reference++;
579                 }
580         }
581         deltablock -= 64;
582         vard = var_inter(old, tmp);
583         return vari <= vard ? IBLOCK : PBLOCK;
584 }
585
586 static void fill_decoder_block(u8 *dst, const s16 *input, int stride)
587 {
588         int i, j;
589
590         for (i = 0; i < 8; i++) {
591                 for (j = 0; j < 8; j++, input++, dst++) {
592                         if (*input < 0)
593                                 *dst = 0;
594                         else if (*input > 255)
595                                 *dst = 255;
596                         else
597                                 *dst = *input;
598                 }
599                 dst += stride - 8;
600         }
601 }
602
603 static void add_deltas(s16 *deltas, const u8 *ref, int stride)
604 {
605         int k, l;
606
607         for (k = 0; k < 8; k++) {
608                 for (l = 0; l < 8; l++) {
609                         *deltas += *ref++;
610                         /*
611                          * Due to quantizing, it might possible that the
612                          * decoded coefficients are slightly out of range
613                          */
614                         if (*deltas < 0)
615                                 *deltas = 0;
616                         else if (*deltas > 255)
617                                 *deltas = 255;
618                         deltas++;
619                 }
620                 ref += stride - 8;
621         }
622 }
623
/*
 * Encode one raw plane into the run-length compressed stream.
 *
 * @input:         raw plane samples
 * @refp:          reference plane; updated with the locally decoded
 *                 result so encoder and decoder stay in sync
 * @rlco:          in/out pointer into the compressed output stream
 * @rlco_max:      watermark; writing past it sets FRAME_UNENCODED
 * @cf:            scratch buffers for the transform/quantize stages
 * @height/@width: plane dimensions in samples
 * @input_step:    horizontal distance between samples (interleaving)
 * @is_intra:      force intra coding for every block
 * @next_is_intra: if set, the next frame won't need this reference,
 *                 so the local decode step can be skipped
 *
 * Returns encoding flags (FRAME_PCODED and/or FRAME_UNENCODED).
 */
static u32 encode_plane(u8 *input, u8 *refp, __be16 **rlco, __be16 *rlco_max,
			struct cframe *cf, u32 height, u32 width,
			unsigned int input_step,
			bool is_intra, bool next_is_intra)
{
	u8 *input_start = input;
	__be16 *rlco_start = *rlco;
	s16 deltablock[64];
	__be16 pframe_bit = htons(PFRAME_BIT);
	u32 encoding = 0;
	unsigned int last_size = 0;
	unsigned int i, j;

	for (j = 0; j < height / 8; j++) {
		for (i = 0; i < width / 8; i++) {
			/* intra code, first frame is always intra coded. */
			int blocktype = IBLOCK;
			unsigned int size;

			if (!is_intra)
				blocktype = decide_blocktype(input, refp,
					deltablock, width, input_step);
			if (is_intra || blocktype == IBLOCK) {
				fwht(input, cf->coeffs, width, input_step, 1);
				quantize_intra(cf->coeffs, cf->de_coeffs);
				blocktype = IBLOCK;
			} else {
				/* inter code */
				encoding |= FRAME_PCODED;
				fwht16(deltablock, cf->coeffs, 8, 0);
				quantize_inter(cf->coeffs, cf->de_coeffs);
			}
			if (!next_is_intra) {
				/* locally decode into the reference plane */
				ifwht(cf->de_coeffs, cf->de_fwht, blocktype);

				if (blocktype == PBLOCK)
					add_deltas(cf->de_fwht, refp, 8);
				fill_decoder_block(refp, cf->de_fwht, 8);
			}

			input += 8 * input_step;
			refp += 8 * 8;

			/* once over the watermark, stop emitting RLC data */
			if (encoding & FRAME_UNENCODED)
				continue;

			size = rlc(cf->coeffs, *rlco, blocktype);
			/*
			 * If this block's RLC data (past the header word)
			 * is identical to the previous block's and both
			 * have the same block type, bump the previous
			 * header's duplicate counter instead of storing
			 * the block again.
			 */
			if (last_size == size &&
			    !memcmp(*rlco + 1, *rlco - size + 1, 2 * size - 2)) {
				__be16 *last_rlco = *rlco - size;
				s16 hdr = ntohs(*last_rlco);

				if (!((*last_rlco ^ **rlco) & pframe_bit) &&
				    (hdr & DUPS_MASK) < DUPS_MASK)
					*last_rlco = htons(hdr + 2);
				else
					*rlco += size;
			} else {
				*rlco += size;
			}
			if (*rlco >= rlco_max)
				encoding |= FRAME_UNENCODED;
			last_size = size;
		}
		/* move to the start of the next 8-sample row of blocks */
		input += width * 7 * input_step;
	}
	if (encoding & FRAME_UNENCODED) {
		u8 *out = (u8 *)rlco_start;

		input = input_start;
		/*
		 * The compressed stream should never contain the magic
		 * header, so when we copy the YUV data we replace 0xff
		 * by 0xfe. Since YUV is limited range such values
		 * shouldn't appear anyway.
		 */
		for (i = 0; i < height * width; i++, input += input_step)
			*out++ = (*input == 0xff) ? 0xfe : *input;
		*rlco = (__be16 *)out;
	}
	return encoding;
}
706
/*
 * Encode a full raw frame into the compressed frame @cf: luma first,
 * then the Cb and Cr planes, stored back to back in cf->rlc_data.
 *
 * Each plane gets its own watermark 256 words short of the space the
 * uncompressed plane would need, so a plane that does not compress is
 * stored raw instead (flagged per plane via *_UNENCODED). Returns the
 * combined encoding flags; FRAME_UNENCODED itself is translated into
 * the per-plane flags and cleared.
 */
u32 encode_frame(struct raw_frame *frm, struct raw_frame *ref_frm,
		 struct cframe *cf, bool is_intra, bool next_is_intra)
{
	unsigned int size = frm->height * frm->width;
	__be16 *rlco = cf->rlc_data;
	__be16 *rlco_max;
	u32 encoding;

	/* luma: size samples = size / 2 words of output space */
	rlco_max = rlco + size / 2 - 256;
	encoding = encode_plane(frm->luma, ref_frm->luma, &rlco, rlco_max, cf,
				  frm->height, frm->width,
				  1, is_intra, next_is_intra);
	if (encoding & FRAME_UNENCODED)
		encoding |= LUMA_UNENCODED;
	encoding &= ~FRAME_UNENCODED;
	/* each chroma plane is a quarter of the luma size */
	rlco_max = rlco + size / 8 - 256;
	encoding |= encode_plane(frm->cb, ref_frm->cb, &rlco, rlco_max, cf,
				   frm->height / 2, frm->width / 2,
				   frm->chroma_step, is_intra, next_is_intra);
	if (encoding & FRAME_UNENCODED)
		encoding |= CB_UNENCODED;
	encoding &= ~FRAME_UNENCODED;
	rlco_max = rlco + size / 8 - 256;
	encoding |= encode_plane(frm->cr, ref_frm->cr, &rlco, rlco_max, cf,
				   frm->height / 2, frm->width / 2,
				   frm->chroma_step, is_intra, next_is_intra);
	if (encoding & FRAME_UNENCODED)
		encoding |= CR_UNENCODED;
	encoding &= ~FRAME_UNENCODED;
	cf->size = (rlco - cf->rlc_data) * sizeof(*rlco);
	return encoding;
}
739
/*
 * Decode one plane from the compressed stream into the raw plane @ref.
 *
 * @cf:           scratch buffers for the inverse transform stages
 * @rlco:         in/out pointer into the compressed stream
 * @ref:          destination (and P-block reference) plane
 * @height/@width plane dimensions in samples
 * @uncompressed: the plane was stored as raw samples (see the
 *                *_UNENCODED flags set by the encoder)
 */
static void decode_plane(struct cframe *cf, const __be16 **rlco, u8 *ref,
			 u32 height, u32 width, bool uncompressed)
{
	unsigned int copies = 0;
	s16 copy[8 * 8];
	s16 stat;	/* header word of the last decoded block */
	unsigned int i, j;

	if (uncompressed) {
		memcpy(ref, *rlco, width * height);
		/* advance past width * height bytes of __be16 words */
		*rlco += width * height / 2;
		return;
	}

	/*
	 * When decoding each macroblock the rlco pointer will be increased
	 * by 65 * 2 bytes worst-case.
	 * To avoid overflow the buffer has to be 65/64th of the actual raw
	 * image size, just in case someone feeds it malicious data.
	 */
	for (j = 0; j < height / 8; j++) {
		for (i = 0; i < width / 8; i++) {
			u8 *refp = ref + j * 8 * width + i * 8;

			/* replay a duplicate of the previous block */
			if (copies) {
				memcpy(cf->de_fwht, copy, sizeof(copy));
				if (stat & PFRAME_BIT)
					add_deltas(cf->de_fwht, refp, width);
				fill_decoder_block(refp, cf->de_fwht, width);
				copies--;
				continue;
			}

			stat = derlc(rlco, cf->coeffs);

			if (stat & PFRAME_BIT)
				dequantize_inter(cf->coeffs);
			else
				dequantize_intra(cf->coeffs);

			ifwht(cf->coeffs, cf->de_fwht,
			      (stat & PFRAME_BIT) ? 0 : 1);

			/* duplicate count stored by the encoder's dedup */
			copies = (stat & DUPS_MASK) >> 1;
			if (copies)
				memcpy(copy, cf->de_fwht, sizeof(copy));
			if (stat & PFRAME_BIT)
				add_deltas(cf->de_fwht, refp, width);
			fill_decoder_block(refp, cf->de_fwht, width);
		}
	}
}
792
/*
 * Decode a full compressed frame into the raw frame @ref. Planes are
 * stored sequentially in cf->rlc_data: luma, then Cb, then Cr; each
 * may independently be flagged as uncompressed in @hdr_flags.
 */
void decode_frame(struct cframe *cf, struct raw_frame *ref, u32 hdr_flags)
{
	const __be16 *rlco = cf->rlc_data;

	decode_plane(cf, &rlco, ref->luma, cf->height, cf->width,
		     hdr_flags & VICODEC_FL_LUMA_IS_UNCOMPRESSED);
	decode_plane(cf, &rlco, ref->cb, cf->height / 2, cf->width / 2,
		     hdr_flags & VICODEC_FL_CB_IS_UNCOMPRESSED);
	decode_plane(cf, &rlco, ref->cr, cf->height / 2, cf->width / 2,
		     hdr_flags & VICODEC_FL_CR_IS_UNCOMPRESSED);
}