-
Notifications
You must be signed in to change notification settings - Fork 0
/
pnggccrd.c
5397 lines (4818 loc) · 229 KB
/
pnggccrd.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/* pnggccrd.c - mixed C/assembler version of utilities to read a PNG file
*
* For Intel x86 CPU (Pentium-MMX or later) and GNU C compiler.
*
* See http://www.intel.com/drg/pentiumII/appnotes/916/916.htm
* and http://www.intel.com/drg/pentiumII/appnotes/923/923.htm
* for Intel's performance analysis of the MMX vs. non-MMX code.
*
* libpng version 1.2.5 - October 3, 2002
* For conditions of distribution and use, see copyright notice in png.h
* Copyright (c) 1998-2002 Glenn Randers-Pehrson
* Copyright (c) 1998, Intel Corporation
*
* Based on MSVC code contributed by Nirav Chhatrapati, Intel Corp., 1998.
* Interface to libpng contributed by Gilles Vollant, 1999.
* GNU C port by Greg Roelofs, 1999-2001.
*
* Lines 2350-4300 converted in place with intel2gas 1.3.1:
*
* intel2gas -mdI pnggccrd.c.partially-msvc -o pnggccrd.c
*
* and then cleaned up by hand. See http://hermes.terminal.at/intel2gas/ .
*
* NOTE: A sufficiently recent version of GNU as (or as.exe under DOS/Windows)
* is required to assemble the newer MMX instructions such as movq.
* For djgpp, see
*
* ftp://ftp.simtel.net/pub/simtelnet/gnu/djgpp/v2gnu/bnu281b.zip
*
* (or a later version in the same directory). For Linux, check your
* distribution's web site(s) or try these links:
*
* http://rufus.w3.org/linux/RPM/binutils.html
* http://www.debian.org/Packages/stable/devel/binutils.html
* ftp://ftp.slackware.com/pub/linux/slackware/slackware/slakware/d1/
* binutils.tgz
*
* For other platforms, see the main GNU site:
*
* ftp://ftp.gnu.org/pub/gnu/binutils/
*
* Version 2.5.2l.15 is definitely too old...
*/
/*
* TEMPORARY PORTING NOTES AND CHANGELOG (mostly by Greg Roelofs)
* =====================================
*
* 19991006:
* - fixed sign error in post-MMX cleanup code (16- & 32-bit cases)
*
* 19991007:
* - additional optimizations (possible or definite):
* x [DONE] write MMX code for 64-bit case (pixel_bytes == 8) [not tested]
* - write MMX code for 48-bit case (pixel_bytes == 6)
* - figure out what's up with 24-bit case (pixel_bytes == 3):
* why subtract 8 from width_mmx in the pass 4/5 case?
* (only width_mmx case) (near line 1606)
* x [DONE] replace pixel_bytes within each block with the true
* constant value (or are compilers smart enough to do that?)
* - rewrite all MMX interlacing code so it's aligned with
* the *beginning* of the row buffer, not the end. This
* would not only allow one to eliminate half of the memory
* writes for odd passes (that is, pass == odd), it may also
* eliminate some unaligned-data-access exceptions (assuming
* there's a penalty for not aligning 64-bit accesses on
* 64-bit boundaries). The only catch is that the "leftover"
* pixel(s) at the end of the row would have to be saved,
* but there are enough unused MMX registers in every case,
* so this is not a problem. A further benefit is that the
* post-MMX cleanup code (C code) in at least some of the
* cases could be done within the assembler block.
* x [DONE] the "v3 v2 v1 v0 v7 v6 v5 v4" comments are confusing,
* inconsistent, and don't match the MMX Programmer's Reference
* Manual conventions anyway. They should be changed to
* "b7 b6 b5 b4 b3 b2 b1 b0," where b0 indicates the byte that
* was lowest in memory (e.g., corresponding to a left pixel)
* and b7 is the byte that was highest (e.g., a right pixel).
*
* 19991016:
* - Brennan's Guide notwithstanding, gcc under Linux does *not*
* want globals prefixed by underscores when referencing them--
* i.e., if the variable is const4, then refer to it as const4,
* not _const4. This seems to be a djgpp-specific requirement.
* Also, such variables apparently *must* be declared outside
* of functions; neither static nor automatic variables work if
* defined within the scope of a single function, but both
* static and truly global (multi-module) variables work fine.
*
* 19991023:
* - fixed png_combine_row() non-MMX replication bug (odd passes only?)
* - switched from string-concatenation-with-macros to cleaner method of
* renaming global variables for djgpp--i.e., always use prefixes in
* inlined assembler code (== strings) and conditionally rename the
* variables, not the other way around. Hence _const4, _mask8_0, etc.
*
* 19991024:
* - fixed mmxsupport()/png_do_read_interlace() first-row bug
* This one was severely weird: even though mmxsupport() doesn't touch
* ebx (where "row" pointer was stored), it nevertheless managed to zero
* the register (even in static/non-fPIC code--see below), which in turn
* caused png_do_read_interlace() to return prematurely on the first row of
* interlaced images (i.e., without expanding the interlaced pixels).
* Inspection of the generated assembly code didn't turn up any clues,
* although it did point at a minor optimization (i.e., get rid of
* mmx_supported_local variable and just use eax). Possibly the CPUID
* instruction is more destructive than it looks? (Not yet checked.)
* - "info gcc" was next to useless, so compared fPIC and non-fPIC assembly
* listings... Apparently register spillage has to do with ebx, since
* it's used to index the global offset table. Commenting it out of the
* input-reg lists in png_combine_row() eliminated compiler barfage, so
* ifdef'd with __PIC__ macro: if defined, use a global for unmask
*
* 19991107:
* - verified CPUID clobberage: 12-char string constant ("GenuineIntel",
* "AuthenticAMD", etc.) placed in ebx:ecx:edx. Still need to polish.
*
* 19991120:
* - made "diff" variable (now "_dif") global to simplify conversion of
* filtering routines (running out of regs, sigh). "diff" is still used
* in interlacing routines, however.
* - fixed up both versions of mmxsupport() (ORIG_THAT_USED_TO_CLOBBER_EBX
* macro determines which is used); original not yet tested.
*
* 20000213:
* - when compiling with gcc, be sure to use -fomit-frame-pointer
*
* 20000319:
* - fixed a register-name typo in png_do_read_interlace(), default (MMX) case,
* pass == 4 or 5, that caused visible corruption of interlaced images
*
* 20000623:
* - Various problems were reported with gcc 2.95.2 in the Cygwin environment,
* many of the form "forbidden register 0 (ax) was spilled for class AREG."
* This is explained at http://gcc.gnu.org/fom_serv/cache/23.html, and
* Chuck Wilson supplied a patch involving dummy output registers. See
* http://sourceforge.net/bugs/?func=detailbug&bug_id=108741&group_id=5624
* for the original (anonymous) SourceForge bug report.
*
* 20000706:
* - Chuck Wilson passed along these remaining gcc 2.95.2 errors:
* pnggccrd.c: In function `png_combine_row':
* pnggccrd.c:525: more than 10 operands in `asm'
* pnggccrd.c:669: more than 10 operands in `asm'
* pnggccrd.c:828: more than 10 operands in `asm'
* pnggccrd.c:994: more than 10 operands in `asm'
* pnggccrd.c:1177: more than 10 operands in `asm'
* They are all the same problem and can be worked around by using the
* global _unmask variable unconditionally, not just in the -fPIC case.
* Reportedly earlier versions of gcc also have the problem with more than
* 10 operands; they just don't report it. Much strangeness ensues, etc.
*
* 20000729:
* - enabled png_read_filter_row_mmx_up() (shortest remaining unconverted
* MMX routine); began converting png_read_filter_row_mmx_sub()
* - to finish remaining sections:
* - clean up indentation and comments
* - preload local variables
* - add output and input regs (order of former determines numerical
* mapping of latter)
* - avoid all usage of ebx (including bx, bh, bl) register [20000823]
* - remove "$" from addressing of Shift and Mask variables [20000823]
*
* 20000731:
* - global union vars causing segfaults in png_read_filter_row_mmx_sub()?
*
* 20000822:
* - ARGH, stupid png_read_filter_row_mmx_sub() segfault only happens with
* shared-library (-fPIC) version! Code works just fine as part of static
* library. Damn damn damn damn damn, should have tested that sooner.
* ebx is getting clobbered again (explicitly this time); need to save it
* on stack or rewrite asm code to avoid using it altogether. Blargh!
*
* 20000823:
* - first section was trickiest; all remaining sections have ebx -> edx now.
* (-fPIC works again.) Also added missing underscores to various Shift*
* and *Mask* globals and got rid of leading "$" signs.
*
* 20000826:
* - added visual separators to help navigate microscopic printed copies
* (http://pobox.com/~newt/code/gpr-latest.zip, mode 10); started working
* on png_read_filter_row_mmx_avg()
*
* 20000828:
* - finished png_read_filter_row_mmx_avg(): only Paeth left! (930 lines...)
* What the hell, did png_read_filter_row_mmx_paeth(), too. Comments not
* cleaned up/shortened in either routine, but functionality is complete
* and seems to be working fine.
*
* 20000829:
* - ahhh, figured out last(?) bit of gcc/gas asm-fu: if register is listed
* as an input reg (with dummy output variables, etc.), then it *cannot*
* also appear in the clobber list or gcc 2.95.2 will barf. The solution
* is simple enough...
*
* 20000914:
* - bug in png_read_filter_row_mmx_avg(): 16-bit grayscale not handled
* correctly (but 48-bit RGB just fine)
*
* 20000916:
* - fixed bug in png_read_filter_row_mmx_avg(), bpp == 2 case; three errors:
* - "_ShiftBpp.use = 24;" should have been "_ShiftBpp.use = 16;"
* - "_ShiftRem.use = 40;" should have been "_ShiftRem.use = 48;"
* - "psllq _ShiftRem, %%mm2" should have been "psrlq _ShiftRem, %%mm2"
*
* 20010101:
* - added new png_init_mmx_flags() function (here only because it needs to
* call mmxsupport(), which should probably become global png_mmxsupport());
* modified other MMX routines to run conditionally (png_ptr->asm_flags)
*
* 20010103:
* - renamed mmxsupport() to png_mmx_support(), with auto-set of mmx_supported,
* and made it public; moved png_init_mmx_flags() to png.c as internal func
*
* 20010104:
* - removed dependency on png_read_filter_row_c() (C code already duplicated
* within MMX version of png_read_filter_row()) so no longer necessary to
* compile it into pngrutil.o
*
* 20010310:
* - fixed buffer-overrun bug in png_combine_row() C code (non-MMX)
*
* 20020304:
* - eliminated incorrect use of width_mmx in pixel_bytes == 8 case
*
* STILL TO DO:
* - test png_do_read_interlace() 64-bit case (pixel_bytes == 8)
* - write MMX code for 48-bit case (pixel_bytes == 6)
* - figure out what's up with 24-bit case (pixel_bytes == 3):
* why subtract 8 from width_mmx in the pass 4/5 case?
* (only width_mmx case) (near line 1606)
* - rewrite all MMX interlacing code so it's aligned with beginning
* of the row buffer, not the end (see 19991007 for details)
* x pick one version of mmxsupport() and get rid of the other
* - add error messages to any remaining bogus default cases
* - enable pixel_depth == 8 cases in png_read_filter_row()? (test speed)
* x add support for runtime enable/disable/query of various MMX routines
*/
#define PNG_INTERNAL
#include "png.h"
#if defined(PNG_USE_PNGGCCRD)
int PNGAPI png_mmx_support(void);
#ifdef PNG_USE_LOCAL_ARRAYS
static const int FARDATA png_pass_start[7] = {0, 4, 0, 2, 0, 1, 0};
static const int FARDATA png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
static const int FARDATA png_pass_width[7] = {8, 4, 4, 2, 2, 1, 1};
#endif
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
/* djgpp, Win32, and Cygwin add their own underscores to global variables,
* so define them without: */
#if defined(__DJGPP__) || defined(WIN32) || defined(__CYGWIN__)
# define _mmx_supported mmx_supported
# define _const4 const4
# define _const6 const6
# define _mask8_0 mask8_0
# define _mask16_1 mask16_1
# define _mask16_0 mask16_0
# define _mask24_2 mask24_2
# define _mask24_1 mask24_1
# define _mask24_0 mask24_0
# define _mask32_3 mask32_3
# define _mask32_2 mask32_2
# define _mask32_1 mask32_1
# define _mask32_0 mask32_0
# define _mask48_5 mask48_5
# define _mask48_4 mask48_4
# define _mask48_3 mask48_3
# define _mask48_2 mask48_2
# define _mask48_1 mask48_1
# define _mask48_0 mask48_0
# define _LBCarryMask LBCarryMask
# define _HBClearMask HBClearMask
# define _ActiveMask ActiveMask
# define _ActiveMask2 ActiveMask2
# define _ActiveMaskEnd ActiveMaskEnd
# define _ShiftBpp ShiftBpp
# define _ShiftRem ShiftRem
#ifdef PNG_THREAD_UNSAFE_OK
# define _unmask unmask
# define _FullLength FullLength
# define _MMXLength MMXLength
# define _dif dif
# define _patemp patemp
# define _pbtemp pbtemp
# define _pctemp pctemp
#endif
#endif
/* These constants are used in the inlined MMX assembly code.
Ignore gcc's "At top level: defined but not used" warnings. */
/* GRR 20000706: originally _unmask was needed only when compiling with -fPIC,
* since that case uses the %ebx register for indexing the Global Offset Table
* and there were no other registers available. But gcc 2.95 and later emit
* "more than 10 operands in `asm'" errors when %ebx is used to preload unmask
* in the non-PIC case, so we'll just use the global unconditionally now.
*/
#ifdef PNG_THREAD_UNSAFE_OK
static int _unmask;
#endif
static unsigned long long _mask8_0 = 0x0102040810204080LL;
static unsigned long long _mask16_1 = 0x0101020204040808LL;
static unsigned long long _mask16_0 = 0x1010202040408080LL;
static unsigned long long _mask24_2 = 0x0101010202020404LL;
static unsigned long long _mask24_1 = 0x0408080810101020LL;
static unsigned long long _mask24_0 = 0x2020404040808080LL;
static unsigned long long _mask32_3 = 0x0101010102020202LL;
static unsigned long long _mask32_2 = 0x0404040408080808LL;
static unsigned long long _mask32_1 = 0x1010101020202020LL;
static unsigned long long _mask32_0 = 0x4040404080808080LL;
static unsigned long long _mask48_5 = 0x0101010101010202LL;
static unsigned long long _mask48_4 = 0x0202020204040404LL;
static unsigned long long _mask48_3 = 0x0404080808080808LL;
static unsigned long long _mask48_2 = 0x1010101010102020LL;
static unsigned long long _mask48_1 = 0x2020202040404040LL;
static unsigned long long _mask48_0 = 0x4040808080808080LL;
static unsigned long long _const4 = 0x0000000000FFFFFFLL;
//static unsigned long long _const5 = 0x000000FFFFFF0000LL; // NOT USED
static unsigned long long _const6 = 0x00000000000000FFLL;
// These are used in the row-filter routines and should/would be local
// variables if not for gcc addressing limitations.
// WARNING: Their presence probably defeats the thread safety of libpng.
#ifdef PNG_THREAD_UNSAFE_OK
static png_uint_32 _FullLength;
static png_uint_32 _MMXLength;
static int _dif;
static int _patemp; // temp variables for Paeth routine
static int _pbtemp;
static int _pctemp;
#endif
void /* PRIVATE */
png_squelch_warnings(void)
{
#ifdef PNG_THREAD_UNSAFE_OK
_dif = _dif;
_patemp = _patemp;
_pbtemp = _pbtemp;
_pctemp = _pctemp;
_MMXLength = _MMXLength;
#endif
_const4 = _const4;
_const6 = _const6;
_mask8_0 = _mask8_0;
_mask16_1 = _mask16_1;
_mask16_0 = _mask16_0;
_mask24_2 = _mask24_2;
_mask24_1 = _mask24_1;
_mask24_0 = _mask24_0;
_mask32_3 = _mask32_3;
_mask32_2 = _mask32_2;
_mask32_1 = _mask32_1;
_mask32_0 = _mask32_0;
_mask48_5 = _mask48_5;
_mask48_4 = _mask48_4;
_mask48_3 = _mask48_3;
_mask48_2 = _mask48_2;
_mask48_1 = _mask48_1;
_mask48_0 = _mask48_0;
}
#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
static int _mmx_supported = 2;
/*===========================================================================*/
/* */
/* P N G _ C O M B I N E _ R O W */
/* */
/*===========================================================================*/
#if defined(PNG_HAVE_ASSEMBLER_COMBINE_ROW)
#define BPP2 2
#define BPP3 3 /* bytes per pixel (a.k.a. pixel_bytes) */
#define BPP4 4
#define BPP6 6 /* (defined only to help avoid cut-and-paste errors) */
#define BPP8 8
/* Combines the row recently read in with the previous row.
This routine takes care of alpha and transparency if requested.
This routine also handles the two methods of progressive display
of interlaced images, depending on the mask value.
The mask value describes which pixels are to be combined with
the row. The pattern always repeats every 8 pixels, so just 8
bits are needed. A one indicates the pixel is to be combined; a
zero indicates the pixel is to be skipped. This is in addition
to any alpha or transparency value associated with the pixel.
If you want all pixels to be combined, pass 0xff (255) in mask. */
/* Use this routine for the x86 platform - it uses a faster MMX routine
if the machine supports MMX. */
void /* PRIVATE */
png_combine_row(png_structp png_ptr, png_bytep row, int mask)
{
png_debug(1, "in png_combine_row (pnggccrd.c)\n");
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
if (_mmx_supported == 2) {
/* this should have happened in png_init_mmx_flags() already */
png_warning(png_ptr, "asm_flags may not have been initialized");
png_mmx_support();
}
#endif
if (mask == 0xff)
{
png_debug(2,"mask == 0xff: doing single png_memcpy()\n");
png_memcpy(row, png_ptr->row_buf + 1,
(png_size_t)((png_ptr->width * png_ptr->row_info.pixel_depth + 7) >> 3));
}
else /* (png_combine_row() is never called with mask == 0) */
{
switch (png_ptr->row_info.pixel_depth)
{
case 1: /* png_ptr->row_info.pixel_depth */
{
png_bytep sp;
png_bytep dp;
int s_inc, s_start, s_end;
int m;
int shift;
png_uint_32 i;
sp = png_ptr->row_buf + 1;
dp = row;
m = 0x80;
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
if (png_ptr->transformations & PNG_PACKSWAP)
{
s_start = 0;
s_end = 7;
s_inc = 1;
}
else
#endif
{
s_start = 7;
s_end = 0;
s_inc = -1;
}
shift = s_start;
for (i = 0; i < png_ptr->width; i++)
{
if (m & mask)
{
int value;
value = (*sp >> shift) & 0x1;
*dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
*dp |= (png_byte)(value << shift);
}
if (shift == s_end)
{
shift = s_start;
sp++;
dp++;
}
else
shift += s_inc;
if (m == 1)
m = 0x80;
else
m >>= 1;
}
break;
}
case 2: /* png_ptr->row_info.pixel_depth */
{
png_bytep sp;
png_bytep dp;
int s_start, s_end, s_inc;
int m;
int shift;
png_uint_32 i;
int value;
sp = png_ptr->row_buf + 1;
dp = row;
m = 0x80;
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
if (png_ptr->transformations & PNG_PACKSWAP)
{
s_start = 0;
s_end = 6;
s_inc = 2;
}
else
#endif
{
s_start = 6;
s_end = 0;
s_inc = -2;
}
shift = s_start;
for (i = 0; i < png_ptr->width; i++)
{
if (m & mask)
{
value = (*sp >> shift) & 0x3;
*dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
*dp |= (png_byte)(value << shift);
}
if (shift == s_end)
{
shift = s_start;
sp++;
dp++;
}
else
shift += s_inc;
if (m == 1)
m = 0x80;
else
m >>= 1;
}
break;
}
case 4: /* png_ptr->row_info.pixel_depth */
{
png_bytep sp;
png_bytep dp;
int s_start, s_end, s_inc;
int m;
int shift;
png_uint_32 i;
int value;
sp = png_ptr->row_buf + 1;
dp = row;
m = 0x80;
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
if (png_ptr->transformations & PNG_PACKSWAP)
{
s_start = 0;
s_end = 4;
s_inc = 4;
}
else
#endif
{
s_start = 4;
s_end = 0;
s_inc = -4;
}
shift = s_start;
for (i = 0; i < png_ptr->width; i++)
{
if (m & mask)
{
value = (*sp >> shift) & 0xf;
*dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
*dp |= (png_byte)(value << shift);
}
if (shift == s_end)
{
shift = s_start;
sp++;
dp++;
}
else
shift += s_inc;
if (m == 1)
m = 0x80;
else
m >>= 1;
}
break;
}
case 8: /* png_ptr->row_info.pixel_depth */
{
png_bytep srcptr;
png_bytep dstptr;
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
/* && _mmx_supported */ )
#else
if (_mmx_supported)
#endif
{
png_uint_32 len;
int diff;
int dummy_value_a; // fix 'forbidden register spilled' error
int dummy_value_d;
int dummy_value_c;
int dummy_value_S;
int dummy_value_D;
_unmask = ~mask; // global variable for -fPIC version
srcptr = png_ptr->row_buf + 1;
dstptr = row;
len = png_ptr->width &~7; // reduce to multiple of 8
diff = (int) (png_ptr->width & 7); // amount lost
__asm__ __volatile__ (
"movd _unmask, %%mm7 \n\t" // load bit pattern
"psubb %%mm6, %%mm6 \n\t" // zero mm6
"punpcklbw %%mm7, %%mm7 \n\t"
"punpcklwd %%mm7, %%mm7 \n\t"
"punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
"movq _mask8_0, %%mm0 \n\t"
"pand %%mm7, %%mm0 \n\t" // nonzero if keep byte
"pcmpeqb %%mm6, %%mm0 \n\t" // zeros->1s, v versa
// preload "movl len, %%ecx \n\t" // load length of line
// preload "movl srcptr, %%esi \n\t" // load source
// preload "movl dstptr, %%edi \n\t" // load dest
"cmpl $0, %%ecx \n\t" // len == 0 ?
"je mainloop8end \n\t"
"mainloop8: \n\t"
"movq (%%esi), %%mm4 \n\t" // *srcptr
"pand %%mm0, %%mm4 \n\t"
"movq %%mm0, %%mm6 \n\t"
"pandn (%%edi), %%mm6 \n\t" // *dstptr
"por %%mm6, %%mm4 \n\t"
"movq %%mm4, (%%edi) \n\t"
"addl $8, %%esi \n\t" // inc by 8 bytes processed
"addl $8, %%edi \n\t"
"subl $8, %%ecx \n\t" // dec by 8 pixels processed
"ja mainloop8 \n\t"
"mainloop8end: \n\t"
// preload "movl diff, %%ecx \n\t" // (diff is in eax)
"movl %%eax, %%ecx \n\t"
"cmpl $0, %%ecx \n\t"
"jz end8 \n\t"
// preload "movl mask, %%edx \n\t"
"sall $24, %%edx \n\t" // make low byte, high byte
"secondloop8: \n\t"
"sall %%edx \n\t" // move high bit to CF
"jnc skip8 \n\t" // if CF = 0
"movb (%%esi), %%al \n\t"
"movb %%al, (%%edi) \n\t"
"skip8: \n\t"
"incl %%esi \n\t"
"incl %%edi \n\t"
"decl %%ecx \n\t"
"jnz secondloop8 \n\t"
"end8: \n\t"
"EMMS \n\t" // DONE
: "=a" (dummy_value_a), // output regs (dummy)
"=d" (dummy_value_d),
"=c" (dummy_value_c),
"=S" (dummy_value_S),
"=D" (dummy_value_D)
: "3" (srcptr), // esi // input regs
"4" (dstptr), // edi
"0" (diff), // eax
// was (unmask) "b" RESERVED // ebx // Global Offset Table idx
"2" (len), // ecx
"1" (mask) // edx
#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
: "%mm0", "%mm4", "%mm6", "%mm7" // clobber list
#endif
);
}
else /* mmx _not supported - Use modified C routine */
#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
{
register png_uint_32 i;
png_uint_32 initial_val = png_pass_start[png_ptr->pass];
/* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
register int stride = png_pass_inc[png_ptr->pass];
/* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
register int rep_bytes = png_pass_width[png_ptr->pass];
/* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
int diff = (int) (png_ptr->width & 7); /* amount lost */
register png_uint_32 final_val = len; /* GRR bugfix */
srcptr = png_ptr->row_buf + 1 + initial_val;
dstptr = row + initial_val;
for (i = initial_val; i < final_val; i += stride)
{
png_memcpy(dstptr, srcptr, rep_bytes);
srcptr += stride;
dstptr += stride;
}
if (diff) /* number of leftover pixels: 3 for pngtest */
{
final_val+=diff /* *BPP1 */ ;
for (; i < final_val; i += stride)
{
if (rep_bytes > (int)(final_val-i))
rep_bytes = (int)(final_val-i);
png_memcpy(dstptr, srcptr, rep_bytes);
srcptr += stride;
dstptr += stride;
}
}
} /* end of else (_mmx_supported) */
break;
} /* end 8 bpp */
case 16: /* png_ptr->row_info.pixel_depth */
{
png_bytep srcptr;
png_bytep dstptr;
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
/* && _mmx_supported */ )
#else
if (_mmx_supported)
#endif
{
png_uint_32 len;
int diff;
int dummy_value_a; // fix 'forbidden register spilled' error
int dummy_value_d;
int dummy_value_c;
int dummy_value_S;
int dummy_value_D;
_unmask = ~mask; // global variable for -fPIC version
srcptr = png_ptr->row_buf + 1;
dstptr = row;
len = png_ptr->width &~7; // reduce to multiple of 8
diff = (int) (png_ptr->width & 7); // amount lost //
__asm__ __volatile__ (
"movd _unmask, %%mm7 \n\t" // load bit pattern
"psubb %%mm6, %%mm6 \n\t" // zero mm6
"punpcklbw %%mm7, %%mm7 \n\t"
"punpcklwd %%mm7, %%mm7 \n\t"
"punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
"movq _mask16_0, %%mm0 \n\t"
"movq _mask16_1, %%mm1 \n\t"
"pand %%mm7, %%mm0 \n\t"
"pand %%mm7, %%mm1 \n\t"
"pcmpeqb %%mm6, %%mm0 \n\t"
"pcmpeqb %%mm6, %%mm1 \n\t"
// preload "movl len, %%ecx \n\t" // load length of line
// preload "movl srcptr, %%esi \n\t" // load source
// preload "movl dstptr, %%edi \n\t" // load dest
"cmpl $0, %%ecx \n\t"
"jz mainloop16end \n\t"
"mainloop16: \n\t"
"movq (%%esi), %%mm4 \n\t"
"pand %%mm0, %%mm4 \n\t"
"movq %%mm0, %%mm6 \n\t"
"movq (%%edi), %%mm7 \n\t"
"pandn %%mm7, %%mm6 \n\t"
"por %%mm6, %%mm4 \n\t"
"movq %%mm4, (%%edi) \n\t"
"movq 8(%%esi), %%mm5 \n\t"
"pand %%mm1, %%mm5 \n\t"
"movq %%mm1, %%mm7 \n\t"
"movq 8(%%edi), %%mm6 \n\t"
"pandn %%mm6, %%mm7 \n\t"
"por %%mm7, %%mm5 \n\t"
"movq %%mm5, 8(%%edi) \n\t"
"addl $16, %%esi \n\t" // inc by 16 bytes processed
"addl $16, %%edi \n\t"
"subl $8, %%ecx \n\t" // dec by 8 pixels processed
"ja mainloop16 \n\t"
"mainloop16end: \n\t"
// preload "movl diff, %%ecx \n\t" // (diff is in eax)
"movl %%eax, %%ecx \n\t"
"cmpl $0, %%ecx \n\t"
"jz end16 \n\t"
// preload "movl mask, %%edx \n\t"
"sall $24, %%edx \n\t" // make low byte, high byte
"secondloop16: \n\t"
"sall %%edx \n\t" // move high bit to CF
"jnc skip16 \n\t" // if CF = 0
"movw (%%esi), %%ax \n\t"
"movw %%ax, (%%edi) \n\t"
"skip16: \n\t"
"addl $2, %%esi \n\t"
"addl $2, %%edi \n\t"
"decl %%ecx \n\t"
"jnz secondloop16 \n\t"
"end16: \n\t"
"EMMS \n\t" // DONE
: "=a" (dummy_value_a), // output regs (dummy)
"=c" (dummy_value_c),
"=d" (dummy_value_d),
"=S" (dummy_value_S),
"=D" (dummy_value_D)
: "0" (diff), // eax // input regs
// was (unmask) " " RESERVED // ebx // Global Offset Table idx
"1" (len), // ecx
"2" (mask), // edx
"3" (srcptr), // esi
"4" (dstptr) // edi
#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
: "%mm0", "%mm1", "%mm4" // clobber list
, "%mm5", "%mm6", "%mm7"
#endif
);
}
else /* mmx _not supported - Use modified C routine */
#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
{
register png_uint_32 i;
png_uint_32 initial_val = BPP2 * png_pass_start[png_ptr->pass];
/* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
register int stride = BPP2 * png_pass_inc[png_ptr->pass];
/* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
register int rep_bytes = BPP2 * png_pass_width[png_ptr->pass];
/* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
int diff = (int) (png_ptr->width & 7); /* amount lost */
register png_uint_32 final_val = BPP2 * len; /* GRR bugfix */
srcptr = png_ptr->row_buf + 1 + initial_val;
dstptr = row + initial_val;
for (i = initial_val; i < final_val; i += stride)
{
png_memcpy(dstptr, srcptr, rep_bytes);
srcptr += stride;
dstptr += stride;
}
if (diff) /* number of leftover pixels: 3 for pngtest */
{
final_val+=diff*BPP2;
for (; i < final_val; i += stride)
{
if (rep_bytes > (int)(final_val-i))
rep_bytes = (int)(final_val-i);
png_memcpy(dstptr, srcptr, rep_bytes);
srcptr += stride;
dstptr += stride;
}
}
} /* end of else (_mmx_supported) */
break;
} /* end 16 bpp */
case 24: /* png_ptr->row_info.pixel_depth */
{
png_bytep srcptr;
png_bytep dstptr;
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
/* && _mmx_supported */ )
#else
if (_mmx_supported)
#endif
{
png_uint_32 len;
int diff;
int dummy_value_a; // fix 'forbidden register spilled' error
int dummy_value_d;
int dummy_value_c;
int dummy_value_S;
int dummy_value_D;
_unmask = ~mask; // global variable for -fPIC version
srcptr = png_ptr->row_buf + 1;
dstptr = row;
len = png_ptr->width &~7; // reduce to multiple of 8
diff = (int) (png_ptr->width & 7); // amount lost //
__asm__ __volatile__ (
"movd _unmask, %%mm7 \n\t" // load bit pattern
"psubb %%mm6, %%mm6 \n\t" // zero mm6
"punpcklbw %%mm7, %%mm7 \n\t"
"punpcklwd %%mm7, %%mm7 \n\t"
"punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
"movq _mask24_0, %%mm0 \n\t"
"movq _mask24_1, %%mm1 \n\t"
"movq _mask24_2, %%mm2 \n\t"
"pand %%mm7, %%mm0 \n\t"
"pand %%mm7, %%mm1 \n\t"
"pand %%mm7, %%mm2 \n\t"
"pcmpeqb %%mm6, %%mm0 \n\t"
"pcmpeqb %%mm6, %%mm1 \n\t"
"pcmpeqb %%mm6, %%mm2 \n\t"
// preload "movl len, %%ecx \n\t" // load length of line
// preload "movl srcptr, %%esi \n\t" // load source
// preload "movl dstptr, %%edi \n\t" // load dest
"cmpl $0, %%ecx \n\t"
"jz mainloop24end \n\t"
"mainloop24: \n\t"
"movq (%%esi), %%mm4 \n\t"
"pand %%mm0, %%mm4 \n\t"
"movq %%mm0, %%mm6 \n\t"
"movq (%%edi), %%mm7 \n\t"
"pandn %%mm7, %%mm6 \n\t"
"por %%mm6, %%mm4 \n\t"
"movq %%mm4, (%%edi) \n\t"
"movq 8(%%esi), %%mm5 \n\t"
"pand %%mm1, %%mm5 \n\t"
"movq %%mm1, %%mm7 \n\t"
"movq 8(%%edi), %%mm6 \n\t"
"pandn %%mm6, %%mm7 \n\t"
"por %%mm7, %%mm5 \n\t"
"movq %%mm5, 8(%%edi) \n\t"
"movq 16(%%esi), %%mm6 \n\t"
"pand %%mm2, %%mm6 \n\t"
"movq %%mm2, %%mm4 \n\t"
"movq 16(%%edi), %%mm7 \n\t"
"pandn %%mm7, %%mm4 \n\t"
"por %%mm4, %%mm6 \n\t"
"movq %%mm6, 16(%%edi) \n\t"
"addl $24, %%esi \n\t" // inc by 24 bytes processed
"addl $24, %%edi \n\t"
"subl $8, %%ecx \n\t" // dec by 8 pixels processed
"ja mainloop24 \n\t"
"mainloop24end: \n\t"
// preload "movl diff, %%ecx \n\t" // (diff is in eax)
"movl %%eax, %%ecx \n\t"
"cmpl $0, %%ecx \n\t"
"jz end24 \n\t"
// preload "movl mask, %%edx \n\t"
"sall $24, %%edx \n\t" // make low byte, high byte
"secondloop24: \n\t"
"sall %%edx \n\t" // move high bit to CF
"jnc skip24 \n\t" // if CF = 0
"movw (%%esi), %%ax \n\t"
"movw %%ax, (%%edi) \n\t"
"xorl %%eax, %%eax \n\t"
"movb 2(%%esi), %%al \n\t"
"movb %%al, 2(%%edi) \n\t"
"skip24: \n\t"
"addl $3, %%esi \n\t"
"addl $3, %%edi \n\t"
"decl %%ecx \n\t"
"jnz secondloop24 \n\t"
"end24: \n\t"
"EMMS \n\t" // DONE
: "=a" (dummy_value_a), // output regs (dummy)
"=d" (dummy_value_d),
"=c" (dummy_value_c),
"=S" (dummy_value_S),