-
Notifications
You must be signed in to change notification settings - Fork 5
/
XDC_COMP.PAS
1327 lines (1217 loc) · 47.5 KB
/
XDC_COMP.PAS
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
program xdc_comp;
{$I+}
{
"xdc" (x86 differential compiler) animation compiler for 8088+CGA systems.
The basic operation of the animation system is to determine differences
between frames and turn those differences into x86 code that can be directly
executed to update the visible screen buffer. By staying within available
bandwidth, we can stress the system no more than we do with the earlier
8088flex system, which has known performance characteristics on real 8088+CGA
hardware.
The major bandwidth constraint is time it takes to update the screen, which we
will initially cap at the same time a 2000-byte REP MOVSW takes to execute.
Secondary bandwidth constraint is the size of the compiled code, which must
not exceed whatever limits the user requests.
Some useful terminology that will help you understand this source code:
- Delta: An area in a new frame that is different than the same area in the
old/previous frame. This area will need to be updated in the visible screen
buffer. Areas are defined as a screen buffer starting and ending offset. (A
Delta can be though of as a "strip" of new frame data 1 pixel high and 1..N
bytes long.)
- Slice: A Delta that contains at least two different byte values.
- Run: A Delta consisting of only one value. A run can be encoded using
an optimized x86 form (REP STOSW) that is faster and takes up less space than
REP MOVSW.
- Delta List: (abbreviated as List) A data structure that helps us manipulate
and optimize all found deltas. This can be implemented any number of ways
(trie, linked list, etc.) but I choose to implement as a TSortedCollection
(sorted array of object pointers) for speed and simplicity.
- Code bytes: CS:IP x86 opcodes that execute directly to update the screen.
- Data bytes: DS:SI data that is copied to the screen by code bytes.
Some design decisions that will help you understand this code:
- All deltas are stored in a data structure that is always maintained sorted
from "largest onscreen change of bytes" to the smallest. In other words,
the first entries in the list of deltas are always the ones that change the
most amount of pixels on the screen, and then entries at the end of the list
change the fewest. This automatic, enforced sorting helps to simplify some
decision-making logic when optimizing and encoding deltas to x86 code. (This
will help you understand why some processing occurs in specific directions
up or down the list.)
- Source was kept 16-bit real-mode DOS compatible so that the option of
compiling an animation directly on the target hardware, using measurements
taken directly from the hardware, was kept open as a future option. This
would allow proper encoding/timing on non-8088 non-CGA architectures, such as
286+VGA. (The compiler currently only counts cycles for an 8088+CGA target.)
My sincere apologies for this -- someday I or someone else will convert it
to a modern 64-bit environment...
- 16-bit DOS architecture uses segmented pointers. This code enforces and
relies on normalized pointers (where offset is 0) to reduce required logic
and increase processing speed.
- ALL optimizations are for size. The only case where this has a guaranteed
speed benefit is in run handling, as runs are replayed with REP MOVS.
To deal with varying input types, the compiler and muxer are controlled via an
input script with the following syntax:
# any line beginning with a hash is a comment
so are lines that begin with one or more spaces, or are blank
Comments MUST be on lines by themselves
sourcefps=xx.xx input playback rate
encodefps=xx.xx forced playback rate; meant for creating 60fps output
waveinput=filename.wav location of 8-bit PCM audio data
!IMPORTANT! If an audio file is supplied, it means
we are not encoding to individual files, but creating
a full video+audio muxed file. The output filename
will be the same as the input script + '.xdv'
debug=x debug level in log. 0=none; 1-3 increases verbosity
screenmode=xxx Screen mode of next frame. 160, 320, 640 for now.
(for the next three, a value of "on" will turn the
feature on without changing the value, and a
numeric value will set the feature's value.)
shavepixels=xxx enables/disables pixel shaving
shavedeltas=xxx enables/disables delta shaving
combinedeltas=xxx enables/disables delta combination (slow!)
...anything else (like filename.ext) will be interpreted as a frame to load.
The above is not a complete description of the script directives; consult
the official documentation at x86dc.wordpress.com for the full list.
}
uses
support,objects,memory,dos,
xdc_globals,xdc_log,xdc_common,xdc_deltas,xdc_codegeneration,
rteh; {more descriptive runtime error messages}
const
drift:real=1.0;
frameIntegrity:byte=75;
muxing:boolean=false; {if muxing, we're creating a full video+audio file}
padAlign=secSize; {align payloads to sector sizes}
totalDumpedBytes:longint=0;
totalDumpedDeltas:longint=0;
totalEncodedBytes:longint=0;
maxEncodedBytes:longint=0;
numEncodedFrames:longint=0;
numEncodedRuns:longint=0;
numEncodedSlices:longint=0;
numUniqueFrames:longint=0;
leastCyclesRemaining:longint=$FFFFFFF;
maxEncBufSize=10*1024;
var
deltas:PDeltas;
{encoding buffers and structures:}
opcodebuf,databuf,picbuf,combinedBuf:pointer;
opcodeBufSize,dataBufSize:word;
muxf,wavf:file;
tmpHeader:XDVHeader;
abuf:pointer;
eticks:longint;
old1cint:pointer;
procedure etickint; interrupt;
begin
inc(eticks);
end;
Procedure PrintHelp;
begin
asm
push ds
jmp @start
@message:
db 'Usage: xdc_comp [input script]',0dh,0ah
db '$'
@start:
mov ax,0900h
lea dx,@message
mov bx,cs
mov ds,bx
int 21h
pop ds
end;
halt(255);
end;
procedure initCompiler;
begin
if paramcount=0 then PrintHelp;
openLogging(basename(paramstr(1))+'.LOG');
stdout(inttostr(memavail)+ ' bytes free');
nextframe:=memAllocSeg(screenRAMsize);
prevframe:=memAllocSeg(screenRAMsize);
fillchar(nextframe^,screenRAMsize,0);
fillchar(prevframe^,screenRAMsize,0);
{extremely important that both segments are normalized. If not, our speed
optimizations and assumptions will break the code.}
if (word(nextframe)<>0) or (word(prevframe)<>0)
then fatalerr('frame pointers not normalized');
{$IFDEF HP95LX}
CGARAM:=ptr($b000,0);
{$ELSE}
CGARAM:=ptr($b800,0);
{$ENDIF}
{initialize our delta data structure and allow duplicates}
deltas:=New(PDeltas,Init(1024,32));
deltas^.Duplicates:=true;
opcodeBuf:=memAllocSeg(maxEncBufSize);
dataBuf:=memAllocSeg(maxEncBufSize);
packetIndex:=memAllocSeg(maxBuf);
fillchar(packetindex^,maxBuf,0);
fillchar(tmpHeader,sizeof(tmpHeader),0);
{set up execution time logging}
eticks:=0;
GetIntVec($1C,old1cInt);
SetIntVec($1C,Addr(etickint));
end;
procedure doneCompiler;
var
numwritten,avgvid,avgpkt,maxpkt,w,w2:word;
l,maxl,avgl:longint;
s:string;
begin
{stop counting ticks}
SetIntVec($01C,old1cint);
maxpkt:=1*512;
stdout('Processed '
+inttostr(totalDumpedBytes)+' screen changes ('
+inttostr(totalDumpedDeltas)+' deltas) over '
+inttostr(numEncodedFrames)+' frames');
if numEncodedFrames<>0 then stdout('('+inttostr(numUniqueFrames)+' unique frames; effective framerate: '
+realtostr((numUniqueFrames/numEncodedFrames) * sourcefps)+' fps) (source was '+realtostr(sourcefps)+')');
stdout('Distribution of screen changes:');
l:=numEncodedRuns+numEncodedSlices;
stdout(' '+realtostr((numEncodedSlices / l) * 100)+'% slices');
stdout(' '+realtostr((numEncodedRuns / l) * 100)+'% runs');
stdout('Total compiled size: '+inttostr(totalEncodedBytes));
if totalDumpedBytes<>0 then begin
avgvid:=totalEncodedBytes div numEncodedFrames;
{determine average packet size in KB}
l:=0;
for w:=0 to numEncodedFrames-1 do inc(l,packetIndex^[w]);
l:=(l div numEncodedFrames) div 2;
avgpkt:=l;
{detemine average bitrate in KB}
avgl:=0;
for w:=0 to numEncodedFrames-1 do inc(avgl,packetIndex^[w] * 512);
avgl:=trunc((avgl / numEncodedFrames) * sourcefps);
avgl:=avgl div 1024;
{determine maximum packet size}
maxpkt:=0;
for w:=0 to numEncodedFrames-1 do
if maxpkt<packetIndex^[w] then maxpkt:=packetIndex^[w];
maxpkt:=maxpkt * 512;
{detemine max bitrate in KB}
maxl:=round(maxpkt * sourcefps) div 1024;
stdout('Expansion factor of changed bytes: '+realtostr(totalEncodedBytes/totalDumpedBytes)+':1');
stdout('Average compiled video frame: '+inttostr(avgvid));
stdout('Maximum compiled video frame: '+inttostr(maxEncodedBytes));
stdout('Least amount of remaining cycles per frame: '+inttostr(leastCyclesRemaining));
if numEncodedFrames>round(sourcefps*4) then begin
s:='Average disk I/O: '+inttostr(avgl)+' KB/s';
stdout(s);
s:='Maximum disk I/O: '+inttostr(maxl)+' KB/s';
stdout(s);
end;
end;
if muxing and (numEncodedFrames<>0) then begin
{write packet index to end of muxed stream}
blockwrite(muxf,packetIndex^,numEncodedFrames,numwritten);
if numwritten<>numEncodedFrames then stderr('disk error writing index; stream unusable');
seek(muxf,4); {location of numPackets}
blockwrite(muxf,numEncodedFrames,2);
{write largestPacket right after it}
blockwrite(muxf,maxpkt,2);
close(muxf);
close(wavf);
freemem(abuf,tmpHeader.achunksize);
end;
stdout('Closing down...');
dispose(deltas,done);
freemem(nextframe,screenRAMsize);
freemem(prevframe,screenRAMsize);
freemem(opcodeBuf,maxEncBufSize);
freemem(dataBuf,maxEncBufSize);
freemem(packetIndex,maxBuf);
stdout(inttostr(memavail)+ ' bytes free');
stdout(realtostr(eticks / (14.31818/12/65536*1000000))+' seconds elapsed');
closeLogging;
asm
mov ax,0003
int 10h
end;
end;
procedure loadbmp(s:string);
{VERY HACKY, might replace with proper library later}
type
tbmhead1=record
filetype:word;
filesize:longint;
reserved1:word;
resevved2:word;
BitmapOffset:longint;
end;
pbmhead1=^tbmhead1;
tbmhead2=record
headerSize:longint;
widthInPixels:longint;
heightInPixels:longint;
planes:word;
bitsPerPixel:word;
compmethod:longint;
rawsize:longint;
horRes:longint;
verRes:longint;
numColors:longint;
numImportantColors:longint;
end;
pbmhead2=^tbmhead2;
var
f:file;
header1:pbmhead1;
header2:pbmhead2;
sl,destbank0,destbank1:pointer;
w:word;
src_slwidth,dst_slwidth:word;
x:byte;
sa:array[0..160-1] of byte;
da:array[0..80-1] of byte;
fs:word;
lp:longint;
begin
fillchar(nextFrame^,screenRAMsize,0);
assign(f,s);
reset(f,1);
fs:=filesize(f);
picbuf:=memAllocSeg(fs);
blockread(f,picbuf^,fs,w);
close(f);
{Is this a CGADUMP file produced by cgaart? If so, process, otherwise
try to load a windows .BMP bitmap file.}
if (pos('.cga',s)<>0) or (pos('.CGA',s)<>0) then begin
sl:=picbuf;
inc(longint(sl),16252-16192-2);
move(sl^,nextframe^,16192);
end else begin
{start trying to make sense of the .bmp}
header1:=picbuf;
{Valid bitmap file?}
if header1^.filetype<>$4d42 {"BM" little-endian}
then fatalerr(s+' not a Windows bitmap file?');
if header1^.filesize > maxbuf
then fatalerr('Bitmap file too large for us to process');
{now start checking out secondary header}
header2:=pbmhead2(header1);
inc(word(header2),14);
{look for more error conditions}
if header2^.headersize<>40
then fatalerr(s+' not a Windows bitmap file?');
if header2^.compMethod<>0
then fatalerr('Bitmap must be uncompressed; this bitmap uses method '+inttostr(header2^.compMethod));
case header2^.widthInPixels of
640:begin
{make sure this is a 640x200 pic with 1-bit pixels}
if header2^.bitsPerPixel<>1
then fatalerr('640x200 bitmap uses '+inttostr(header2^.bitsPerPixel)+'-bit pixels; need 1-bit');
if header2^.heightInPixels<>200
then fatalerr('640x bitmap has '+inttostr(header2^.heightInPixels)+' lines; need 200');
if header2^.rawsize<>16000
then fatalerr('Bitmap is too large for CGA; size='+inttostr(header2^.rawsize));
sl:=pointer(header2);
inc(word(sl),40); {skip second header}
move(sl^,lp,4);
inc(word(sl),header2^.numColors*4); {skip palette N entries in RGBA format}
destbank0:=nextFrame;
destbank1:=nextFrame;
destbank1:=addptr(destbank1,8192);
src_slwidth:=header2^.widthInPixels div (8 div header2^.bitsPerPixel);
dst_slwidth:=src_slwidth;
destbank0:=addptr(destbank0,8000-dst_slwidth);
destbank1:=addptr(destbank1,8000-dst_slwidth);
for w:=0 to 100-1 do begin
move(sl^,destbank1^,dst_slwidth);
sl:=addptr(sl,src_slwidth);
destbank1:=subptr(destbank1,dst_slwidth);
move(sl^,destbank0^,dst_slwidth);
sl:=addptr(sl,src_slwidth);
destbank0:=subptr(destbank0,dst_slwidth);
end;
{different programs create B&W files differently -- we are assuming
palette is 0,0,0 then 255,255,255. If B&W palette is actually swapped,
swap the bits in the buffer:}
if lp<>0 then asm
les di,nextFrame
mov cx,screenRAMsize
shr cx,1
push ds
push es
pop ds
mov si,di
@loopit:
lodsw
xor ax,$FFFF
stosw
loop @loopit
pop ds
end;
end;
160:begin
{make sure this is a 160x200 pic with 8-bit pixels (because it is
surprisingly difficult to save a 16-color picture as 4-bit with a
custom palette using modern tools -- I guess they all removed that
functionality}
if header2^.bitsPerPixel<>8
then fatalerr('Desired 160x200 bitmap uses '+inttostr(header2^.bitsPerPixel)+'-bit pixels; need 8-bit');
sl:=pointer(header2);
inc(word(sl),40); {skip second header}
inc(word(sl),header2^.numColors*4); {skip palette N entries in RGBA format}
destbank0:=nextFrame;
destbank1:=nextFrame;
destbank1:=addptr(destbank1,8192);
src_slwidth:=header2^.widthInPixels div (8 div header2^.bitsPerPixel);
dst_slwidth:=80;
destbank0:=addptr(destbank0,8000-dst_slwidth);
destbank1:=addptr(destbank1,8000-dst_slwidth);
for w:=0 to 100-1 do begin
{convert 8-bit values to 4-bit in-place}
move(sl^,sa,sizeof(sa));
for x:=0 to 80-1 do
da[x]:=(sa[x*2] shl 4) OR sa[(x*2)+1];
move(da,destbank1^,dst_slwidth);
sl:=addptr(sl,src_slwidth);
destbank1:=subptr(destbank1,dst_slwidth);
if header2^.heightInPixels>100 then begin
move(sl^,sa,sizeof(sa));
for x:=0 to 80-1 do
da[x]:=(sa[x*2] shl 4) OR sa[(x*2)+1];
move(da,destbank0^,dst_slwidth);
sl:=addptr(sl,src_slwidth);
destbank0:=subptr(destbank0,dst_slwidth);
end;
end;
end;
240:begin
{make sure this is a 240x128 pic with 1-bit pixels}
if header2^.bitsPerPixel<>1
then fatalerr('240x128 bitmap uses '+inttostr(header2^.bitsPerPixel)+'-bit pixels; need 1-bit');
if header2^.heightInPixels<>128
then fatalerr('240x bitmap has '+inttostr(header2^.heightInPixels)+' lines; need 128');
if header2^.rawsize<>4096
then fatalerr('Bitmap is too large for 95LX; size='+inttostr(header2^.rawsize));
sl:=pointer(header2);
inc(word(sl),40); {skip second header}
move(sl^,lp,4);
inc(word(sl),header2^.numColors*4); {skip palette N entries in RGBA format}
destbank0:=nextFrame;
src_slwidth:=32;
dst_slwidth:=screenbytewidth;
destbank0:=addptr(destbank0,3840-dst_slwidth);
for w:=0 to screenbyteheight-1 do begin
move(sl^,destbank0^,dst_slwidth);
sl:=addptr(sl,src_slwidth);
destbank0:=subptr(destbank0,dst_slwidth);
end;
if lp=0 then asm { LCD is the inverse of CRT }
les di,nextFrame
mov cx,screenRAMsize
shr cx,1
push ds
push es
pop ds
mov si,di
@loopit:
lodsw
xor ax,$FFFF
stosw
loop @loopit
pop ds
end;
end;
else
fatalerr('Not a resolution we handle (yet)');
end;
end;
{if debug>0 then move(nextframe^,cgaram^,screenRAMsize);}
freemem(picbuf,fs);
end;
procedure findDeltas;
{
Compare nextframe to prevframe, determine where the delta slices are, and
insert that information into a memory structure that we can more easily
manipulate and optimize.
Implemented with a mixture of Turbo Pascal and assembler. THIS CODE IS
INCOMPATIBLE with other compilers and architectures because I am taking
advantage of knowing my pointers are normalized. If you port this to other
platforms, you'll have to rewrite this procedure!
}
var
foundStart,foundEnd,endbuf,newstart,lastfound:word;
foosrc,foodest:pointer;
begin
endbuf:=screenRAMsize+1; {to ensure we compare the last byte in the buffer}
foundstart:=0;
lastfound:=$ffff;
foundend:=0;
newstart:=0;
repeat
asm
push ds
cld
les di,nextframe {set up comparison buffer}
mov di,newstart
mov cx,endbuf {number of bytes to check}
lds si,prevframe {set up comparison buffer}
mov si,di {point to byte past last range end}
repe cmpsb {repeat while equal}
jcxz @done {if entire buffer equal, abort}
mov foundStart,si
dec foundStart {save where first nonmatch was found}
repne cmpsb {repeat while NOT equal}
mov newstart,si {record where we should resume search}
dec si {-1 to adjust landing position}
dec si {-1 to reflect actual last diff pos}
mov foundEnd,si {save where last nonmatch was found}
@done:
mov endbuf,cx
pop ds
end;
{lastfound<>foundstart is to prevent a bug in the asm
from creating a duplicate on the very last match -- need to
fix this properly someday!}
if (foundstart=0) and (foundend=0) and (endbuf=0)
then stdout('frame is identical to previous')
else if lastfound<>foundstart then begin
deltas^.Insert(new(pDelta,init(foundStart,foundEnd)));
lastfound:=foundstart;
end;
until endbuf=0;
end;
procedure encodeDeltas;
{
Runs through the delta list and encodes it as x86 opcodes, counting cycles
as well as output bytes to ensure we don't exceed our maximum draw time
or maximum storage I/O time.
As each delta is committed to code, it is deleted from the list.
Workflow:
prepare for encoding (ie. zero out buffers and counters)
encode to multiple targets
pick best target based on user preference
do we have free cycles and bytes in our pools to commit this?
if so, commit:
save to buffer, dec. cycles and byte pools, remove delta from list,
update cached registers, write to log
if not, write to log that we skipped a delta
repeat until entire list processed
}
var
cyclePool:real;
bytePool:integer;
tmpe:TEncodeTarget;
listPos:word;
dtmp:pdelta;
opcodeBufPos,dataBufPos:pointer;
bp:^byte;
procedure commitDelta(pd:pdelta;et:tEncodeTarget);
{Copy opcodes/data to respective buffers; adjust bandwidth pools appropriately}
var
p:pointer;
begin
move(et.codes,opcodeBufPos^,et.numOpcodeBytes);
inc(word(opcodeBufPos),et.numOpcodeBytes);
inc(opcodeBufSize,et.numOpcodeBytes);
{if we have opportunity to cache AL, take it}
if et.modifiedAL then lastAL:=et.AL;
if et.numDataBytes>0 then begin
p:=nextframe;
word(p):=pd^.startofs;
move(p^,databufpos^,et.numDataBytes);
inc(word(databufpos),et.numDataBytes);
inc(dataBufSize,et.numDataBytes);
end;
cyclePool:=cyclePool-et.totalCycles;
bytePool:=bytePool-et.totalBytes;
if (cyclepool<0) or (bytepool<0)
then fatalerr('Pools overcommitted; bug in calculations?');
if debug>2
then stdout('Committed '+deltainfo(pd)
+', cycle pool now '+realtostr(cyclepool)
+', byte pool now '+inttostr(bytepool));
{maintain info variables}
if leastCyclesRemaining>round(cyclePool)
then leastCyclesRemaining:=round(cyclePool);
if maxEncodedBytes < (maxVChunk-bytepool)
then maxEncodedBytes:=maxVChunk-bytepool;
inc(totalDumpedBytes,pd^.blength);
inc(totalDumpedDeltas);
if pd^.dtype=slice
then inc(numEncodedSlices)
else inc(numEncodedRuns);
deltas^.free(pd);
end;
begin
cyclePool:=cyclesPerFrame;
bytePool:=maxVChunk-frameByteOverhead;
opcodeBufSize:=0; dataBufSize:=0;
opcodeBufPos:=opcodebuf; dataBufPos:=databuf;
listPos:=0;
lastAL:=0;
tmpe.modifiedAL:=true;
{prep opcode buffer and encoding structure(s)}
fillchar(opcodebuf^,maxEncBufSize,0);
{inject frame header}
move(frameHeaderCode,opcodeBufPos^,frameheaderLen);
inc(word(opcodeBufPos),frameHeaderLen);
inc(opcodeBufSize,frameHeaderLen);
{Scan delta list top to bottom, so that we can encode the larger deltas first. If we
have an empty delta list (ie. nextframe=prevframe), this will do nothing.}
ALcached:=false;
while (deltas^.count>0) and (listPos<deltas^.Count) do begin
dtmp:=pdelta(deltas^.at(listPos));
encodeDelta(dtmp,tmpe);
if (tmpe.totalBytes <= bytePool)
and (tmpe.totalCycles < cyclePool)
then commitDelta(dtmp,tmpe)
else inc(listPos);
end;
{inject frame footer. Requires normalized pointers.}
move(frameFooterCode,opcodebufPos^,frameFooterLen);
inc(word(opcodeBufPos),frameFooterLen);
inc(opcodeBufSize,frameFooterLen);
if (word(opcodeBufPos)<>opcodeBufSize) then fatalerr('opcode pointer/size mismatch');
if (word(dataBufPos)<>dataBufSize) then fatalerr('data pointer/size mismatch');
inc(numEncodedFrames);
end;
function sliceEncodedSize(sofs,eofs:word):word;
{Function for estimating how large, in code+data, it will take to
encode a slice. This is used during the optimization stage to make
intelligent decisions about when it makes sense to combine deltas.
Should this be REPLACED at a later date by the function that ACTUALLY encodes
deltas to code? For accuracy, yes. For speed, no. In testing with actual
code generation to guage the size, the combine phase nearly ground to a halt.}
var
singlestore,repstore,blen:word;
begin
blen:=1+(eofs-sofs);
case blen of
1,2:singlestore:=3+1+blen;
3,4:singlestore:=3+2+blen;
5,6:singlestore:=3+3+blen;
else
singlestore:=$ffff; {we don't encode this form for >6}
end;
repstore:=REPMOVSbyteCost+blen;
if singlestore < repstore
then sliceEncodedSize:=singlestore
else sliceEncodedSize:=repstore;
end;
procedure optimizeDeltas;
{
We optimize the delta list for size primarily and speed secondarily.
(Speedups are 99% a side-effect of optimizing for size.) Run through the
delta list in multiple passes and phases to:
- Combine multiple slices if they are within X distance of each other, because
the setup for a rep movs is 4 code bytes and 4 data bytes. 4+4 means, if next
slice is 8 or less bytes away, just combine it with the current slice. Frame
514 of the diff_preview_cga.avs shows why this is necessary.
- Extract runs out of long slices. Runs are faster to "replay" then slices,
and are incredibly compact which saves us disk I/O. (note to self: Make sure
to check thresholds to ensure run is actually a win. Consider runs "immune"
from subsequent split/join operations.)
- Truncate any delta that exceeds our maximum slice or run replay time into
one that is just under the limit, and its remainder.
- If requested by the user, eliminate single-pixel changes and very small
deltas to reduce the amount of data we need to encode, at a small visual
quality penalty. (We refer to this as "pixel shaving" and "delta shaving".)
}
var
maxFrameCycles:real;
maxFrameBytes:word;
procedure FindHiddenRuns;
{Identify runs inside deltas, and split runs out of them.
This is a *massive* size win and should never be skipped.}
var
runVal:byte;
runCount,runStartOfs,runEndOfs:word;
didx:integer;
w:word;
bp:^byte;
changed:boolean;
dtmp,dtmp1,dtmp2,dtmp3:pdelta;
procedure resetRunCounting;
begin
runVal:=bp^;
runCount:=1;
runstartofs:=word(bp);
runendofs:=runstartofs;
end;
begin
if debug>1 then stdout('Identifying runs');
repeat
changed:=false;
for didx:=0 to deltas^.count-1 do begin
dtmp:=pdelta(deltas^.at(didx));
{if already a run, or length doesn't qualify for being one, skip it}
if dtmp^.dtype=run then continue;
if dtmp^.blength<minRunLength then continue;
{examine slice to see if there is a viable run inside it}
bp:=nextframe;
word(bp):=dtmp^.startofs;
resetRunCounting;
{examine every byte of the slice, counting # of repeated bytes. Every
time byte value changes, check the count of the previous run and split
if a viable run is found.}
for w:=dtmp^.startofs to dtmp^.endofs+1 {+1 handle sr cases} do begin
word(bp):=w;
if bp^=runVal then begin
runendofs:=w;
runcount:=runendofs-runstartofs+1;
end else begin
if runCount>=minRunLength then begin
{create delta containing only the run, then create one or two
deltas with the remaining slice information. Need to handle
three cases where run could be hiding: rs, srs, sr}
if debug>3 then stdout('Creating a run:');
if runstartofs=dtmp^.startofs {rs} then begin
dtmp1:=new(pdelta,init(dtmp^.startofs,runendofs));
if dtmp1^.dtype<>run then fatalerr('Run was not entered as run?');
dtmp2:=new(pdelta,init(runendofs+1,dtmp^.endofs));
deltas^.insert(dtmp1);
deltas^.insert(dtmp2);
deltas^.free(dtmp);
end else if runendofs=dtmp^.endofs {sr} then begin
dtmp1:=new(pdelta,init(dtmp^.startofs,runstartofs-1));
dtmp2:=new(pdelta,init(runstartofs,dtmp^.endofs));
if dtmp2^.dtype<>run then fatalerr('Run was not entered as run?');
deltas^.insert(dtmp1);
deltas^.insert(dtmp2);
deltas^.free(dtmp);
end else {srs} begin
dtmp1:=new(pdelta,init(dtmp^.startofs,runstartofs-1));
dtmp2:=new(pdelta,init(runstartofs,runendofs));
if dtmp2^.dtype<>run then fatalerr('Run was not entered as run?');
dtmp3:=new(pdelta,init(runendofs+1,dtmp^.endofs));
deltas^.insert(dtmp1);
deltas^.insert(dtmp2);
deltas^.insert(dtmp3);
deltas^.free(dtmp);
end;
changed:=true;
break; {out of this delta}
end;
resetRunCounting;
continue;
end;
end;
end;
until not changed;
end;
procedure SubdivideOversizeDeltas;
{break apart deltas that are too large for our available cycles.
Mark these "optimal size" deltas as "frozen".
Pay attention to run cycles vs. slice cycles.}
var
didx:integer;
changed:boolean;
dtmp,dtmp1,dtmp2,dtmp3:pdelta;
begin
repeat
changed:=false;
for didx:=0 to deltas^.count-1 do begin
dtmp:=pdelta(deltas^.at(didx));
if dtmp^.REPcycleCost>maxFrameCycles then begin
stdout('truncating oversize delta :'+deltaInfo(dtmp));
{create new delta with max length}
{determine max length in bytes; round down and make even}
if dtmp^.dtype=slice
then maxFrameBytes:=trunc(maxFrameCycles/REPMOVSWcycleCost)*2
else maxFrameBytes:=trunc(maxFrameCycles/REPSTOSWcycleCost)*2;
if maxFrameBytes>maxVChunk-frameByteOverhead-REPSTOSbyteCost-1
then maxFrameBytes:=maxVChunk-frameByteOverhead-REPSTOSbyteCost-1;
{round down to even number so that gigantic slices can REP MOVSW}
{Actually, we're not using this, but will later if we ever decide
to target 8086+}
{maxFrameBytes:=maxFrameBytes AND $FFFE;}
{Build two new deltas: one truncated to max, other with remainder.
"freeze" the max one, since there's no reason to process it further.}
dtmp1:=new(pdelta,init(dtmp^.startofs,dtmp^.startofs+maxFrameBytes-1));
dtmp1^.frozen:=true;
dtmp2:=new(pdelta,init(dtmp^.startofs+maxFrameBytes,dtmp^.endofs));
stdout('replacing with '+deltaInfo(dtmp1)+' and '+deltaInfo(dtmp2));
deltas^.insert(dtmp1);
deltas^.insert(dtmp2);
deltas^.free(dtmp);
changed:=true;
end;
end;
until not changed;
end;
procedure ShaveTinyDeltas;
{shave deltas or pixels. We do this before the combination step so that we
have less "stragglers" to compare and combine.}
{Because shaving will eventually leave "trails" behind, we opt not to do it
once every second. (Future enhancement: Allow user to specify keyframe
interval.)}
var
didx:integer;
dtmp:pdelta;
begin
if (numEncodedFrames mod round(sourcefps))<>0 then begin
{delta shaving:
We work from the end of the collection to the beginning, because
removing a delta shrinks the total number of items in the collection,
and the end has the sorted deltas we care about}
if shaveDeltas then begin
if debug>2 then stdout('Shaving deltas:');
for didx:=deltas^.count-1 downto 0 do begin
dtmp:=pdelta(deltas^.at(didx));
{remove any deltas with pixels under the threshold}
if (dtmp^.blength<shaveDeltaMinimum) then begin
if debug>1 then stdout('Shaving delta '+inttostr(didx)+deltaInfo(dtmp));
deltas^.AtFree(didx);
end
else break; {no point continuing if everything above us >1 byte}
end;
end;
{pixel shaving:
{check if shaving deltas first, since if we are, there is no point to
shaving pixels because there won't be any 1-byte deltas in the list.
# of changed pixels is calculated on insert into the list, not here.}
if shavePixels and not shaveDeltas then begin
if debug>2 then stdout('Shaving pixels:');
for didx:=deltas^.count-1 downto 0 do begin
dtmp:=pdelta(deltas^.at(didx));
{remove any deltas with pixels under the threshold}
if (dtmp^.blength=1) then begin
if (dtmp^.numPixelsChanged<shavePixelMinimum) then begin
if debug>2 then stdout('Shaving pixel delta '+inttostr(didx)+deltaInfo(dtmp));
deltas^.AtFree(didx);
end;
end
else break; {no point continuing if everything above us >1 byte}
end;
end;
end;
end;
procedure ConcatenateDeltas;
{concatenate geographically-close slices together:
We combine slices to save space in the code generation step. Meaning,
if it takes 5 bytes to plot a single byte, and a single byte is 5
or less bytes away from some other slice, we expand that slice to include
the single pixel. Same goes for slices: Since slice setup code is 7 bytes,
if we have two slices 7 or less bytes away from each other, it makes
sense to combine them. The end result changes more pixels onscreen, but
it takes less space and executes quicker.
Ignore frozen deltas, which are frozen for a good reason (already optimal).
Don't combine something if the result will exceed maximum frame time.
To greatly cut down on the search space, we transfer deltas into a data
structure that maps deltas near each other geographically so that they are
easier to navigate.}
var
didx:integer;
w:word;
dstarts:PDeltaStarts;
dcand1,dcand2,dcomb:pdelta;
begin
maxFrameBytes:=trunc(maxFrameCycles/REPMOVSWcycleCost)*2;
if maxFrameBytes>maxVChunk-frameByteOverhead-REPMOVSbyteCost-1
then maxFrameBytes:=maxVChunk-frameByteOverhead-REPMOVSbyteCost-1;
if combineDeltas and (deltas^.count>4) then begin
{copy deltas into a data structure that sorts by start offset so that
we can traverse deltas in spatial order}
dstarts:=new(PDeltaStarts,init(1024,32));
for w:=0 to deltas^.count-1 do dStarts^.insert(deltas^.at(w));
{we make assumptions based on everything being in the right
order, so we had better sanity-check our structure:}
for w:=0 to dstarts^.count-2 do begin
if pDelta(dstarts^.at(w))^.endofs >= pDelta(dstarts^.at(w+1))^.startofs
then fatalerr('Sorted structure not sorted?');
end;
didx:=0;
while (didx < dstarts^.count-2) do begin
dcand1:=dstarts^.at(didx);
if (dcand1^.dtype=run) or (dcand1^.frozen) then begin
inc(didx);
continue;
end;
dcand2:=dstarts^.at(didx+1);
if (dcand2^.dtype=run) or (dcand2^.frozen) then begin
inc(didx);
continue;
end;
{Early termination test thanks to spatial ordering. Slices must be
within certain distance to be combined; if we aren't worth combining
here, then we won't be for the rest of them either. Break and advance.}
if (dcand2^.startofs-dcand1^.endofs > maxCombineDistance) then begin
inc(didx);
continue;
end;
{sanity checks}
if (dcand1^.startofs=dcand2^.startofs) or (dcand1=dcand2) then begin
stderr('Attempted to compare duplicates: ');
stderr(deltainfo(dcand1)+' and '+deltainfo(dcand2));
fatalerr('Faith in programmer and/or data structure lost');
end;
{combination acceptance conditions:}
{there must be actual savings to combine them}
if (sliceEncodedSize(dcand1^.startofs,dcand2^.endofs)
< sliceEncodedSize(dcand1^.startofs,dcand1^.endofs)+sliceEncodedSize(dcand2^.startofs,dcand2^.endofs))
{is combined slice within maximum limits?}
and (dcand1^.blength+dcand2^.blength < maxFrameBytes)
then begin
if debug>1 then stdout('Combining '+deltaInfo(dcand1) +' and '+deltaInfo(dcand2));
{make a combined delta}
dcomb:=new(pdelta,init(dcand1^.startofs,dcand2^.endofs));
{remove old deltas from temp structure}
dstarts^.delete(dcand1);
dstarts^.delete(dcand2);
{destroy old deltas permanently}
deltas^.free(dcand1);
deltas^.free(dcand2);
{insert new combined delta into world and temp structure}
dstarts^.insert(dcomb);
deltas^.insert(dcomb);
end else begin
{not a candidate -- keep going}
inc(didx);
end;
end;
{empty out temp structure without harming delta objects}
dstarts^.deleteall;
dispose(dstarts,done);
end;
end;
begin
if debug>1 then stdout('Optimizing delta list');
{Deterine maximum cycle count for a frame we must not exceed. Include
a 1% margin to account for the setup code.}
maxFrameCycles:=cyclesPerFrame * 0.99;
FindHiddenRuns;
SubdivideOversizeDeltas;
ShaveTinyDeltas;
ConcatenateDeltas;
FindHiddenRuns;
ShaveTinyDeltas;
end;
procedure saveDeltas(fname:string);
{translate our opcode and data buffers into a single code segment, then
combine with audio data and dump to disk.
Align to sector sizes. Update packetIndex.
This routine assumes normalized pointers!}
var
outf:file;
pw:^word;
p:pointer;
numread,numwritten:word;
packetSize:word;
begin