# forked from slothy-optimizer/pqmx
# -
# Notifications
# You must be signed in to change notification settings - Fork 0
# /
# Copy path: evaluation.py
# 724 lines (566 loc) · 28.4 KB
# /
# evaluation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
# Copyright (c) 2021 Arm Limited
# SPDX-License-Identifier: MIT
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import core
import core.asm
import core.rw
import core.regs
import mve
import mve.regs
import mve.rw
class Toom4EvaluationInput():
    """Readers and writers for one input polynomial of a Toom-4 evaluation.

    For each of the four limbs a reader object is created, and for each
    evaluation point present in `evaluations` a writer object is created.

    Attributes set by the constructor:
      asm                 -- the allocator/base object passed as `base`
      sources             -- the `sources` argument, stored unchanged
      evaluations         -- the `evaluations` argument, stored unchanged
      access_granularity  -- fixed to 32
      access_multiplier   -- access_granularity divided by 16, i.e. 2
      reader              -- dict limb index (0..3) -> reader object
      writer              -- dict evaluation point -> writer object
    """

    def __init__(self, base, sources, evaluations, striding=False):
        """Build the limb readers and the evaluation writers.

        base        -- object handed to every mve.rw reader/writer
        sources     -- indexable by limb index 0..3 when `striding` is
                       false; only `sources[0]` is used when it is true
        evaluations -- mapping from evaluation point to the object handed
                       to the corresponding writer
        striding    -- selects mve.rw.Stride4VectorLoad readers instead of
                       mve.rw.ReadWriteVector readers
        """
        self.asm = base
        self.sources = sources
        self.evaluations = evaluations

        self.access_granularity = 32
        self.access_multiplier = self.access_granularity // 16

        def make_reader(limb):
            # One reader per limb; the striding variant always reads
            # through sources[0] and is told which limb it represents.
            label = f"limb[{limb}]"
            if striding:
                return mve.rw.Stride4VectorLoad(self.asm, 16,
                                                limb, sources[0], "Q0",
                                                name=label)
            return mve.rw.ReadWriteVector(self.asm, sources[limb],
                                          self.access_granularity,
                                          name=label)

        # Readers are created in limb order 0, 1, 2, 3, then the writers
        # in the iteration order of `evaluations`.
        self.reader = {limb: make_reader(limb) for limb in range(4)}
        self.writer = {
            point: mve.rw.ReadWriteVector(self.asm, target,
                                          self.access_granularity,
                                          name=f"eval[{point}]")
            for point, target in evaluations.items()
        }
class Toom4Evaluation():
    """Generator of MVE assembly for the evaluation step of Toom-4.

    The polynomial a + b X + c X^2 + d X^3 (with a, b, c, d themselves
    polynomials, the "limbs") is evaluated at the seven points
    0, -1/2, +1/2, -1, +1, +2 and infty. All public generator methods
    yield assembly source lines as strings.

    Attributes set by the constructor:
      out_of_place, padding, dim, striding, layout -- constructor arguments
      num_limbs           -- 4
      limbsize            -- dim / num_limbs
      loop                -- False; the low-overhead-loop code paths are
                             only taken if a caller sets this to True
      asm                 -- `base`, or a fresh mve.regs.Allocator()
      inputs              -- list of Toom4EvaluationInput, see add_source()
      access_granularity  -- 32
      access_multiplier   -- access_granularity / 16, i.e. 2
      max_idx             -- limbsize // access_multiplier
      step_size           -- 128 // access_granularity, i.e. 4
    """

    def __init__(self, dim,
                 base=None,
                 padding=False,
                 striding=False,
                 layout="top",
                 out_of_place=False):
        """Store the configuration and derive the index ranges.

        dim          -- total number of coefficients; must be a multiple
                        of 32
        base         -- allocator object to use; if falsy, a new
                        mve.regs.Allocator() is created
        padding      -- if true, evaluation() writes a zero vector in
                        front of every evaluation block
        striding     -- use striding loads for the limbs
        layout       -- one of "top", "bottom", "standard",
                        "karatsuba_x1", "karatsuba_x2"
        out_of_place -- write evaluations to a separate destination buffer

        Raises Exception if `dim` is not a multiple of 32.
        """
        self.out_of_place = out_of_place

        self.num_limbs = 4
        self.limbsize = int(dim / self.num_limbs)
        self.padding = padding
        self.dim = dim
        self.striding = striding
        self.layout = layout
        self.loop = False

        if dim % (self.num_limbs * 8) != 0:
            raise Exception("MVE-based Toom4 only supported for dimensions which are a multiple of 32.")

        if not base:
            self.asm = mve.regs.Allocator()
        else:
            self.asm = base

        self.inputs = []

        self.access_granularity = 32
        self.access_multiplier = self.access_granularity // 16

        self.max_idx = self.limbsize // self.access_multiplier
        self.step_size = 128 // self.access_granularity

    def add_source(self, sources, evaluations):
        """Register one input (limb sources plus evaluation destinations)."""
        new_input = Toom4EvaluationInput(self.asm, sources, evaluations,
                                         striding=self.striding)
        self.inputs.append(new_input)

    # NOTE: disabled legacy layout, kept for reference.
    #
    # def _create_layout_classic(self):
    #     self.dst_increment = False
    #     source_markers = core.markers.ReadWriteMarkers(self.asm,
    #                                                    self.input_size_bytes,
    #                                                    self.access_granularity,
    #                                                    self.src.name())
    #     yield from source_markers.alloc_registers_for_markers()
    #     if self.out_of_place:
    #         dst_markers = core.markers.ReadWriteMarkers(self.asm,
    #                                                     self.num_evals * self.limb_size_bytes,
    #                                                     self.access_granularity,
    #                                                     self.dst.name())
    #         yield from dst_markers.alloc_registers_for_markers()
    #         index_of_evals = { '0'     : 0,
    #                            '-1/2'  : 1,
    #                            '+1/2'  : 2,
    #                            '-1'    : 3,
    #                            '+1'    : 4,
    #                            '+2'    : 5,
    #                            'infty' : 6 }
    #     else:
    #         dst_markers = source_markers
    #         index_of_evals = { '-1/2'  : 1,
    #                            '+1/2'  : 2,
    #                            '-1'    : 4, # Omit 3 to leave eval at infty in-place
    #                            '+1'    : 5,
    #                            '+2'    : 6 }
    #     def limb_offset_in_source(limb_index):
    #         return limb_index * self.limbsize // self.access_multiplier
    #     def eval_offset_in_source(pt):
    #         eval_index = index_of_evals[pt]
    #         return eval_index * self.limbsize // self.access_multiplier
    #     limb_markers = {}
    #     for limb in self.limbs:
    #         limb_markers[limb] = source_markers.get_shifted_marker_list(
    #             limb_offset_in_source(limb))
    #     sources = limb_markers
    #     eval_markers = {}
    #     for pt in self.evals:
    #         if not pt in index_of_evals.keys():
    #             continue
    #         eval_markers[pt] = dst_markers.get_shifted_marker_list(
    #             eval_offset_in_source(pt))
    #     self.eval_markers = eval_markers
    #     self.sources = sources

    def _create_layout_striding_topbottom(self):
        """Set up markers for the "top" / "bottom" striding layouts.

        The first `num_limbs` evaluations (0, -1/2, +1/2, -1) are addressed
        relative to the low base register (destination if out-of-place,
        source otherwise); the remaining ones (+1, +2, infty) relative to a
        second base register that is `shift` limbs away from it.

        Sets self.dst_increment, self.eval_markers and self.sources and
        yields the assembly needed for the setup.

        Raises Exception("Unknown memory layout") if self.layout is
        neither "top" nor "bottom".
        """
        # Validate before any register is allocated, so that an unsupported
        # layout fails with a clear message instead of an unbound local.
        if self.layout == "top":
            shift = self.num_limbs
        elif self.layout == "bottom":
            shift = -1 * (self.num_evals - self.num_limbs)
        else:
            raise Exception("Unknown memory layout")

        self.dst_increment = True

        high_eval_base = core.regs.Reg(self.asm.gprs, str_name="high_eval")
        yield from high_eval_base.alloc()

        high_eval_byte_shift = shift * self.limb_size_bytes

        if self.out_of_place:
            low_eval_base = self.dst
        else:
            low_eval_base = self.src

        yield f"add {high_eval_base.name()}, {low_eval_base.name()}, #{high_eval_byte_shift}"

        low_markers = core.markers.ReadWriteMarkers(self.asm, 0,
                                                    self.access_granularity,
                                                    low_eval_base.name(),
                                                    initial_shift=0)
        high_markers = core.markers.ReadWriteMarkers(self.asm, 0,
                                                     self.access_granularity,
                                                     high_eval_base.name(),
                                                     initial_shift=0)
        yield from low_markers.alloc_registers_for_markers()
        yield from high_markers.alloc_registers_for_markers()

        index_of_evals = {'0': 0,
                          '-1/2': 1,
                          '+1/2': 2,
                          '-1': 3,
                          '+1': 4,
                          '+2': 5,
                          'infty': 6}

        eval_markers = {}
        for pt in self.evals:
            eval_markers[pt] = []
            eval_idx = index_of_evals[pt]
            if eval_idx < self.num_limbs:
                # Low group: num_limbs evaluations interleaved per step.
                stride = self.num_limbs
                markers = low_markers
            else:
                # High group: the remaining evaluations, interleaved among
                # themselves and indexed from 0 again.
                eval_idx -= self.num_limbs
                stride = len(self.evals) - self.num_limbs
                markers = high_markers
            for idx in range(0, self.max_idx, self.step_size):
                new_markers = markers.get_shifted_marker_list(
                    stride * idx + self.step_size * eval_idx,
                    offset_real_idx=idx,
                    size=self.step_size)
                eval_markers[pt] = eval_markers[pt] + new_markers

        self.eval_markers = eval_markers
        self.sources = [self.src.name()]

    def _create_layout_standard(self,
                                padded_limb_size=None,
                                idx_within_limb=None):
        """Set up markers for contiguous evaluation blocks.

        padded_limb_size -- distance between the starts of consecutive
                            evaluation blocks, in the same unit as
                            self.limbsize; defaults to self.limbsize
        idx_within_limb  -- callable mapping an index inside a limb to its
                            position inside the (possibly padded) block;
                            defaults to the identity

        In-place operation only places -1/2, +1/2, -1, +1 and +2 (block 3
        is skipped so that the evaluation at infty stays where limb d is)
        and is not available together with striding.

        Sets self.dst_increment, self.eval_markers and self.sources and
        yields the assembly needed for the setup.

        Raises Exception("Invalid layout") for in-place striding.
        """
        if padded_limb_size is None:
            padded_limb_size = self.limbsize
        if idx_within_limb is None:
            idx_within_limb = lambda x: x

        self.dst_increment = False

        if self.out_of_place:
            dst_base = self.dst
            index_of_evals = {'0': 0,
                              '-1/2': 1,
                              '+1/2': 2,
                              '-1': 3,
                              '+1': 4,
                              '+2': 5,
                              'infty': 6}
        else:
            if self.striding:
                raise Exception("Invalid layout")
            dst_base = self.src
            index_of_evals = {'-1/2': 1,
                              '+1/2': 2,
                              '-1': 4,  # Omit 3 to leave eval at infty in-place
                              '+1': 5,
                              '+2': 6}

        markers = core.markers.ReadWriteMarkers(self.asm,
                                                self.num_evals * 2 * padded_limb_size,
                                                self.access_granularity,
                                                dst_base.name(),
                                                initial_shift=0)
        yield from markers.alloc_registers_for_markers()

        eval_markers = {}
        for pt in self.evals:
            if pt not in index_of_evals:
                continue
            eval_idx = index_of_evals[pt]
            eval_markers[pt] = []
            # idx runs in units of limbsize here; it is converted back to
            # access units when the markers are requested.
            for idx in range(0, self.max_idx * self.access_multiplier,
                             self.step_size * self.access_multiplier):
                offset_within_limb = idx_within_limb(idx)
                offset = eval_idx * padded_limb_size + offset_within_limb
                new_markers = markers.get_shifted_marker_list(
                    offset // self.access_multiplier,
                    offset_real_idx=idx // self.access_multiplier,
                    size=self.step_size // self.access_multiplier)
                eval_markers[pt] = eval_markers[pt] + new_markers

        self.eval_markers = eval_markers

        if self.striding:
            self.sources = [self.src.name()]
        else:
            if self.out_of_place:
                source_markers = core.markers.ReadWriteMarkers(self.asm,
                                                               self.input_size_bytes,
                                                               self.access_granularity,
                                                               self.src.name(),
                                                               initial_shift=0)
                yield from source_markers.alloc_registers_for_markers()
            else:
                # In place: limbs and evaluations share one marker set.
                source_markers = markers

            def limb_offset_in_source(limb_index):
                return limb_index * self.limbsize // self.access_multiplier

            limb_markers = {}
            for limb in self.limbs:
                limb_markers[limb] = source_markers.get_shifted_marker_list(
                    limb_offset_in_source(limb))
            self.sources = limb_markers

    def _create_layout(self):
        """Dispatch to the layout setup selected by self.layout.

        Raises Exception("Unknown memory layout") for unsupported values.
        """
        if self.layout in ["top", "bottom"]:
            yield from self._create_layout_striding_topbottom()
        elif self.layout == "standard":
            yield from self._create_layout_standard()
        elif self.layout == "karatsuba_x1":
            # Each evaluation block is padded to 3/2 of a limb.
            padded_limb_size = 3 * self.limbsize // 2
            yield from self._create_layout_standard(padded_limb_size)
        elif self.layout == "karatsuba_x2":
            # Each evaluation block is padded to 9/4 of a limb, and the
            # upper half of every limb is moved up by a quarter limb.
            padded_limb_size = 9 * self.limbsize // 4

            def idx_within_limb(idx):
                if idx >= self.limbsize // 2:
                    idx += self.limbsize // 4
                return idx

            yield from self._create_layout_standard(padded_limb_size, idx_within_limb)
        else:
            raise Exception("Unknown memory layout")

    def _free_layout(self):
        """Release layout resources (currently a no-op)."""
        # TODO
        return
        # if self.striding == True:
        #     high_eval_base.free()

    def standalone(self, funcname):
        """Yield assembly for a standalone Toom4 evaluation function.

        There are multiple options to consider regarding the layout of the
        limbs, the layout of the evaluations, and how they perhaps overlap.

        The code emitted by this function assumes that the Toom4 limbs --
        that is, the coefficients a,b,c,d of a + b X + c X^2 + d X^3 to
        evaluate -- are stored in a single contiguous buffer in the order
        a,b,c,d (low to high) and that this buffer has space to hold the
        seven evaluations at 0, -1/2, +1/2, -1, +1, +2, infty. It generates
        those evaluations in the stated order, overwriting the previous
        content of the limbs.

        This way, this function can be used as an in-place transformer
        between the original polynomial form and the 'evaluation form'
        (similar to time and frequency domain for the NTT, but with the
        difference that our transformation increases the size.)

        The source pointer is taken from r0; if self.out_of_place is set,
        the destination pointer is taken from r1.

        funcname -- name of the emitted assembly function
        """
        self.src_reg = 'r0'
        if self.out_of_place:
            self.dst_reg = 'r1'
            self.dst = core.regs.Reg(self.asm.gprs, str_name="dst")
            yield from self.dst.alloc(self.dst_reg)

        self.limbs = [0, 1, 2, 3]
        self.evals = ['0', '-1/2', '+1/2', '-1', '+1', '+2', 'infty']
        self.num_evals = len(self.evals)

        # Sizes refer to the buffer holding all evaluations; every
        # coefficient occupies two bytes.
        self.input_size_index = self.num_evals * self.limbsize
        self.limb_size_bytes = 2 * self.limbsize
        self.input_size_bytes = 2 * self.input_size_index

        #
        # Preamble
        #
        yield from core.asm.Snippets.license()
        yield from core.asm.Snippets.autogen_warning()
        yield from core.asm.Snippets.function_decl(funcname)
        yield from core.asm.Snippets.function_header(funcname)
        yield from core.asm.Snippets.save_gprs()
        yield from core.asm.Snippets.save_vregs()

        self.src = core.regs.Reg(self.asm.gprs, str_name="src")
        yield from self.src.alloc(self.src_reg)

        use_hw_loop = self.striding and self.loop
        if use_hw_loop:
            # Initialize counter for low-overhead loop
            self.loop_ctr_lr = core.regs.Reg(self.asm.gprs)
            yield from self.loop_ctr_lr.alloc(reg="r14")
            self.loop_ctr_reg = core.regs.Reg(self.asm.gprs)
            yield from self.loop_ctr_reg.alloc()

        #
        # Setup sources
        #
        yield from self._create_layout()
        self.add_source(self.sources, self.eval_markers)

        yield from self.evaluation()

        self._free_layout()
        if use_hw_loop:
            self.loop_ctr_lr.free()
            self.loop_ctr_reg.free()
        self.src.free()
        if self.out_of_place:
            self.dst.free()

        #
        # Wrapup
        #
        yield from core.asm.Snippets.restore_vregs()
        yield from core.asm.Snippets.restore_gprs()
        yield from core.asm.Snippets.function_footer()

    def evaluation(self):
        """Yield the assembly computing all evaluations of all inputs.

        Every input registered through add_source() is processed in steps
        of self.step_size. Per step, the seven linear combinations of the
        limbs a, b, c, d are formed; the evaluations at -1/2, +1/2 and +2
        are produced multiplied by 8 resp. unscaled as shown in the inline
        comments. The stores of +2, -1/2 and +1/2 are deferred to the next
        step so that the loads for the next step can be issued earlier.

        Requires self.dst_increment to have been set by a layout method
        whenever self.striding is true and an input has a '0' writer.
        """
        tmp = {i: core.regs.Reg(self.asm.vregs, str_name=f"tmp[{i}]")
               for i in range(4)}

        # List of static coefficients we'll need in the algorithm.
        # NOTE(review): coefficient 1 gets a register but is not referenced
        # by any instruction below.
        coeffs = [1, 2, 3, 7]
        yield f"// Prepare GPRs for static coefficients {coeffs} needed during evaluation"
        coeff_gpr = {}
        for c in coeffs:
            coeff_gpr[c] = core.regs.Reg(self.asm.gprs)
            yield from coeff_gpr[c].alloc()
            yield f"mov {coeff_gpr[c].name()}, #{c}"

        # The coefficients a,b,c,d in the evaluations of a + b X + c X^2 + d X^3
        # are themselves polynomials, so for each point of evaluation we need to
        # form a linear combination of polynomials. We evaluate those linear
        # combinations in chunks of 8 coefficients.

        num_inputs = len(self.inputs)

        last_idx = None
        last_dst = None
        inner_iteration = False
        delayed_store = {}

        # Offset (in access units) of the padding vector written in front
        # of every evaluation block.
        pad_idx = -(8 // self.access_multiplier)

        def marker_register_has_changed(reg, shift):
            # Callback for post-incrementing stores: propagate the change
            # of `reg` into every evaluation marker list of every input.
            for registered in self.inputs:
                for marker_list in registered.evaluations.values():
                    core.markers.ReadWriteMarkers.shift_register_in_marker_list(
                        marker_list, reg, int(shift / 4))

        def high_half_vreg(vreg):
            # Allocation constraint: only accept Q4-Q7.
            return vreg in ["Q4", "Q5", "Q6", "Q7"]

        for src_idx, cur_input in enumerate(self.inputs):
            src = cur_input.reader
            dst = cur_input.writer

            for idx in range(0, self.max_idx, self.step_size):

                first_iteration = idx == 0 and src_idx == 0

                # Look ahead at next iteration to allow pre-loading
                # of some limb values. Distinguish future iterations
                # for the same input and the crossover between the
                # two inputs. Handling the latter saves only a few
                # cycles, but why not...
                if idx + self.step_size != self.max_idx:
                    last_iteration = False
                    next_idx = idx + self.step_size
                    next_src = src
                elif src_idx != num_inputs - 1:
                    last_iteration = False
                    next_idx = 0
                    next_src = self.inputs[src_idx + 1].reader
                else:
                    last_iteration = True

                cur_inner_iteration = not first_iteration and not last_iteration
                first_inner_iteration = cur_inner_iteration and not inner_iteration
                inner_iteration = cur_inner_iteration

                if self.striding and self.loop:
                    # if not first_iteration and not last_iteration:
                    #     continue
                    if last_iteration:
                        yield "le lr, loop_start"
                        yield "loop_end:"

                if first_iteration:
                    # In all but the first iteration, we preload the input vectors
                    yield from src[0].load(idx)
                    yield from src[1].load(idx)
                    yield from src[2].load(idx)
                    if self.out_of_place and self.striding:
                        yield from src[3].load(idx, post_increment=True)
                    else:
                        yield from src[3].load(idx)

                if self.striding and self.loop:
                    if inner_iteration:
                        if first_inner_iteration:
                            # Use low-overhead loops
                            yield f"mov {self.loop_ctr_reg.name()}, #{self.max_idx // self.step_size - 2}"
                            yield f"wls lr, {self.loop_ctr_reg.name()}, loop_end"
                            yield "loop_start:"
                        else:
                            # NOTE(review): block structure was reconstructed;
                            # confirm that this `else` belongs to
                            # `if first_inner_iteration`.
                            yield "nop"

                # TODO: Find a better place for this
                if idx == 0 and self.padding:
                    # In the first iteration, add padding in front of the evaluation blocks.
                    yield from tmp[0].alloc()
                    yield f"vmov.u16 {tmp[0].name()}, #0"
                    for pt in ('+1', '-1', '+1/2', '-1/2', '+2', '0', 'infty'):
                        yield from dst[pt].write_from(pad_idx, tmp[0])
                    tmp[0].free()

                # yield "nop"

                # tmp[0] = a + c
                yield from tmp[0].alloc(constraint=high_half_vreg)
                yield f"vadd.u16 {tmp[0].name()}, {src[0].reg(idx)}, {src[2].reg(idx)}"

                # Late store from last iteration
                if not first_iteration:
                    reg = delayed_store.pop('+2')
                    yield from last_dst['+2'].write_from(last_idx, reg)
                    reg.free()

                # In all but the first iteration, we've preloaded all input vectors
                # NOTE(review): b and d were already requested above in the
                # first iteration; left unchanged because the behaviour of a
                # repeated load() is not visible here.
                if first_iteration:
                    yield from src[1].load(idx)
                    yield from src[3].load(idx)

                # tmp[1] = b + d
                yield from tmp[1].alloc(constraint=high_half_vreg)
                yield f"vadd.u16 {tmp[1].name()}, {src[1].reg(idx)}, {src[3].reg(idx)}"

                # Late store from last iteration
                if not first_iteration:
                    reg = delayed_store.pop('+1/2')
                    yield from last_dst['+1/2'].write_from(last_idx, reg)
                    reg.free()

                #
                # Finalize evaluation at -1
                #

                # tmp[2] = eval[-1][idx] = a - b + c - d
                yield from tmp[2].alloc(constraint=high_half_vreg)
                yield f"vsub.u16 {tmp[2].name()}, {tmp[0].name()}, {tmp[1].name()}"

                # Defer storing of evaluation at -1, and instead
                # perform last late store from previous iteration
                if not first_iteration:
                    reg = delayed_store.pop('-1/2')
                    yield from last_dst['-1/2'].write_from(last_idx, reg)
                    reg.free()

                # tmp[0] = (a + c) + 3a = 4a + c
                yield f"vmla.s16 {tmp[0].name()}, {src[0].reg(idx)}, {coeff_gpr[3].name()}"

                # Store evaluation at -1
                yield from dst['-1'].write_from(idx, tmp[2])

                #
                # Finalize evaluation at +1
                #

                # tmp[2] = eval[+1][idx] = a + b + c + d = eval[-1][idx] + 2*(b+d)
                yield f"vmla.s16 {tmp[2].name()}, {tmp[1].name()}, {coeff_gpr[2].name()}"

                # Logically unrelated, but interleave storing of eval[0] = a with the two VADDs
                # to keep memory and arithmetic operations alternating.
                # Omit this if no offset is provided for this index.
                if '0' in dst:
                    if self.striding and self.dst_increment:
                        yield from dst['0'].write_from(idx, src[0].reg(idx),
                                                       post_increment=64,
                                                       post_increment_callback=marker_register_has_changed)
                    else:
                        yield from dst['0'].write_from(idx, src[0].reg(idx))
                yield from src[0].release(idx)

                # tmp[1] = (b + d) + 3b = 4b + d
                yield f"vmla.s16 {tmp[1].name()}, {src[1].reg(idx)}, {coeff_gpr[3].name()}"

                # Logically unrelated, but interleave storing of eval[infty] = d with the vector arithmetic
                # to keep memory and arithmetic operations alternating.
                # Omit this if no offset is provided for this index.
                if 'infty' in dst:
                    yield from dst['infty'].write_from(idx, src[3].reg(idx))

                # ( a + b + c + d ) + b = a + 2*b + c + d
                yield from tmp[3].alloc(constraint=high_half_vreg)
                yield f"vadd.u16 {tmp[3].name()}, {tmp[2].name()}, {src[1].reg(idx)}"
                yield from src[1].release(idx)

                # Store evaluation at +1
                if not self.striding or not self.dst_increment:
                    yield from dst['+1'].write_from(idx, tmp[2].name())
                else:
                    yield from dst['+1'].write_from(idx, tmp[2].name(),
                                                    post_increment=48,
                                                    post_increment_callback=marker_register_has_changed)

                # ( a + 2*b + c + d ) + 3*c = a + 2*b + 4*c + d
                yield f"vmla.s16 {tmp[3].name()}, {src[2].reg(idx)}, {coeff_gpr[3].name()}"
                yield from src[2].release(idx)

                # Continue the running sum under the name tmp[2].
                tmp[2].free()
                yield from tmp[2].alloc(move=tmp[3])

                ### At this point, we're using only src[3], tmp[0-2]
                ### We can start preloading the next four vectors for the next iteration

                #
                # Finalize evaluation at +2
                #

                # ( a + 2*b + 4*c + d ) + 7*d = a + 2*b + 4*c + 8*d
                yield f"vmla.s16 {tmp[2].name()}, {src[3].reg(idx)}, {coeff_gpr[7].name()}"
                yield from src[3].release(idx)

                if not last_iteration:
                    yield from next_src[0].load(next_idx)

                # tmp[0] = 2 * (4a + c) = 8a + 2c
                yield f"vshl.u16 {tmp[0].name()}, {tmp[0].name()}, #1"

                # We defer storing the evaluations at +2, -1/2 and +1/2 to the next
                # iteration and instead use the LSU to preload the input for the
                # next iteration.

                if not last_iteration:
                    yield from next_src[1].load(next_idx)
                    delayed_store['+2'] = core.regs.Reg(self.asm.vregs, str_name="delayed[+2]")
                    yield from delayed_store['+2'].alloc(move=tmp[2])
                else:
                    # Store evaluation at +2
                    yield from dst['+2'].write_from(idx, tmp[2])
                    tmp[2].free()

                #
                # Finalize evaluation at -1/2
                #

                # tmp[3] = (8a + 2c) - (4b + d) = eval[-1/2][idx]
                yield from tmp[3].alloc(constraint=high_half_vreg)
                yield f"vsub.u16 {tmp[3].name()}, {tmp[0].name()}, {tmp[1].name()}"

                if not last_iteration:
                    yield from next_src[2].load(next_idx)
                    delayed_store['-1/2'] = core.regs.Reg(self.asm.vregs, str_name="delayed[-1/2]")
                    yield from delayed_store['-1/2'].alloc(move=tmp[3])
                else:
                    # Store evaluation at -1/2
                    yield from dst['-1/2'].write_from(idx, tmp[3])
                    tmp[3].free()

                #
                # Finalize evaluation at +1/2
                #

                # tmp[0] = (8a + 2c) + (4b + d) = 8a + 4b + 2c + d = eval[1/2][idx]
                yield f"vadd.u16 {tmp[0].name()}, {tmp[0].name()}, {tmp[1].name()}"
                tmp[1].free()

                if not last_iteration:
                    if self.out_of_place and self.striding:
                        yield from next_src[3].load(next_idx, post_increment=True)
                    else:
                        yield from next_src[3].load(next_idx)
                    delayed_store['+1/2'] = core.regs.Reg(self.asm.vregs, str_name="delayed[+1/2]")
                    yield from delayed_store['+1/2'].alloc(move=tmp[0])
                else:
                    # Store evaluation at +1/2
                    yield from dst['+1/2'].write_from(idx, tmp[0])
                    tmp[0].free()

                # If we're not in the last iteration, we're carrying three late-store
                # vector registers maintained in delayed_store into the next iteration.

                yield "// End of iteration"

                last_dst = dst
                last_idx = idx

        for c in coeffs:
            coeff_gpr[c].free()

    def get_code(self):
        """Print every assembly line produced by evaluation()."""
        for asm_line in self.evaluation():
            print(asm_line)