/*
* Copyright (c) 2005-06 James Jacobsson, Adam Johnston, Joshua Oreman, and David Carne.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* Redistributions of source code must retain the above copyright notice, this list
* of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice, this
* list of conditions and the following disclaimer in the documentation and/or other
* materials provided with the distribution.
* Neither the name of the organization nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior
* written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
* SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
* OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
* TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#if defined(IPOD) || defined(CHUMBY)
.section .text
__ASM_functions_list:
.long _HD_ARM_Convert16
.long _HD_ARM_Convert2
.long _HD_ARM_LowerBlit_ScaleBlend
.long _HD_ARM_LowerBlit_Blend
.long _HD_ARM_LowerBlit_Fast
.long _HD_ARM_Update5G
.long _HD_ARM_ClearScreen
.long _HD_ARM_UpdatePhoto
.long 0
.globl _HD_ARM_Setup
_HD_ARM_Setup:
str r4, [sp, #-4]!
@@ Setup stuff for code copy.
ldr r0, =__ASM_functions_start
ldr r1, =0x40000030
ldr r2, =__ASM_functions_end
sub r3, r1, r0 @ r3 = offset
@@ Copy ASM funcs to IRAM + 0x30.
1: ldr r4, [r0], #4
str r4, [r1], #4
cmp r0, r2
blo 1b
@@ Put stubs in at the old addresses to jump to the new ones.
adr r0, __ASM_functions_list
1: ldr r1, [r0], #4 @ load a funcptr and inc funcptrptr
cmp r1, #0 @ is it the 0 at the end?
beq 2f @ if so, break out
add r2, r1, r3 @ r2 = address of func in iram
ldr r4, =0xe51ff004 @ r4 = encoding of `ldr pc, [pc, #-4]' (load pc from next word)
str r4, [r1], #4 @ store instr at func and inc funcptr
str r2, [r1], #4 @ store address of where-to-jump at func + 4 and inc funcptr
b 1b @ keep looping
2: @@ Return.
ldr r4, [sp], #4
bx lr
.pool
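/*
 * What _HD_ARM_Setup does, as a rough C sketch (an illustration, not part of
 * the build): the routines between __ASM_functions_start and
 * __ASM_functions_end are copied into IRAM at 0x40000030, and each original
 * entry point is overwritten with a two-word thunk - the instruction
 * 0xe51ff004 (`ldr pc, [pc, #-4]') followed by the routine's new address -
 * so existing callers land in the fast IRAM copy. Names like patch_thunk()
 * are invented for the sketch.
 *
 *     #include <stdint.h>
 *     #include <string.h>
 *
 *     #define IRAM_BASE    ((uint32_t *)0x40000030)
 *     #define LDR_PC_PC_M4 0xe51ff004u              // ldr pc, [pc, #-4]
 *
 *     // Copy the function block to IRAM; return the relocation offset in bytes.
 *     static uintptr_t copy_to_iram(const uint32_t *start, const uint32_t *end)
 *     {
 *         memcpy(IRAM_BASE, start, (size_t)(end - start) * 4);
 *         return (uintptr_t)IRAM_BASE - (uintptr_t)start;
 *     }
 *
 *     // Overwrite an old entry point with a jump to its relocated copy.
 *     static void patch_thunk(uint32_t *old_entry, uintptr_t offset)
 *     {
 *         old_entry[0] = LDR_PC_PC_M4;                       // load pc from the next word
 *         old_entry[1] = (uint32_t)((uintptr_t)old_entry + offset);
 *     }
 */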
__ASM_functions_start:
/*
* This file contains optimized ARM ASM versions of several common operations.
* Cycle counts are specified for all instructions in inner loops
* that do not complete in one cycle.
*/
@@ Convert r2 4-byte AARRGGBB pixels in [r0] to 2-byte RGB pixels in [r1].
@@ void _HD_ARM_Convert16 (uint32 *buffer, uint16 *fb, int npix)
.globl _HD_ARM_Convert16
_HD_ARM_Convert16:
stmdb sp!, {r4-r12, lr}
ldr r4, =0x0000F800 @ R mask - 5 top bits.
ldr r5, =0x000007E0 @ G mask - 6 middle bits.
ldr r6, =0x0000001F @ B mask - 5 bottom bits.
mov r8, r0 @ r8 = src
mov r9, r1 @ r9 = dst
mov r10, r2 @ r10 = count
@@ Main loop. 20 cycles per 2 pixels = 10 cycles/pix.
1: ldmia r8!, {r0, r11-r12, r14} @ Load four pixels to r0, r11, r12, r14. 1N + 1I + 4S cyc.
and r1, r4, r0, lsr #8 @ Red pixel, shift, mask, and store in r1
and r2, r5, r0, lsr #5 @ Green pixel, shift, mask, store in r2
and r3, r6, r0, lsr #3 @ Blue pixel, shift, mask, and store in r3
orr r1, r2, r1 @ Red |= Green
orr r0, r1, r3 @ Store Red|Green|Blue in r0. r0 = first pixel, converted.
@@ Do it again for the next pixel.
and r1, r4, r11, lsr #8 @ Red pixel, shift, mask, and store in r1
and r2, r5, r11, lsr #5 @ Green pixel, shift, mask, store in r2
and r3, r6, r11, lsr #3 @ Blue pixel, shift, mask, and store in r3
orr r1, r2, r1 @ Red |= Green
orr r11, r3, r1 @ r11 = Red|Green|Blue = second pixel, converted.
@@ And again...
and r1, r4, r12, lsr #8
and r2, r5, r12, lsr #5
and r3, r6, r12, lsr #3
orr r1, r2, r1
orr r12, r3, r1
@@ And again!
and r1, r4, r14, lsr #8
and r2, r5, r14, lsr #5
and r3, r6, r14, lsr #3
orr r1, r2, r1
orr r14, r3, r1
@@ Combine first+second, third+fourth.
orr r0, r0, r11, lsl #16 @ r0 = (second << 16) | first
orr r12, r12, r14, lsl #16 @ r12 = (fourth << 16) | third
subs r10, r10, #4 @ Step counter down
stmia r9!, {r0, r12} @ Save these four pixels. 1S + 2N cyc.
bne 1b @ Keep looping until counter = 0. 3 cyc.
@@ Return
ldmia sp!, {r4-r12, pc}
.pool
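/*
 * Reference C sketch of the conversion above (illustration only): each
 * 0xAARRGGBB source pixel keeps the top 5 bits of red, 6 of green and 5 of
 * blue, packed as RGB565. The assembly does four pixels per iteration; the
 * sketch does one.
 *
 *     #include <stdint.h>
 *
 *     static void convert16_ref(const uint32_t *buffer, uint16_t *fb, int npix)
 *     {
 *         for (int i = 0; i < npix; i++) {
 *             uint32_t p = buffer[i];
 *             fb[i] = (uint16_t)(((p >> 8) & 0xF800) |   // RRRRR........... (5 bits)
 *                                ((p >> 5) & 0x07E0) |   // .....GGGGGG..... (6 bits)
 *                                ((p >> 3) & 0x001F));   // ...........BBBBB (5 bits)
 *         }
 *     }
 */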
@@ Convert r2 4-byte AARRGGBB pixels in [r0] to 2-bit Y pixels in [r1].
@@ void _HD_ARM_Convert2 (uint32 *buffer, uint8 *fb2bpp, int npix)
.globl _HD_ARM_Convert2
_HD_ARM_Convert2:
stmdb sp!, {r4-r12, lr}
@@ The algorithm we're using is Y = B/8 + G/2 + G/8 + R/4.
@@ Register usage:
@@ r0 Red component
@@ r1 Green component
@@ r2 Blue component
@@ r3 Byte currently being built
@@ r4-7 Four pixels we loaded
@@ r8 #0x3f
@@ r9 #0x1f
@@ r10 #0x7f
@@ r11 src pixptr
@@ r12 dst pixptr
@@ r14 npix
mov r8, #0x3f @ R mask (/4)
mov r9, #0x1f @ G mask 1 (/8) and B mask
mov r10, #0x7f @ G mask 2 (/2)
mov r11, r0 @ src pixptr
mov r12, r1 @ dst pixptr
mov r14, r2 @ npix
@@ Main loop. 46 cycles per 4 pixels ~= 11.5 cycles/pix.
1: ldmia r11!, {r4-r7} @ Load four pixels. 6 cyc.
mov r3, #0 @ r3 = byte we're building
@@ First pixel (r4). 8 cyc.
and r0, r8, r4, lsr #18 @ r0 = red/4
and r1, r10, r4, lsr #9 @ r1 = green/2
and r2, r9, r4, lsr #3 @ r2 = blue/8
@ We've now gotten everything we need out of r4, so we'll
@ now use it for a scratch register.
add r4, r1, r1, lsr #2 @ r4 = green/2 + green/8
add r4, r4, r0 @ + red/4
add r4, r4, r2 @ + blue/8
@ r0-r2 are free now too.
mov r0, r4, lsr #6 @ r0 = Y>>6
rsb r3, r0, #3 @ 0 is white
@@ r4 is now freed up for temp stuff.
@@ Let's use it to store #3, so we can do the
@@ r0 = 3 - (Y >> 6) in one instruction.
mov r4, #3
@@ Second pixel (r5). 8 cyc.
and r0, r8, r5, lsr #18 @ r0 = red/4
and r1, r10, r5, lsr #9 @ r1 = green/2
and r2, r9, r5, lsr #3 @ r2 = blue/8
@ r5 is now scratch.
add r5, r1, r1, lsr #2 @ r5 = green/2 + green/8
add r5, r5, r0 @ + red/4
add r5, r5, r2 @ + blue/8
@ r0-r2 are free now too.
sub r0, r4, r5, lsr #6 @ r0 = 3 - (Y>>6)
orr r3, r3, r0, lsr #2 @ put this pixel into spot #2 of r3
@@ Third pixel (r6). 8 cyc.
and r0, r8, r6, lsr #18 @ r0 = red/4
and r1, r10, r6, lsr #9 @ r1 = green/2
and r2, r9, r6, lsr #3 @ r2 = blue/8
add r5, r1, r1, lsr #2 @ r5 = green/2 + green/8
add r5, r5, r0 @ + red/4
add r5, r5, r2 @ + blue/8
sub r0, r4, r5, lsr #6 @ r0 = 3 - (Y>>6)
orr r3, r3, r0, lsr #4 @ put this pixel into spot #3 of r3
@@ Fourth pixel (r7). 8 cyc.
and r0, r8, r7, lsr #18 @ r0 = red/4
and r1, r10, r7, lsr #9 @ r1 = green/2
and r2, r9, r7, lsr #3 @ r2 = blue/8
add r5, r1, r1, lsr #2 @ r5 = green/2 + green/8
add r5, r5, r0 @ + red/4
add r5, r5, r2 @ + blue/8
sub r0, r4, r5, lsr #6 @ r0 = 3 - (Y>>6)
orr r3, r3, r0, lsr #6 @ put this pixel into spot #4 of r3
@@ Dec the counter, store the 4 pixels, and loop.
subs r14, r14, #4 @ dec the counter
strb r4, [r12], #1 @ store the byte. 2 cyc.
bne 1b @ loop if counter != 0. 3 cyc.
@@ Return
ldmia sp!, {r4-r12, pc}
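/*
 * Reference C sketch of the 2-bpp conversion above (illustration only):
 * luma is approximated as Y = R/4 + G/2 + G/8 + B/8, reduced to 2 bits and
 * inverted (0 is white), and four pixels are packed per output byte. The
 * bit position of each pixel within the byte is an assumption of the sketch.
 *
 *     #include <stdint.h>
 *
 *     static void convert2_ref(const uint32_t *buffer, uint8_t *fb2bpp, int npix)
 *     {
 *         for (int i = 0; i < npix; i += 4) {
 *             uint8_t byte = 0;
 *             for (int j = 0; j < 4; j++) {
 *                 uint32_t p = buffer[i + j];
 *                 uint32_t r = (p >> 16) & 0xFF, g = (p >> 8) & 0xFF, b = p & 0xFF;
 *                 uint32_t y = r / 4 + g / 2 + g / 8 + b / 8;    // approximate luma, 0..252
 *                 byte |= (uint8_t)((3 - (y >> 6)) << (2 * j));  // 0 is white
 *             }
 *             *fb2bpp++ = byte;
 *         }
 *     }
 */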
@@ _HD_ARM_LowerBlit_ScaleBlend - the core of the ScaleBlendClip function.
@@ This is NOT intended to be called from user code!
@@ void _HD_ARM_LowerBlit_ScaleBlend (hd_surface src, uint32 fp_initial_ix, uint32 fp_initial_iy,
@@ uint32 fp_step_x, uint32 fp_step_y,
@@ hd_surface dest, uint32 startx, uint32 deltx, uint32 starty,
@@ uint32 delty, uint8 opacity)
.globl _HD_ARM_LowerBlit_ScaleBlend
_HD_ARM_LowerBlit_ScaleBlend_StoreSP: .long 0
_HD_ARM_LowerBlit_ScaleBlend:
@@ This function takes 51 + 53*h + (27+(5*(opacity != 0xff)))*w*h cycles to execute.
stmfd sp!, {r0-r12, lr} @ 14 registers, 15 cyc.
@@ src at [sp, #0]
@@ fp_initial_ix at [sp, #4]
@@ fp_initial_iy at [sp, #8] <-- updated as fp_iy
@@ fp_step_x at [sp, #12]
@@ fp_step_y at [sp, #56]
@@ dest at [sp, #60]
@@ startx at [sp, #64]
@@ deltx at [sp, #68]
@@ starty at [sp, #72] <-- updated as y
@@ delty at [sp, #76] <-- updated
@@ opacity at [sp, #80]
@@ fp_initial_iy and starty are only accessed once.
@@ fp_iy is only accessed once per row, so we'll
@@ keep it on the stack at [sp, #8] (replacing fp_initial_iy).
@@ r1 = src+imgOff
@@ r2 = deltx
@@ r3 = fp_ix
@@ r4 = fp_step_x
@@ r5 = dest+buffOff+x
@@ r0, r6, r7, r8, r11 (5 regs) = scratch
@@ r9 = 0x00ff00ff
@@ r10 = 0x008000ff
@@ r14 = opacity
@@ Increment the src and dest pointers on the stack by 8,
@@ so you can do e.g. [<srf>, <row>, lsl #2] to load a
@@ row offset in one instruction. 12 cyc.
ldr r1, [sp, #0]
add r1, r1, #8
str r1, [sp, #0]
ldr r5, [sp, #60]
add r5, r5, #8
str r5, [sp, #60]
ldr r4, [sp, #12] @ r4 = fp_step_x. 3 cyc.
@@ Setup. 7 cyc.
mov r9, #0xff
orr r9, r9, #0xff0000
bic r10, r9, #0x7f0000 @ r10 = 0x008000ff
ldr r14, [sp, #80]
@@ Outer loop. 48 + 27*w cycles.
1:
@@ Load y, figure out buffOff, increment y and store it back. 13 cyc.
ldr r5, [sp, #60] @ r5 = dest + 2
ldr r7, [sp, #72] @ r7 = y
add r6, r7, #1 @ inc it -> r6
str r6, [sp, #72] @ and store back
ldr r7, [r5, r7, lsl #2] @ r7 = offset of starty'th row
sub r7, r7, #2 @ compensate for the +2
add r5, r5, r7, lsl #2 @ r5 = pointer to row in dest
@@ Load fp_iy, figure out imgOff, increment fp_iy and store it back. 18 cyc.
ldr r1, [sp, #0] @ r1 = src + 2
ldr r7, [sp, #8] @ r7 = fp_iy
mov r8, r7, lsr #16 @ r8 = row in image
ldr r8, [r1, r8, lsl #2] @ r8 = offset of row in src
sub r8, r8, #2 @ compensate for the +2
add r1, r1, r8, lsl #2 @ r1 = pointer to row in src
ldr r8, [sp, #56] @ r8 = fp_step_y
add r7, r7, r8 @ fp_iy += fp_step_y
str r7, [sp, #8] @ and store it
@@ Set up some stuff for the inner loop. 9 cyc.
ldr r3, [sp, #4] @ r3 = fp_initial_ix
ldr r6, [sp, #64] @ r6 = startx
ldr r2, [sp, #68] @ r2 = deltx
add r5, r5, r6, lsl #2 @ r5 += startx
@@ Store SP so we can use it for calculations. 2N cyc.
str sp, _HD_ARM_LowerBlit_ScaleBlend_StoreSP
@@ Pick the no-opacity or the opacity version. 2-4 cyc.
cmp r14, #0xff
bne 7f
@@ Inner loop. 28 cycles.
@@ Local labels:
@@ 1: beginning of outer loop
@@ 2: beginning of no-opacity varied-alpha inner loop
@@ 3: jump point into no-opacity varied-alpha inner loop from simple-alpha inner loop
@@ 4: beginning of simple-alpha inner loop
@@ 5: jump point into simple-alpha inner loop from no-opacity varied-alpha inner loop
@@ 6: update of simple-alpha inner loop
@@ 7: beginning of opacity inner loop
@@ 6: update of opacity inner loop (yes, two 6's)
@@ 8: after end of opacity inner loop
@@ 9: used by ablend macro
.macro ablend src, dst, opac=0
@@ Blends src on dst and stores the result in dst.
@@ Clobbers src, r0, r8, r11.
@@ Requires r14 = opacity unless opac=0, r9 = 0xff00ff, r10 = 0x8000ff.
@@ Time: 4 cyc for alpha=0 opac=0,
@@ 7 cyc for alpha=255 opac=0,
@@ 15 cyc for alpha=0 opac=1,
@@ 17 cyc for all other alphas opac=0,
@@ 26 cyc for any nonzero alpha opac=1.
@@ On exit from the macro, Z flag is set iff alpha was 0 or 255.
.if \opac
@@ If we have opacity to deal with, we need to scale down *all* parts of src by opac/256.
@@ This is almost exactly like the blend loop below.
and r0, r9, \src, lsr #8 @ r0 = (src >> 8) & 0x00ff00ff
mla r11, r0, r14, r9 @ r11 = r0 * opac + 0x00ff00ff
bic r11, r11, r9 @ r11 &= 0xff00ff00
and r0, r9, \src @ r0 = src & 0x00ff00ff
mla r8, r0, r14, r10 @ r8 = r0 * opac + 0x008000ff
bic r8, r8, r9 @ r8 &= 0xff00ff00
add \src, r11, r8, lsr #8 @ src = (r8 >> 8) + r11
.endif
movs r8, \src, lsr #24 @ r8 = alpha
beq 9f @ skip everything if alpha is 0
rsbs r8, r8, #0xff @ r8 = 255 - alpha
.if !\opac
@@ Don't do this for non-0xff opacity, because it will scale 0xff alphas
@@ down to something non-0xff.
moveq \dst, \src @ if alpha was 255, just copy the pixel
beq 9f @ and skip the rest
.endif
@ \src is now the accumulator
@@ Use r0, so we can put r8 last in mla so it can take only 3 cycles.
and r0, r9, \dst, lsr #8 @ r0 = (dst >> 8) & 0x00ff00ff
mla r11, r0, r8, r10 @ r11 = r0 * alpha + 0x008000ff
bic r11, r11, r9 @ r11 &= 0xff00ff00
add \src, \src, r11 @ src += that whole thing
and r11, r9, \dst @ r11 = dst & 0x00ff00ff
mlas \dst, r11, r8, r10 @ dst = r11 * alpha + 0x008000ff. S so Z=0.
bic \dst, \dst, r9 @ dst &= 0xff00ff00
add \dst, \src, \dst, lsr#8 @ dst = src + (that whole thing >> 8)
9:
.endm
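/*
 * The blend above is the usual paired-channel trick: with a = 255 - alpha,
 * the red/blue pair and the alpha/green pair of the destination are each
 * scaled by a in a single mla, a rounding bias is added, and the result is
 * added to the source. Since the result is src + dst*a/255, the source is
 * expected to be premultiplied by its alpha. A rough C sketch (illustration
 * only; the assembly uses 0x008000ff here rather than the textbook
 * 0x00800080 bias):
 *
 *     #include <stdint.h>
 *
 *     // Blend premultiplied src (0xAARRGGBB) over dst.
 *     static uint32_t ablend_ref(uint32_t src, uint32_t dst)
 *     {
 *         uint32_t alpha = src >> 24;
 *         if (alpha == 0)   return dst;   // fully transparent: keep dst
 *         if (alpha == 255) return src;   // fully opaque: take src
 *         uint32_t a  = 255 - alpha;
 *         uint32_t hi =  (((dst >> 8) & 0x00FF00FFu) * a + 0x00800080u) & 0xFF00FF00u;
 *         uint32_t lo = ((((dst     ) & 0x00FF00FFu) * a + 0x00800080u) & 0xFF00FF00u) >> 8;
 *         return src + hi + lo;
 *     }
 */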
.macro ablndne src, dst
@@ Blends src on dst and stores the result in dst.
@@ All instructions are conditionalized `ne'.
@@ Does not support opacity.
@@ Requirements and clobbers are the same as for ablend.
@@ Always takes 14 S+I cyc.
movne r8, \src, lsr #24
rsbne r8, r8, #0xff
andne r0, r9, \dst, lsr #8
mlane r11, r0, r8, r10
bicne r11, r11, r9
addne \src, \src, r11
andne r11, r9, \dst
mlane \dst, r11, r8, r10
bicne \dst, \dst, r9
addne \dst, \src, \dst, lsr #8
.endm
@@ Simple no-overall-opacity 4x unrolled blend loop.
2: ldmia r5, {r6, r12, r13, r14} @ load dst #1, #2, #3, #4 4S 1N 1I
mov r7, r3, lsr #16 @ figure src #1 offset 1S
ldr r7, [r1, r7, lsl #2] @ load src #1 1S 1N 1I
ablend r7, r6 @ blend #1 17S
beq 5f @ go to the easy-stuff loop 1S (unless taken)
3: add r3, r3, r4 @ update r3 1S
subs r2, r2, #1 @ update counter 1S
movne r7, r3, lsr #16 @ figure src #2 offset 1S
ldrne r7, [r1, r7, lsl #2] @ load src #2 1S 1N 1I
ablndne r7, r12 @ blend #2 14S
addne r3, r3, r4 @ update r3 1S
subnes r2, r2, #1 @ update counter 1S
movne r7, r3, lsr #16 @ figure src #3 offset 1S
ldrne r7, [r1, r7, lsl #2] @ load src #3 1S 1N 1I
ablndne r7, r13 @ blend #3 14S
addne r3, r3, r4 @ update r3 1S
subnes r2, r2, #1 @ update counter 1S
movne r7, r3, lsr #16 @ figure src #4 offset 1S
ldrne r7, [r1, r7, lsl #2] @ load src #4 1S 1N 1I
ablndne r7, r14 @ blend #4 14S
addne r3, r3, r4 @ update r3 1S
subnes r2, r2, #1 @ update counter 1S
@@ We store all 4 pixels, even if there weren't 4 left,
@@ because the ones we shouldn't be touching weren't
@@ touched - they're the same as they were when we
@@ loaded them above, and it's much quicker to do a
@@ block store than 4 compares and individual stores.
stmia r5!, {r6, r12, r13, r14} @ store the four pixels 3S 2N
bne 2b @ keep looping while some left 2S 1N
@@ Cycle total per 4px: 83S 8N 5I 104
@@ 26 cycles per pixel.
@@ End of first inner loop.
mov r14, #0xff @ restore the 0xff opacity, since r14 got clobbered.
b 8f @ skip the other 2 versions - 3 cyc.
@@ This is the `easy-stuff' loop, taken if the first pixel was blended easily,
@@ because probably the others will be too. This speeds up long runs of
@@ all fully-opaque or fully-transparent pixels.
4: ldmia r5, {r6, r12, r13, r14} @ load dst #1, #2, #3, #4 4S 1N 1I
mov r7, r3, lsr #16 @ figure src #1 offset 1S
ldr r7, [r1, r7, lsl #2] @ load src #1 1S 1N 1I
ablend r7, r6 @ blend #1 4-7S
bne 3b @ go to the general loop if nec 1S (unless taken)
5: add r3, r3, r4 @ update r3 1S
subs r2, r2, #1 @ update counter 1S
beq 6f @ skip if we're done 1S
mov r7, r3, lsr #16 @ figure src #2 offset 1S
ldr r7, [r1, r7, lsl #2] @ load src #2 1S 1N 1I
ablend r7, r12 @ blend #2 4-7S
add r3, r3, r4 @ update r3 1S
subs r2, r2, #1 @ update counter 1S
beq 6f @ skip if we're done 1S
mov r7, r3, lsr #16 @ figure src #3 offset 1S
ldr r7, [r1, r7, lsl #2] @ load src #3 1S 1N 1I
ablend r7, r13 @ blend #3 4-7S
add r3, r3, r4 @ update r3 1S
subs r2, r2, #1 @ update counter 1S
beq 6f @ skip if we're done
mov r7, r3, lsr #16 @ figure src #4 offset 1S
ldr r7, [r1, r7, lsl #2] @ load src #4 1S 1N 1I
ablend r7, r14 @ blend #4 4-7S
add r3, r3, r4 @ update r3 1S
subs r2, r2, #1 @ update counter 1S
6: stmia r5!, {r6, r12, r13, r14} @ store the four pixels 3S 2N
bne 4b @ keep looping while some left 2S 1N
@@ Cycle total per 4px: 40-52S 8N 5I 53-65
@@ 13 cycles per fully transparent pixel, 16 per fully opaque one.
@@ End of second inner loop.
mov r14, #0xff @ restore the 0xff opacity, since r14 got clobbered.
b 8f @ skip the other version - 3 cyc.
@@ Inner loop 2 - opacity version. 4-5 extra cycles per loop iteration.
7: ldmia r5, {r6, r12, r13} @ load 3 pixels 3S 1N 1I
mov r7, r3, lsr #16 @ figure src #1 offset 1S
ldr r7, [r1, r7, lsl #2] @ load src #1 1S 1N 1I
ablend r7, r6, 1 @ blend #1 26S
add r3, r3, r4 @ update r3 1S
subs r2, r2, #1 @ update counter 1S
beq 6f @ skip if we're done 1S
mov r7, r3, lsr #16 @ figure src #2 offset 1S
ldr r7, [r1, r7, lsl #2] @ load src #2 1S 1N 1I
ablend r7, r12, 1 @ blend #2 26S
add r3, r3, r4 @ update r3 1S
subs r2, r2, #1 @ update counter 1S
beq 6f @ skip if we're done 1S
mov r7, r3, lsr #16 @ figure src #3 offset 1S
ldr r7, [r1, r7, lsl #2] @ load src #3 1S 1N 1I
ablend r7, r13, 1 @ blend #3 26S
add r3, r3, r4 @ update r3 1S
subs r2, r2, #1 @ update counter 1S
6: stmia r5!, {r6, r12, r13} @ store the three pixels 2S 2N
bne 7b @ keep looping while some left 2S 1N
@@ Cycle total per 3px: 99S 7N 4I 117
@@ 39 cycles per pixel.
@@ End of third inner loop.
@@ Load SP. 1S+1N+1I cyc.
8: ldr sp, _HD_ARM_LowerBlit_ScaleBlend_StoreSP
@@ Test. 9 cyc.
ldr r6, [sp, #76] @ r6 = delty
subs r6, r6, #1 @ dec it
str r6, [sp, #76] @ and store it back
bne 1b @ loop while != 0
@@ End of outer loop.
ldmia sp!, {r0-r12, pc} @ return, 18 cyc.
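/*
 * Rough C sketch of the scaling walk above (illustration only, overall
 * opacity omitted): source coordinates are kept in 16.16 fixed point and
 * stepped by fp_step_x / fp_step_y, so each destination pixel samples the
 * nearest source pixel. The hd_surface layout assumed here (word 0 = width,
 * words 2.. = per-row offsets in words) is inferred from the pointer
 * arithmetic above; blend_pixel() stands in for the ablend macro.
 *
 *     #include <stdint.h>
 *
 *     typedef uint32_t *hd_surface;   // [0]=width, [1]=height, [2..]=row offsets (words)
 *     static uint32_t blend_pixel(uint32_t src, uint32_t dst);   // see the ablend sketch
 *
 *     static void scale_blend_ref(hd_surface src, uint32_t fp_ix0, uint32_t fp_iy,
 *                                 uint32_t fp_step_x, uint32_t fp_step_y,
 *                                 hd_surface dest, uint32_t startx, uint32_t deltx,
 *                                 uint32_t starty, uint32_t delty)
 *     {
 *         for (uint32_t row = 0; row < delty; row++, fp_iy += fp_step_y) {
 *             uint32_t *drow = dest + dest[2 + starty + row] + startx;  // dest row
 *             uint32_t *srow = src + src[2 + (fp_iy >> 16)];            // nearest src row
 *             uint32_t fp_ix = fp_ix0;
 *             for (uint32_t x = 0; x < deltx; x++, fp_ix += fp_step_x)
 *                 drow[x] = blend_pixel(srow[fp_ix >> 16], drow[x]);
 *         }
 *     }
 */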
@@ _HD_ARM_LowerBlit_Blend - Non-scaling blend blit.
@@ Kinda-sorta fast. Not really, compared to _Fast.
@@ void _HD_ARM_LowerBlit_Blend (hd_surface src, uint32 sx, uint32 sy,
@@ hd_surface dest, uint32 dx, uint32 dy, uint32 dw, uint32 dh,
@@ uint8 opacity)
@@ ->NOTE<-: This function is actually SLOWER than ScaleBlend! It needs some optims!
.globl _HD_ARM_LowerBlit_Blend
_HD_ARM_LowerBlit_Blend:
stmdb sp!, {r4-r12, lr} @ save 10 regs, 2N + 9S cyc.
@@ Stack layout:
@@ dx at [sp, #40]
@@ dy at [sp, #44]
@@ dw at [sp, #48]
@@ dh at [sp, #52]
@@ opaci at [sp, #56]
@@ Register usage:
@@ r0 current pixel in src
@@ r1 current pixel in dest
@@ r2 width of the blitted region (dw)
@@ r3 address of dest(dx,dy+dh)
@@ r4 width of src (src[0])
@@ r5 width of dest (dest[0])
@@ r6 idst (alpha blit)
@@ r7 isrc, accumulator (alpha blit)
@@ r8 alpha (alpha blit)
@@ r9 0x00ff00ff (preloaded above)
@@ r10 0x00800080 (preloaded above)
@@ r11 scratch
@@ r12 number of pix left in this row
@@ r14 opacity
@@ r0 is already loaded with src - set it up to point to (sx,sy). 2N + 5S + 2I cyc.
ldr r4, [r0] @ load width. 1N+1I+1S
add r8, r0, #8 @ r8 = beginning of row offset pointers
ldr r8, [r8, r2, lsl #2] @ load offset of sy'th row. 1N+1I+1S
add r0, r0, r8, lsl #2 @ r0 = beginning of sy'th row
add r0, r0, r1, lsl #2 @ r0 = address of pixel (sx,sy)
@@ set r1 up to point to (dx,dy) in dest - currently r3=dest. 3N + 10S + 3I cyc.
ldr r5, [r3] @ load width. 1N+1I+1S
add r8, sp, #40 @ r8 = beginning of dx,dy,dw,dh on stack
ldmia r8, {r1, r2, r6, r7} @ r1=dx, r2=dy, (r3=dest), r6=dw, r7=dh. 1N + 1I + 4S.
add r8, r3, #8 @ r8 = beginning of row offset pointers
ldr r8, [r8, r2, lsl #2] @ load offset of dy'th row. 1N+1I+1S
add r3, r3, r8, lsl #2 @ r3 = beginning of dy'th row
add r1, r3, r1, lsl #2 @ r1 = address of pixel (dx,dy)
@@ set r3 up - this one's easy, it's quicker to do a mla here than
@@ load the row offset ptr. 2S + 2I cyc.
@@ also setup r2 (1 cyc) and r14 (3 cyc incl 1N) and constants (3 cyc)
mov r7, r7, lsl #2
mla r3, r7, r5, r1 @ r3 = (dest width * dh * 4) + r1 = address of dest(dx, dy+dh)
mov r2, r6
ldr r14, [sp, #56]
mov r9, #0xff @ r9 = 0xff
orr r9, r9, r9, lsl #16 @ r9 = 0xff00ff
bic r10, r9, r9, lsr #1 @ r10 = 0x800080
@@ Outer loop.
1: mov r12, r2
@@ Pick the no-opacity or the opacity version. 2-4 cyc.
cmp r14, #0xff
bne 3f
@@ Inner loop. 28 cycles.
2:
@@ Do the blend. 23 cyc. Thanks aegray!
@ r6 = idst
@ r7 = isrc, accumulator
@ r8 = alpha
@ r9 = 0x00ff00ff (preloaded above)
@ r10 = 0x00800080 (preloaded above)
@ r11 = scratch
ldr r6, [r1] @ r6 = idst, x++
ldr r7, [r0], #4 @ r7 = isrc
sub r8, r12, r7, lsr #24 @ r8 = 255-AA
@ r7 is now accumulator
and r11, r9, r6, lsr#8 @ r11 = (idst >> 8) & 0x00ff00ff
mla r11, r8, r11, r10 @ r11 = r11 * alpha + 0x00800080. This is non-optimal;
@ it should be mla r11,r11,r8,r10 (2 cyc quicker) but
@ ARM requires Rd!=Rs and we don't have a reg to spare.
bic r11, r11, r9 @ r11 &= 0xff00ff00
add r7, r7, r11 @ r7 += that whole thing
and r11, r9, r6 @ r11 = idst & 0x00ff00ff
mla r6, r11, r8, r10 @ r6 = r11 * alpha + 0x00800080
bic r6, r6, r9 @ r6 &= 0xff00ff00
add r7, r7, r6, lsr #8 @ r7 += that whole thing >> 8
str r7, [r1], #4 @ store the pixel
@@ Increment and test. 4 cyc.
subs r12, r12, #1 @ dec the counter
bne 2b @ loop while != 0
@@ End of inner loop.
b 4f @ skip the other version - 3 cyc.
@@ Inner loop 2 - opacity version. 4-5 extra cycles per loop iteration.
@@ We skip the adjustment if A=0 because it results in A=1, which is wrong.
3:
ldr r6, [r1] @ r6 = idst, x++
ldr r7, [r0], #4 @ r7 = isrc
movs r8, r7, lsr #24 @ r8 = A
mulne r8, r14, r8 @ multiply by the opacity (unless A=0)
movne r8, r8, lsr #8 @ divide by 256 (unless A=0)
addne r8, r8, #1 @ and add 1 (unless A=0)
sub r8, r7, r8 @ r8 = 255 - adjusted A
@ r7 is now accumulator
and r11, r9, r6, lsr#8 @ r11 = (idst >> 8) & 0x00ff00ff
mla r11, r8, r11, r10 @ r11 = r11 * alpha + 0x00800080. This is non-optimal;
@ it should be mla r11,r11,r8,r10 (2 cyc quicker) but
@ ARM requires Rd!=Rs and we don't have a reg to spare.
bic r11, r11, r9 @ r11 &= 0xff00ff00
add r7, r7, r11 @ r7 += that whole thing
and r11, r9, r6 @ r11 = idst & 0x00ff00ff
mla r6, r11, r8, r10 @ r6 = r11 * alpha + 0x00800080
bic r6, r6, r9 @ r6 &= 0xff00ff00
add r7, r7, r6, lsr #8 @ r7 += that whole thing >> 8
str r7, [r1], #4 @ store the pixel
@@ Increment and test. 4 cyc.
subs r12, r12, #1 @ dec the counter
bne 2b @ loop while != 0
@@ End of inner loop, option 2.
@@ Update, test, jump.
4: sub r0, r0, r2, lsl #2 @ back to sx
sub r1, r1, r2, lsl #2 @ back to dx
add r0, r0, r4, lsl #2 @ on to next row
add r1, r1, r5, lsl #2 @ ditto
cmp r1, r3 @ are we there yet?
blo 1b @ nope - keep looping
ldmia sp!, {r4-r12, pc} @ return, some inordinately large number of cycles
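/*
 * A small C sketch of just the opacity adjustment used above (illustration
 * only): the source alpha is scaled by the overall opacity before blending.
 * The +1 keeps a fully opaque source fully opaque after the >>8, and the
 * adjustment is skipped for A = 0 so transparent pixels stay transparent.
 *
 *     #include <stdint.h>
 *
 *     static uint32_t adjusted_alpha(uint32_t pixel, uint32_t opacity)
 *     {
 *         uint32_t a = pixel >> 24;
 *         if (a == 0)
 *             return 0;                      // leave transparent pixels alone
 *         return ((a * opacity) >> 8) + 1;   // roughly a * opacity / 255
 *     }
 */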
@@ _HD_ARM_LowerBlit_Fast - The core of blit, non-alpha non-scaling.
@@ Nice and fast. (or not.. see ->NOTE<- below)
@@ void _HD_ARM_LowerBlit_Fast (hd_surface src, uint32 sx, uint32 sy,
@@ hd_surface dest, uint32 dx, uint32 dy, uint32 dw, uint32 dh)
@@ ->NOTE<-: This func could do with some optimization, loop unrolling, etc. It's too slow.
.globl _HD_ARM_LowerBlit_Fast
_HD_ARM_LowerBlit_Fast_StoreSP: .long 0
_HD_ARM_LowerBlit_Fast:
stmdb sp!, {r4-r12, lr} @ save 10 regs, 2N + 9S cyc.
@@ Stack layout:
@@ dx at [sp, #40]
@@ dy at [sp, #44]
@@ dw at [sp, #48]
@@ dh at [sp, #52]
@@ Register usage:
@@ r0 current pixel in src
@@ r1 current pixel in dest
@@ r2-r3 currently unused
@@ r4 width of the blitted region (dw)
@@ r5 address of dest(dx,dy+dh)
@@ r6 width of src (src[0])
@@ r7 width of dest (dest[0])
@@ r8-r11 transfer regs
@@ r12 number of pixels left to transfer in this row
@@ r14 currently unused
@@ r0 is already loaded with src - set it up to point to (sx,sy). 2N + 5S + 2I cyc.
ldr r6, [r0] @ load width. 1N+1I+1S
add r8, r0, #8 @ r8 = beginning of row offset pointers
ldr r8, [r8, r2, lsl #2] @ load offset of sy'th row. 1N+1I+1S
add r0, r0, r8, lsl #2 @ r0 = beginning of sy'th row
add r0, r0, r1, lsl #2 @ r0 = address of pixel (sx,sy)
@@ set r1 up to point to (dx,dy) in dest - currently r3=dest. 3N + 10S + 3I cyc.
ldr r7, [r3] @ load width. 1N+1I+1S
add r8, sp, #40 @ r8 = beginning of dx,dy,dw,dh on stack
ldmia r8, {r1, r2, r4, r5} @ r1=dx, r2=dy, (r3=dest), r4=dw, r5=dh. 1N + 1I + 4S.
add r8, r3, #8 @ r8 = beginning of row offset pointers
ldr r8, [r8, r2, lsl #2] @ load offset of dy'th row. 1N+1I+1S
add r3, r3, r8, lsl #2 @ r3 = beginning of dy'th row
add r1, r3, r1, lsl #2 @ r1 = address of pixel (dx,dy)
@@ set r5 up - this one's easy, it's quicker to do a mla here than
@@ load the row offset ptr. 2S + 2I cyc.
mov r5, r5, lsl #2
mla r5, r7, r5, r1 @ r5 = (dest width * dh * 4) + r1 = address of dest(dx, dy+dh)
@@ save sp
str sp, _HD_ARM_LowerBlit_Fast_StoreSP
@@ Ok, r0-r7 are set up. Let's transfer some pixels.
1:
@@ Inits - 3S.
mov r12, r4
cmp r12, #4 @ At least 4 pix left? 1S
2: ldmhsia r0!, {r8-r11} @ Load four, inc, ... 4S 1N 1I
stmhsia r1!, {r8-r11} @ save four, inc, ... 3S 2N
subhs r12, r12, #4 @ Update r12. 1S
cmphs r12, #4 @ At least 4 left? 1S
ldmhsia r0!, {r8-r11} @ Load four, inc, ... 4S 1N 1I
stmhsia r1!, {r8-r11} @ save four, inc, ... 3S 2N
subhs r12, r12, #4 @ Update r12. 1S
cmphs r12, #4 @ At least 4 left? 1S
ldmhsia r0!, {r8-r11} @ Load four, inc, ... 4S 1N 1I
stmhsia r1!, {r8-r11} @ save four, inc, ... 3S 2N
subhs r12, r12, #4 @ Update r12. 1S
cmphs r12, #4 @ At least 4 left? 1S
ldmhsia r0!, {r8-r11} @ Load four, inc, ... 4S 1N 1I
stmhsia r1!, {r8-r11} @ save four, inc, ... 3S 2N
subhs r12, r12, #4 @ Update r12. 1S
cmphs r12, #4 @ At least 4 left? 1S
bhs 2b @ If so, keep looping. 2S 1N
@@ Total for 16 pixels: -> 38S 13N 4I (+1S first time)
@@ Copy 4-byte units till we're done. 4S if done, 3N + 2I + 5S if not done
cmp r12, #0
beq 4f
3: ldrne r8, [r0], #4 @ If r12 != 0 (fewer than 4 pixels were left),
strne r8, [r1], #4 @ transfer one pixel (4 bytes). 2N+2I+2S for these two.
subnes r12, r12, #1 @ and update r12.
bne 3b @ Repeat the single-word copy until the row is done.
@@ Update r0, r1. 2S.
4: sub r8, r6, r4
add r0, r0, r8, lsl #2
sub r8, r7, r4
add r1, r1, r8, lsl #2
@@ Test and branch. 3S + 1N.
cmp r1, r5
blo 1b
ldr sp, _HD_ARM_LowerBlit_Fast_StoreSP
ldmia sp!, {r4-r12, pc} @ return - 11S + 1I + 2N
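/*
 * Rough C sketch of the copy above (illustration only): a plain rectangular
 * word copy, row by row. The assembly unrolls it into ldm/stm bursts of
 * four words; the sketch leans on memcpy. The hd_surface row-offset layout
 * is the same one assumed in the scaled-blend sketch above.
 *
 *     #include <stdint.h>
 *     #include <string.h>
 *
 *     typedef uint32_t *hd_surface;   // [0]=width, [1]=height, [2..]=row offsets (words)
 *
 *     static void blit_fast_ref(hd_surface src, uint32_t sx, uint32_t sy,
 *                               hd_surface dest, uint32_t dx, uint32_t dy,
 *                               uint32_t dw, uint32_t dh)
 *     {
 *         for (uint32_t row = 0; row < dh; row++)
 *             memcpy(dest + dest[2 + dy + row] + dx,
 *                    src + src[2 + sy + row] + sx,
 *                    dw * sizeof(uint32_t));
 *     }
 */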
@@ Update the LCD on the 5g (video) iPod.
@@ We ASM this because gcc is braindead and emits one function
@@ call per outl etc., and we need every cycle we can get here.
@@ void _HD_ARM_Update5G (uint16 *fb, int x, int y, int w, int h)
@@ Write a 32-bit value to the BCM. Uses r10-r11 for temp, needs r12 to be loaded with 0x30000000.
@@ Takes 8S + 10N + 1I cycles.
.macro bcmw32 addr, value
orr r11, r12, #0x10000 @ r11 = 0x30010000
strh \addr, [r11] @ store the low hword
mov r10, \addr, lsr #16 @ get the high one in r10
strh r10, [r11] @ and store it
orr r11, r12, #0x30000 @ r11 = 0x30030000
9: ldrh r10, [r11] @ read status
tst r10, #2 @ check the ready bit
beq 9b @ loop until it's set
strh \value, [r12] @ store low hword of value to 0x30000000
mov r10, \value, lsr #16 @ get the high hword
strh r10, [r12] @ and store it
.endm
.macro bcmr32 addr, value
orr r11, r12, #0x20000 @ r11 = 0x30020000
9: ldrh r10, [r11] @ load address reg
tst r10, #1 @ test bit 0
beq 9b @ loop until set
strh \addr, [r11] @ store the low hword
mov r10, \addr, lsr #16 @ get the high hword
strh r10, [r11] @ and store it
orr r11, r12, #0x30000 @ r11 = 0x30030000
9: ldrh r10, [r11] @ load status reg
tst r10, #16 @ test bit 4
beq 9b @ loop until set
ldrh \value, [r12] @ read the low hword
ldrh r10, [r12] @ and the high one
orr \value, \value, r10, lsl #16 @ put the high hword in place
.endm
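/*
 * What the two macros implement, as a C sketch (illustration only): the BCM
 * video chip exposes a 32-bit mailbox through 16-bit ports. For a write,
 * the target address goes to 0x30010000 as two halfwords, the status port
 * at 0x30030000 is polled for the ready bit, and the value goes to
 * 0x30000000 low halfword first. For a read, the address port at 0x30020000
 * is polled and written, then the data is read back from 0x30000000 once
 * status bit 4 is set. The port names below are invented for the sketch.
 *
 *     #include <stdint.h>
 *
 *     #define BCM_DATA   (*(volatile uint16_t *)0x30000000)
 *     #define BCM_WRADDR (*(volatile uint16_t *)0x30010000)
 *     #define BCM_RDADDR (*(volatile uint16_t *)0x30020000)
 *     #define BCM_STATUS (*(volatile uint16_t *)0x30030000)
 *
 *     static void bcm_write32(uint32_t addr, uint32_t value)
 *     {
 *         BCM_WRADDR = (uint16_t)addr;           // low halfword of the address
 *         BCM_WRADDR = (uint16_t)(addr >> 16);   // high halfword
 *         while (!(BCM_STATUS & 2)) ;            // wait for the ready bit
 *         BCM_DATA = (uint16_t)value;
 *         BCM_DATA = (uint16_t)(value >> 16);
 *     }
 *
 *     static uint32_t bcm_read32(uint32_t addr)
 *     {
 *         while (!(BCM_RDADDR & 1)) ;            // wait until the address port is free
 *         BCM_RDADDR = (uint16_t)addr;
 *         BCM_RDADDR = (uint16_t)(addr >> 16);
 *         while (!(BCM_STATUS & 16)) ;           // wait for the read data
 *         uint32_t v = BCM_DATA;
 *         return v | ((uint32_t)BCM_DATA << 16);
 *     }
 */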
.globl _HD_ARM_Update5G
_HD_ARM_Update5G:
stmdb sp!, {r4-r12, lr}
@@ Do a finishup if we need one.
ldr r4, =need_finishup
ldr r4, [r4]
cmp r4, #0
beq 1f
@@ OK, we need it. Do it.
mov r12, #0x30000000 @ r12 = 0x30000000
mvn r5, #0 @ r5 = 0xFFFFFFFF
and r5, r5, r5, lsr #16 @ r5 = 0x0000FFFF
mvn r6, #0x50000 @ r6 = 0xFFFAFFFF
bic r6, r6, r6, lsr #16 @ r6 = 0xFFFA0005
mov r7, #0x1F8
2: bcmr32 r7, r9 @ read some reg
cmp r9, r6 @ is it 0xFFFA0005?
cmpne r9, r5 @ is it 0xFFFF?
beq 2b @ if either, keep looping
bcmr32 r8, r9 @ and read 0x1fc again
@@ We've stored 10 registers, so h is at [sp, #40].
1: ldr r4, [sp, #40] @ 3 cyc
@@ Register usage: r0 = fb, r1 = x, r2 = y, r3 = w, r4 = h, r5 = count (for now)
@@ r12 = 0x30000000, r8-r11 = temp, r5 = beginning of current row (later on)
@@ r6 = x+w-1, r7 = y+h-1
@@ Align x and w to a 2-byte boundary. 5S cyc.
tst r1, #1 @ is x aligned?
subne r1, r1, #1 @ align it down if so
addne r3, r3, #1 @ and inc the width to compensate
tst r3, #1 @ is w aligned?
addne r3, r3, #1 @ inc it if not
@@ Set up the drawing region.
mul r5, r4, r3 @ r5 (count) = width * height
add r5, r5, r5 @ r5 *= 2
add r6, r1, r3 @ r6 = x + width
sub r6, r6, #1 @ minus 1
add r7, r2, r4 @ r7 = y + height
sub r7, r7, #1 @ minus 1
@@ Send the commands to set up the rect.
mov r12, #0x30000000 @ load r12 for the bcmw32 macro
mov r8, #0x1F8
mvn r9, #0x50000 @ r9 = 0xFFFAFFFF
bic r9, r9, r9, lsr #16 @ r9 = 0xFFFAFFFF & 0xFFFF0005 = 0xFFFA0005 = what we want
bcmw32 r8, r9
mov r8, #0xE0000 @ load first addr
mov r9, #0x34
bcmw32 r8, r9 @ send command (addr = 0xE0000)
add r8, r8, #4
bcmw32 r8, r1 @ start_horiz (@0xE0004)
add r8, r8, #4
bcmw32 r8, r2 @ start_vert (@0xE0008)
add r8, r8, #4
bcmw32 r8, r6 @ max_horiz (@0xE000C)
add r8, r8, #4
bcmw32 r8, r7 @ max_vert (@0xE0010)
add r8, r8, #4
bcmw32 r8, r5 @ count (@0xE0014)
add r8, r8, #4
bcmw32 r8, r5 @ count (@0xE0018)
add r8, r8, #4
mov r9, #0
bcmw32 r8, r9 @ zero (@0xE001C)
@@ Write the destination address out as two 16-bit values.
mov r9, #0x20 @ r9 = low hword of address (0xE0020)
orr r8, r12, #0x10000 @ r8 = 0x30010000
strh r9, [r8] @ and store it
mov r9, #0xE @ high hword of 0xE0020
strh r9, [r8] @ and store it
orr r8, r12, #0x30000 @ r8 = 0x30030000
1: ldrh r9, [r8] @ read status
tst r9, #2 @ test ready bit
beq 1b @ loop until set
@@ Set up pointers for the write.
mov r8, #320*2 @ r8 = lcd width * 2 [bytes per pixel]
mla r5, r2, r8, r0 @ r5 = fb + 320*y
add r0, r5, r1, lsl #1 @ + x
mov r5, r0 @ r5 = r0 = pointer to first pixel
@ we'll increment r0 and keep r5 pointing to the first
@@ In the inner loop we'll use r11 as a counter (number of pixels on this row).
1: mov r11, r3 @ r11 = width
2: cmp r11, #8 @ at least eight pixels left?
ldmhsia r0!, {r6, r7, r8, r9} @ if so: load 8,
strhsh r6, [r12] @ store #1
movhs r6, r6, lsr #16 @ shift in #2
strhsh r6, [r12] @ store #2
strhsh r7, [r12] @ store #3
movhs r7, r7, lsr #16 @ shift in #4
strhsh r7, [r12] @ store #4
strhsh r8, [r12] @ store #5
movhs r8, r8, lsr #16 @ shift in #6
strhsh r8, [r12] @ store #6
strhsh r9, [r12] @ store #7
movhs r9, r9, lsr #16 @ shift in #8
strhsh r9, [r12] @ store #8
subhs r11, r11, #8 @ update count
bhi 2b @ keep going if there's more
3: ldrne r8, [r0], #4 @ if not (at least eight left): load 2,
strneh r8, [r12] @ store #1
movne r8, r8, lsr #16 @ shift in #2
strneh r8, [r12] @ store #2
subnes r11, r11, #2 @ update the counter. r11 had better be even!
bne 3b @ and keep going (in two-at-a-time mode)
@@ Ok, we're done. Advance the pointer to the next row.
add r0, r5, #320*2 @ next row
mov r5, r0 @ for r5 too
subs r4, r4, #1 @ dec h
bne 1b @ and keep looping if it's nonzero
@@ Start the finishup, and remember to finish it next time.
mov r12, #0x30000000 @ r12 = 0x30000000
orr r11, r12, #0x30000 @ r11 = 0x30030000
mov r10, #0x31
strh r10, [r11] @ store the 0x31 thing
mov r8, #0x1FC
bcmr32 r8, r9 @ read 0x1FC to start it
mov r0, #1
ldr r1, =need_finishup
str r0, [r1]
ldmia sp!, {r4-r12, pc} @ return
.pool
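/*
 * The command sequence above, condensed into a C sketch (illustration only;
 * the deferred-finishup handshake at entry and exit is omitted).
 * bcm_write32() is the sketch from the bcmw32 macro, and lcd_send_pixels()
 * is a made-up stand-in for the halfword-streaming loop.
 *
 *     #include <stdint.h>
 *
 *     static void bcm_write32(uint32_t addr, uint32_t value);        // see the bcm sketch
 *     static void lcd_send_pixels(const uint16_t *p, int w, int h);  // stand-in
 *
 *     static void update5g_ref(uint16_t *fb, int x, int y, int w, int h)
 *     {
 *         if (x & 1) { x--; w++; }              // align the rect to 2 pixels
 *         if (w & 1) { w++; }
 *
 *         bcm_write32(0x1F8, 0xFFFA0005);       // handshake value for reg 0x1F8
 *         bcm_write32(0xE0000, 0x34);           // command
 *         bcm_write32(0xE0004, x);              // start_horiz
 *         bcm_write32(0xE0008, y);              // start_vert
 *         bcm_write32(0xE000C, x + w - 1);      // max_horiz
 *         bcm_write32(0xE0010, y + h - 1);      // max_vert
 *         bcm_write32(0xE0014, w * h * 2);      // count (w*h pixels, 2 bytes each)
 *         bcm_write32(0xE0018, w * h * 2);      // count, again
 *         bcm_write32(0xE001C, 0);              // zero
 *         // Point the data port at 0xE0020, then stream the rect one
 *         // 320-pixel framebuffer row at a time.
 *         lcd_send_pixels(fb + y * 320 + x, w, h);
 *     }
 */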
@@ Really fast memset 0 for screen clearing.
@@ void _HD_ARM_ClearScreen (uint32 *fb, uint32 pixels)
@@ Equivalent to memset (fb, 0, pixels*4)
.globl _HD_ARM_ClearScreen
_HD_ARM_ClearScreen:
stmdb sp!, {r4-r12, lr} @ 9S 2N
mov r2, #0 @ Zero 1S
mov r3, #0 @ twelve 1S
mov r4, #0 @ registers 1S
mov r5, #0 @ for 1S
mov r6, #0 @ stmia 1S
mov r7, #0 @ and 1S
mov r8, #0 @ really 1S
mov r9, #0 @ fast 1S
mov r10, #0 @ block 1S
mov r11, #0 @ memsetting. 1S
mov r12, #0 @ 1S
mov r14, #0 @ 1S
@@ Total for setup: -> 21S 2N
cmp r1, #12 @ At least 12 pix left? 1S
1: stmhsia r0!, {r2-r12, r14} @ Zero twelve px and inc. 11S 2N
subhs r1, r1, #12 @ Update r1. 1S
cmphs r1, #12 @ At least 12 left? 1S
stmhsia r0!, {r2-r12, r14} @ Zero twelve and inc. 11S 2N
subhs r1, r1, #12 @ Update r1. 1S
cmphs r1, #12 @ At least 12 left? 1S
stmhsia r0!, {r2-r12, r14} @ Zero twelve and inc. 11S 2N
subhs r1, r1, #12 @ Update r1. 1S
cmphs r1, #12 @ At least 12 left? 1S
stmhsia r0!, {r2-r12, r14} @ Zero twelve and inc. 11S 2N
subhs r1, r1, #12 @ Update r1. 1S
cmphs r1, #12 @ At least 12 left? 1S
bhs 1b @ If so, keep looping. 2S 1N
@@ Total for 48 pixels: -> 54S 9N (+1S first time)
cmp r1, #0 @ Only do the loop if r1 != 0. 1S
2: strne r2, [r0], #4 @ Zero one pixel and inc. 2N
subnes r1, r1, #1 @ Update r1. 1S
bne 2b @ Keep looping while r1 != 0. 2S 1N
@@ Total for each odd pixel: -> 3S 3N (+1S first time)
ldmia sp!, {r4-r12, pc} @ Return.
@@ Update the LCD on a Photo, Color, or Nano.
@@ Possible values for `type':
@@ 0 220x176, type 0 LCD (on a Photo)
@@ 1 220x176, type 1 LCD (on a Color)
@@ 2 176x132, type 1 LCD with weird rect specifications (on a Nano)
@@ void _HD_ARM_UpdatePhoto (uint16 *fb, int x, int y, int w, int h, int type)
@@ wwait - waits for the LCD to be write ready. clobbers r10, r11. assumes r12 = 0x70008A0C.
.macro wwait
mov r10, #0x10000 @ r10 ~= 64 thousand ~= 1ms
9: ldr r11, [r12] @ r11 = *0x70008A0C
subs r10, r10, #1 @ timer--
beq 9f
tst r11, #0x80000000 @ is high bit set?
beq 9b @ loop until it is
9:
.endm
@@ sendlo - sends a low value (?) clobbers r10, r11, expects r12 = 0x70008A0C.
@@ val *cannot* be r10 or r11
.macro sendlo val
wwait
orr r11, \val, #0x80000000
str r11, [r12]
.endm
@@ sendhi - sends a high value (?) clobbers r10, r11, expects r12 = 0x70008A0C.
@@ val *cannot* be r10 or r11
.macro sendhi val
wwait
orr r11, \val, #0x81000000
str r11, [r12]
.endm
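/*
 * C sketch of the three macros above (illustration only): the LCD port at
 * 0x70008A0C is polled until bit 31 reports write-ready (bailing out after
 * roughly 64K reads), then a word is written with bit 31 set (sendlo) or
 * bits 31 and 24 set (sendhi).
 *
 *     #include <stdint.h>
 *
 *     #define LCD_PORT (*(volatile uint32_t *)0x70008A0C)
 *
 *     static void lcd_wait_ready(void)
 *     {
 *         for (uint32_t timeout = 0x10000; timeout; timeout--)
 *             if (LCD_PORT & 0x80000000u)
 *                 return;               // ready bit set
 *         // timed out (about 1 ms): give up and write anyway, as the asm does
 *     }
 *
 *     static void lcd_send_lo(uint32_t val) { lcd_wait_ready(); LCD_PORT = val | 0x80000000u; }
 *     static void lcd_send_hi(uint32_t val) { lcd_wait_ready(); LCD_PORT = val | 0x81000000u; }
 */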