-
Notifications
You must be signed in to change notification settings - Fork 0
/
matrix.S
247 lines (237 loc) · 5.68 KB
/
matrix.S
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
// Exported symbols: the matrix routines plus a few data objects that other
// translation units may reference directly.
.global makematrix
.global resizematrix
.global copymatrix
.global freematrix
.global multiplymatrix
.global printmatrix
.global matrixstart
.global forcematrix
.global identitymatrix
.global beziermatrix
.global hermitematrix
.global blankmatrix
.global newl
.global one
.data
.align 16
// Scratch slot holding a pointer to the matrix pseudo-struct currently being
// worked on. Several routines in this file overwrite it, so its value is only
// meaningful immediately after one of them returns.
matrixstart:
.quad 0
// printf format used by printmatrix for a single element (not exported).
format:
.asciz "%f "
.align 16
// Newline string; printmatrix passes it to printf as the format argument.
newl:
.asciz "\n"
.align 16
// Qword element indices used by multiplymatrix as the gather index vector
// (every 4th double of the first 16 doubles of a buffer).
indices:
.quad 0,4,8,12
// Double-precision constant 1.0 (exported).
one:
.double 1.0
// Double-precision 1.0; not exported and not referenced by the code visible in
// this file.
scalar:
.double 1.0
// 16 doubles forming a 4x4 identity; copied into a matrix by identitymatrix.
// Not exported.
identmatrix:
.double 1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1
// 16 doubles; presumably the cubic Bezier basis coefficients -- confirm
// against the callers that pass this table to forcematrix.
beziermatrix:
.double -1,3,-3,1,3,-6,3,0,-3,3,0,0,1,0,0,0
// 16 doubles; presumably the cubic Hermite basis coefficients stored column
// by column -- confirm against the callers.
hermitematrix:
.double 2,-3,0,1,-2,3,0,0,1,-2,1,0,1,-1,0,0
// 16 zero doubles (an all-zero 4x4 table).
blankmatrix:
.double 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0
// Calling convention reminder (System V AMD64):
// integer/pointer parameters are passed in rdi, rsi, rdx, rcx, r8, r9 (in that order)
// integer/pointer return value is passed back in rax
.text
//-----------------------------------------------------------------------
// makematrix: allocate a matrix pseudo-struct and its two buffers.
// ABI:  System V AMD64
// In:   rdi = number of columns, rsi = number of rows
// Out:  rax = pointer to the new 40-byte pseudo-struct, or 0 if the
//             struct allocation itself failed
//
// Pseudo-struct layout (byte offsets):
//    0  pointer to the element buffer, columns*rows*8 bytes
//    8  number of columns
//   16  number of rows
//   24  initialised to 0 (printmatrix and multiplymatrix use this field
//       as the count of columns in use)
//   32  pointer to a second buffer of columns*32 bytes (its purpose is
//       not evident from this file)
//
// Side effect: matrixstart is set to the new struct pointer.
// The two buffer pointers are stored exactly as malloc returned them and
// are not checked for NULL.
// Clobbers: caller-saved registers (through malloc).
//-----------------------------------------------------------------------
makematrix:
        push    %rbx                    // rbx = struct pointer
        push    %r12                    // r12 = columns
        push    %r13                    // r13 = rows; third push leaves rsp 16-byte aligned for the calls
        mov     %rdi, %r12
        mov     %rsi, %r13

        mov     $40, %edi               // size of the pseudo-struct
        call    malloc@PLT
        mov     %rax, matrixstart(%rip)
        test    %rax, %rax
        jz      .Lmakematrix_done       // out of memory: return NULL instead of writing through it
        mov     %rax, %rbx

        mov     %r12, %rdi
        imul    %r13, %rdi
        shl     $3, %rdi                // columns * rows * sizeof(double)
        call    malloc@PLT
        mov     %rax, (%rbx)
        mov     %r12, 8(%rbx)
        mov     %r13, 16(%rbx)
        movq    $0, 24(%rbx)

        mov     %r12, %rdi
        shl     $5, %rdi                // columns * 32 bytes for the second buffer
        call    malloc@PLT
        mov     %rax, 32(%rbx)

        mov     %rbx, %rax              // return the struct pointer
.Lmakematrix_done:
        pop     %r13
        pop     %r12
        pop     %rbx
        ret
//-----------------------------------------------------------------------
// resizematrix: grow a matrix by a number of columns.
// ABI:  System V AMD64
// In:   rdi = matrix pseudo-struct, rsi = number of columns to add
// Out:  none
//
// The column count at offset 8 is increased by rsi, then both buffers are
// reallocated to newcolumns*32 bytes each. For the element buffer this is
// 4 doubles per column; for the second buffer it matches the columns*32
// size that makematrix allocates and copymatrix copies.
//
// Side effect: matrixstart is set to rdi.
// The pointers returned by realloc are stored unchecked, so on allocation
// failure the stored pointer becomes NULL.
// Clobbers: caller-saved registers (through realloc).
//-----------------------------------------------------------------------
resizematrix:
        push    %rbx                    // rbx = matrix; the single push realigns rsp to 16 bytes
        mov     %rdi, %rbx
        mov     %rdi, matrixstart(%rip)
        add     %rsi, 8(%rbx)           // columns += rsi

        mov     8(%rbx), %rsi
        shl     $5, %rsi                // new size: columns * 32 bytes
        mov     (%rbx), %rdi
        call    realloc@PLT
        mov     %rax, (%rbx)

        mov     8(%rbx), %rsi
        shl     $5, %rsi                // same per-column size as makematrix uses for this buffer
        mov     32(%rbx), %rdi
        call    realloc@PLT
        mov     %rax, 32(%rbx)

        pop     %rbx
        ret
//-----------------------------------------------------------------------
// copymatrix: make a deep copy of a matrix.
// ABI:  System V AMD64
// In:   rdi = source matrix pseudo-struct
// Out:  rax = pointer to the newly allocated copy (0 if makematrix
//             returned 0)
//
// The copy is created with the source column count and the source row
// count, so the columns*rows*8 bytes copied below always fit the new
// element buffer. The field at offset 24 is copied, and columns*32 bytes
// of the second buffer are copied.
//
// Side effect: matrixstart is set to the new matrix.
// Clobbers: caller-saved registers (through makematrix and memcpy).
//-----------------------------------------------------------------------
copymatrix:
        push    %rbx                    // rbx = source matrix
        push    %r12                    // r12 = new matrix
        sub     $8, %rsp                // pad so rsp is 16-byte aligned at every call
        mov     %rdi, %rbx

        mov     8(%rbx), %rdi           // columns
        mov     16(%rbx), %rsi          // rows, taken from the source rather than assumed
        call    makematrix
        mov     %rax, %r12
        mov     %rax, matrixstart(%rip)
        test    %rax, %rax
        jz      .Lcopymatrix_done       // allocation failed, nothing to copy into

        // element buffer: columns * rows * 8 bytes
        mov     (%r12), %rdi
        mov     (%rbx), %rsi
        mov     8(%rbx), %rdx
        imul    16(%rbx), %rdx
        shl     $3, %rdx
        call    memcpy@PLT

        mov     24(%rbx), %rax          // carry over the used-column count
        mov     %rax, 24(%r12)

        // second buffer: columns * 32 bytes
        mov     32(%r12), %rdi
        mov     32(%rbx), %rsi
        mov     8(%rbx), %rdx
        shl     $5, %rdx
        call    memcpy@PLT

.Lcopymatrix_done:
        mov     %r12, %rax
        add     $8, %rsp
        pop     %r12
        pop     %rbx
        ret
//-----------------------------------------------------------------------
// freematrix: release a matrix and both of its buffers.
// ABI:  System V AMD64
// In:   rdi = matrix pseudo-struct (0 is accepted and ignored)
// Out:  none
//
// Frees the element buffer (offset 0), the second buffer (offset 32) and
// finally the pseudo-struct itself.
//
// Side effect: matrixstart is set to rdi, as before; after return it
// therefore refers to freed memory and must not be dereferenced.
// Clobbers: caller-saved registers (through free).
//-----------------------------------------------------------------------
freematrix:
        mov     %rdi, matrixstart(%rip)
        test    %rdi, %rdi
        jz      .Lfreematrix_done       // nothing to free
        push    %rbx                    // rbx = matrix; the push realigns rsp to 16 bytes
        mov     %rdi, %rbx

        mov     (%rbx), %rdi
        call    free@PLT                // element buffer
        mov     32(%rbx), %rdi
        call    free@PLT                // second buffer
        mov     %rbx, %rdi
        call    free@PLT                // pseudo-struct

        pop     %rbx
.Lfreematrix_done:
        ret
//-----------------------------------------------------------------------
// printmatrix: print a matrix to stdout, one text line per row.
// ABI:  System V AMD64
// In:   rdi = matrix pseudo-struct
// Out:  none
//
// For each row r in [0, rows at offset 16) and each column c in
// [0, used columns at offset 24) the element at index c*4 + r of the
// element buffer is printed with the "%f " format, followed by a newline
// after every row. The column stride is fixed at 4 doubles.
// A matrix with no used columns prints only the newlines.
//
// Side effect: matrixstart is set to rdi.
// Clobbers: caller-saved registers (through printf).
//-----------------------------------------------------------------------
printmatrix:
        mov     %rdi, matrixstart(%rip)
        push    %rbx                    // rbx = matrix
        push    %r13                    // r13 = row index
        push    %r14                    // r14 = column index; three pushes leave rsp 16-byte aligned
        mov     %rdi, %rbx
        xor     %r13d, %r13d

.Lprintmatrix_row:
        cmp     16(%rbx), %r13
        jge     .Lprintmatrix_done      // all rows printed
        xor     %r14d, %r14d
        jmp     .Lprintmatrix_col_check // test before the first element so 0 columns prints nothing

.Lprintmatrix_col:
        lea     (%r13,%r14,4), %rax     // element index = column*4 + row
        mov     (%rbx), %rdx
        movsd   (%rdx,%rax,8), %xmm0
        lea     format(%rip), %rdi
        mov     $1, %eax                // variadic call: one vector register carries an argument
        call    printf@PLT
        inc     %r14
.Lprintmatrix_col_check:
        cmp     24(%rbx), %r14
        jl      .Lprintmatrix_col

        lea     newl(%rip), %rdi
        xor     %eax, %eax              // variadic call: no vector arguments
        call    printf@PLT
        inc     %r13
        jmp     .Lprintmatrix_row

.Lprintmatrix_done:
        pop     %r14
        pop     %r13
        pop     %rbx
        ret
//-----------------------------------------------------------------------
// identitymatrix: overwrite matrix rdi with the identmatrix table.
// In:   rdi = matrix pseudo-struct
// Out:  none
// Hands the table address to forcematrix in rax; forcematrix returns
// straight to our caller.
//-----------------------------------------------------------------------
identitymatrix:
        leaq    identmatrix(%rip), %rax // rax = source table expected by forcematrix
        jmp     forcematrix             // tail call
//-----------------------------------------------------------------------
// forcematrix: overwrite a matrix with a fixed table of 16 doubles.
// In:   rdi = destination matrix pseudo-struct
//       rax = address of 16 consecutive doubles to copy (note: passed in
//             rax, which is not a standard argument register)
// Out:  none; rax is left unchanged
//
// Sets the column count (offset 8) and the used-column count (offset 24)
// to 4, then copies 128 bytes from rax into the element buffer.
// The element buffer is not reallocated here, so it must already have
// room for 16 doubles.
// Clobbers: rdi (left pointing at the element buffer), ymm0-ymm3.
// Requires AVX.
//-----------------------------------------------------------------------
forcematrix:
        movq    $4, 8(%rdi)
        movq    $4, 24(%rdi)
        mov     (%rdi), %rdi            // rdi = element buffer

        vmovupd (%rax), %ymm0           // load all four 32-byte groups first
        vmovupd 32(%rax), %ymm1
        vmovupd 64(%rax), %ymm2
        vmovupd 96(%rax), %ymm3
        vmovupd %ymm0, (%rdi)
        vmovupd %ymm1, 32(%rdi)
        vmovupd %ymm2, 64(%rdi)
        vmovupd %ymm3, 96(%rdi)

        vzeroupper                      // avoid AVX/SSE transition penalties in the caller
        ret
//-----------------------------------------------------------------------
// multiplymatrix: in-place product of a 4x4 matrix with the columns of a
// second matrix.
// ABI:  System V AMD64
// In:   rdi = matrix1 pseudo-struct (its first 16 doubles are used)
//       rsi = matrix2 pseudo-struct
// Out:  none; matrix2 is overwritten with the result
//
// With both element buffers read as 4 doubles per column, every used
// column v of matrix2 (count taken from offset 24) is replaced by the
// four dot products of v with the strided vectors
// (m[i], m[i+4], m[i+8], m[i+12]), i = 0..3, of matrix1.
// If matrix2 has no used columns nothing is read or written.
//
// Clobbers: rax, rdx, r10, ymm0-ymm12, flags.
// Requires AVX2 (vgatherqpd, 256-bit vpcmpeqq).
//-----------------------------------------------------------------------
multiplymatrix:
        mov     24(%rsi), %r10          // r10 = columns of matrix2 still to process
        test    %r10, %r10
        jle     .Lmultiplymatrix_done   // empty matrix2: leave its buffer alone

        mov     (%rdi), %rax            // rax = matrix1 elements
        mov     (%rsi), %rdx            // rdx = current column of matrix2

        // Gather the four strided vectors of matrix1 into ymm0..ymm3.
        // Each gather clears its mask register, so the all-ones mask kept
        // in ymm8 is copied afresh before every gather.
        vmovdqu indices(%rip), %ymm4
        vpcmpeqq %ymm8, %ymm8, %ymm8
        vmovapd %ymm8, %ymm3
        vgatherqpd %ymm3, (%rax,%ymm4,8), %ymm0
        vmovapd %ymm8, %ymm3
        vgatherqpd %ymm3, 8(%rax,%ymm4,8), %ymm1
        vmovapd %ymm8, %ymm3
        vgatherqpd %ymm3, 16(%rax,%ymm4,8), %ymm2
        vmovapd %ymm8, %ymm5
        vgatherqpd %ymm5, 24(%rax,%ymm4,8), %ymm3

.Lmultiplymatrix_col:
        vmovupd (%rdx), %ymm5           // one column of matrix2
        // element-wise products with each strided vector
        vmulpd  %ymm0, %ymm5, %ymm9
        vmulpd  %ymm1, %ymm5, %ymm10
        vmulpd  %ymm2, %ymm5, %ymm11
        vmulpd  %ymm3, %ymm5, %ymm12
        // horizontal sums: add the high 128-bit half onto the low half ...
        vextractf128 $1, %ymm9, %xmm5
        vextractf128 $1, %ymm10, %xmm6
        vextractf128 $1, %ymm11, %xmm7
        vextractf128 $1, %ymm12, %xmm8
        vaddpd  %xmm9, %xmm5, %xmm9
        vaddpd  %xmm10, %xmm6, %xmm10
        vaddpd  %xmm11, %xmm7, %xmm11
        vaddpd  %xmm12, %xmm8, %xmm12
        // ... then add the upper double onto the lower double
        vunpckhpd %xmm9, %xmm9, %xmm5
        vunpckhpd %xmm10, %xmm10, %xmm6
        vunpckhpd %xmm11, %xmm11, %xmm7
        vunpckhpd %xmm12, %xmm12, %xmm8
        vaddsd  %xmm5, %xmm9, %xmm9
        vaddsd  %xmm6, %xmm10, %xmm10
        vaddsd  %xmm7, %xmm11, %xmm11
        vaddsd  %xmm8, %xmm12, %xmm12
        // write the four dot products back over the column
        vmovsd  %xmm9, (%rdx)
        vmovsd  %xmm10, 8(%rdx)
        vmovsd  %xmm11, 16(%rdx)
        vmovsd  %xmm12, 24(%rdx)

        add     $32, %rdx               // next column (4 doubles)
        dec     %r10
        jnz     .Lmultiplymatrix_col

        vzeroupper                      // avoid AVX/SSE transition penalties in the caller
.Lmultiplymatrix_done:
        ret