0
|
1 // types:
|
|
2
|
|
/* Aggregate of two int members. */
typedef struct {
    int x;
    int y;
} s2i;

/* Aggregate of four int members. */
typedef struct {
    int x;
    int y;
    int z;
    int w;
} s4i;

/* Aggregate of eight int members. */
typedef struct {
    int x;
    int y;
    int z;
    int w;
    int a;
    int b;
    int c;
    int d;
} s8i;
|
|
6
|
|
/* Aggregate of two float members. */
typedef struct {
    float x;
    float y;
} s2f;

/* Aggregate of four float members. */
typedef struct {
    float x;
    float y;
    float z;
    float w;
} s4f;
|
|
9
|
|
/* Aggregate of a single double. */
typedef struct {
    double x;
} s1d;

/* Aggregate of two double members. */
typedef struct {
    double x;
    double y;
} s2d;

/* Aggregate of four double members. */
typedef struct {
    double x;
    double y;
    double z;
    double w;
} s4d;
|
|
13
|
|
/* Aggregate of a single long double. */
typedef struct {
    long double x;
} s1q;
|
|
15
|
|
16 #if defined(__ARM_NEON__)
|
|
17 #include <arm_neon.h>
|
|
18 #endif
|
|
19
|
|
/*
 * x86 SIMD detection.
 *
 * Includes an intrinsics header and, depending on the instruction-set macros
 * predefined by the compiler, defines:
 *   V64  + v64_t   when __MMX__ is defined,
 *   V128 + v128_t  when __SSE__ is defined,
 *   V256 + v256_t  when __AVX__ is defined (immintrin.h branch only).
 *
 * The architecture test is parenthesised explicitly: in a #if expression
 * `&&` binds tighter than `||`, so without the parentheses an i386 target
 * would enter this section even when neither MMX nor SSE is available.
 */
#if (defined(__i386__) || defined(__x86_64__)) && (defined(__MMX__) || defined(__SSE__))

/*
 * immintrin.h is used for GCC releases newer than 4.2.  Major and minor are
 * compared together so that releases such as 5.0 or 13.2 (minor <= 2) are
 * still recognised as newer.
 */
#if defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 2))

#include <immintrin.h>

#if defined(__AVX__)
#define V256 1
typedef __m256 v256_t;
#endif

#else

#include <emmintrin.h>

#endif

#if defined(__MMX__)
#define V64 1
typedef __m64 v64_t;
#endif

#if defined(__SSE__)
#define V128 1
typedef __m128 v128_t;
#endif

#endif
|
|
47
|
|
#if V64

/* Aggregates wrapping one, two and four 64-bit vector members. */
typedef struct {
    v64_t f1;
} s1w;

typedef struct {
    v64_t f1;
    v64_t f2;
} s2w;

typedef struct {
    v64_t f1;
    v64_t f2;
    v64_t f3;
    v64_t f4;
} s4w;

#endif
|
|
55
|
|
#if V128

/* Aggregates wrapping one and two 128-bit vector members. */
typedef struct {
    v128_t f1;
} s1x;

typedef struct {
    v128_t f1;
    v128_t f2;
} s2x;

/* Mixed aggregate: a 64-bit vector followed by a 128-bit vector. */
typedef struct {
    v64_t  f1;
    v128_t f2;
} swx;

#endif
|
|
64
|
|
65 // x86_64:
|
|
66
|
|
67 /*
|
|
68 x86_64
|
|
69 ------------
|
|
70 (v{ii}) regs int 8
|
|
71 (v{iiii}) regs int 16
|
|
72 (v{iiiiiiii}) stack 32
|
|
73 (v{ff}) regs sse 8
|
|
74 (v{ffff}) regs sse 16
|
|
75 (v{d}) regs sse 8
|
|
76 (v{dd}) regs sse 16
|
|
77 (v{dddd}) stack 32
|
|
78 (v{w}) regs sse 8
|
|
79 (v{ww}) regs sse 16
|
|
80 (v{wwww}) stack 32
|
|
81 (v{x}) regs sse 16
|
|
82 (v{xx}) stack 32
|
|
83 (v{wx}) stack 32
|
|
84 (v{xw}) stack 32
|
|
85 (v{y}) stack 32
|
|
86 (v{yy}{x}) stack 32
|
|
87 (v{ifif})
|
|
88 */
|
|
89
|
|
#if V256

/* Aggregates wrapping one and two 256-bit vector members. */
typedef struct {
    v256_t f1;
} s1y;

typedef struct {
    v256_t f1;
    v256_t f2;
} s2y;

/* Alias for the bare 256-bit vector type (not wrapped in a struct). */
typedef v256_t y;

/* Passes a zero-initialized bare 256-bit vector by value to f_y. */
extern void f_y(y a1);
void call_f_y()
{
    y vec = {0};
    f_y(vec);
}

/* Passes a two-vector 256-bit struct followed by a one-vector 128-bit struct. */
extern void f_s2ys1x(s2y a1, s1x a2);
void call_f_s2ys1x()
{
    s2y first = {0};
    s1x second = {0};
    f_s2ys1x(first, second);
}

/* Same two aggregates as above, in the opposite argument order. */
extern void f_s1xs2y(s1x a1, s2y a2);
void call_f_s1xs2y()
{
    s1x first = {0};
    s2y second = {0};
    f_s1xs2y(first, second);
}

/* Passes a single one-vector 256-bit struct. */
extern void f_s1y(s1y a1);
void call_f_s1y()
{
    s1y arg = {0};
    f_s1y(arg);
}

/* Passes a single two-vector 256-bit struct. */
extern void f_s2y(s2y a1);
void call_f_s2y()
{
    s2y arg = {0};
    f_s2y(arg);
}

/* Passes two one-vector 256-bit structs. */
extern void f_s1ys1y(s1y a1, s1y a2);
void call_f_s1ys1y()
{
    s1y first = {0};
    s1y second = {0};
    f_s1ys1y(first, second);
}

/* Passes a two-vector 256-bit struct followed by a one-vector 256-bit struct. */
extern void f_s2ys1y(s2y a1, s1y a2);
void call_f_s2ys1y()
{
    s2y first = {0};
    s1y second = {0};
    f_s2ys1y(first, second);
}

#endif
|
|
110
|
|
#if V128

/* Passes the mixed 64-bit/128-bit vector struct by value to f_swx. */
extern void f_swx(swx a1);
void call_f_swx()
{
    swx arg = {0};
    f_swx(arg);
}

/* Passes a struct of two 128-bit vectors by value to f_s2x. */
extern void f_s2x(s2x a1);
void call_f_s2x()
{
    s2x arg = {0};
    f_s2x(arg);
}

/* Passes a struct of one 128-bit vector by value to f_s1x. */
extern void f_s1x(s1x a1);
void call_f_s1x()
{
    s1x arg = {0};
    f_s1x(arg);
}

#endif
|
|
119
|
|
#if V64

/* Passes a struct of four 64-bit vectors by value to f_s4w. */
extern void f_s4w(s4w a1);
void call_f_s4w()
{
    s4w arg = {0};
    f_s4w(arg);
}

/* Passes a struct of two 64-bit vectors by value to f_s2w. */
extern void f_s2w(s2w a1);
void call_f_s2w()
{
    s2w arg = {0};
    f_s2w(arg);
}

/* Passes a struct of one 64-bit vector by value to f_s1w. */
extern void f_s1w(s1w a1);
void call_f_s1w()
{
    s1w arg = {0};
    f_s1w(arg);
}

#endif
|
|
127
|
|
128 // one composite argument:
|
|
129
|
|
/* Passes a zero-initialized two-int struct by value to f_s2i. */
extern void f_s2i(s2i a1);
void call_f_s2i()
{
    s2i arg = {0};
    f_s2i(arg);
}

/* Passes a zero-initialized four-int struct by value to f_s4i. */
extern void f_s4i(s4i a1);
void call_f_s4i()
{
    s4i arg = {0};
    f_s4i(arg);
}

/* Passes a zero-initialized eight-int struct by value to f_s8i. */
extern void f_s8i(s8i a1);
void call_f_s8i()
{
    s8i arg = {0};
    f_s8i(arg);
}
|
|
133
|
|
/* Passes a zero-initialized two-float struct by value to f_s2f. */
extern void f_s2f(s2f a1);
void call_f_s2f()
{
    s2f arg = {0};
    f_s2f(arg);
}

/* Passes a zero-initialized four-float struct by value to f_s4f. */
extern void f_s4f(s4f a1);
void call_f_s4f()
{
    s4f arg = {0};
    f_s4f(arg);
}
|
|
136
|
|
/* Passes a zero-initialized one-double struct by value to f_s1d. */
extern void f_s1d(s1d a1);
void call_f_s1d()
{
    s1d arg = {0};
    f_s1d(arg);
}

/* Passes a zero-initialized two-double struct by value to f_s2d. */
extern void f_s2d(s2d a1);
void call_f_s2d()
{
    s2d arg = {0};
    f_s2d(arg);
}

/* Passes a zero-initialized four-double struct by value to f_s4d. */
extern void f_s4d(s4d a1);
void call_f_s4d()
{
    s4d arg = {0};
    f_s4d(arg);
}
|
|
140
|
|
/* Passes a zero-initialized one-long-double struct by value to f_s1q. */
extern void f_s1q(s1q a1);
void call_f_s1q()
{
    s1q arg = {0};
    f_s1q(arg);
}
|
|
142
|
|
143
|
|
144 // multiple arguments:
|
|
145
|
|
146 extern void f_s2ds2d(s2d a1, s2d a2);
|
|
147 void call_f_s2ds2d() {
|
|
148 s2d a1={0,};
|
|
149 s2d a2={0,};
|
|
150 f_s2ds2d(a1,a2);
|
|
151 }
|
|
152
|
|
/* Aggregate alternating float and int members, starting with a float. */
typedef struct {
    float x;
    int   y;
    float z;
    int   w;
} sfifi;

/* Passes an sfifi holding the values 0, 1, 2, 3 by value to f_sfifi. */
extern void f_sfifi(sfifi a1);
void call_f_sfifi()
{
    sfifi arg = { .x = 0, .y = 1, .z = 2, .w = 3 };
    f_sfifi(arg);
}
|
|
156
|
|
157 // int/float 4-byte alignment.
|
|
158
|
|
/* Aggregate alternating int and float members, starting with an int. */
typedef struct {
    int   x;
    float y;
    int   z;
    float w;
} sifif;

/* Passes an sifif holding the values 0, 1, 2, 3 by value to f_sifif. */
extern void f_sifif(sifif a1);
void call_f_sifif()
{
    sifif arg = { .x = 0, .y = 1, .z = 2, .w = 3 };
    f_sifif(arg);
}
|
|
162
|
|
/*
 * Leading scalar parameter lists (types and matching values) that are placed
 * in front of the struct argument.
 * NOTE(review): presumably FULL_* supplies enough integer arguments to use up
 * the integer argument registers and REMAIN64_* only some of them - confirm
 * against the target calling convention.
 */
#define FULL_T char, short, int, long int, char, char, char, char
#define FULL_V 0,1,2,3,4,5,6,7

#define REMAIN64_T char, char
#define REMAIN64_V 0,1

/* Passes the eight FULL_V scalars followed by a zero-initialized sifif. */
extern void f_full_sifif(FULL_T, sifif a1);
void call_f_full_sifif()
{
    sifif arg = {0};
    f_full_sifif(FULL_V, arg);
}

/* Passes the two REMAIN64_V scalars followed by a zero-initialized sifif. */
extern void f_remain64_sifif(REMAIN64_T, sifif a1);
void call_f_remain64_sifif()
{
    sifif arg = {0};
    f_remain64_sifif(REMAIN64_V, arg);
}
|
|
170
|
|
/* Aggregate of three floats followed by a char. */
typedef struct {
    float f1;
    float f2;
    float f3;
    char  f4;
} sfffc;

/* Passes an sfffc holding the values 0, 1, 2, 3 by value to f_sfffc. */
extern void f_sfffc(sfffc a1);
void call_f_sfffc()
{
    sfffc arg = { .f1 = 0, .f2 = 1, .f3 = 2, .f4 = 3 };
    f_sfffc(arg);
}
|
|
174
|
|
/* Aggregate of a char followed by three floats. */
typedef struct {
    char  f1;
    float f2;
    float f3;
    float f4;
} scfff;

/* Passes an scfff holding the values 0, 1, 2, 3 by value to f_scfff. */
extern void f_scfff(scfff a1);
void call_f_scfff()
{
    scfff arg = { .f1 = 0, .f2 = 1, .f3 = 2, .f4 = 3 };
    f_scfff(arg);
}
|
|
178
|