Vector Assembly Code Examples
The following are provided as non-normative text to help explain the vector ISA.
Vector-vector add example
# vector-vector add routine of 32-bit integers
# void vvaddint32(size_t n, const int*x, const int*y, int*z)
# { for (size_t i=0; i<n; i++) { z[i]=x[i]+y[i]; } }
#
# a0 = n, a1 = x, a2 = y, a3 = z
# Non-vector instructions are indented
#
# Each pass through the loop handles t0 elements, where t0 is the vector
# length vsetvli grants for the a0 elements still outstanding.  t0 is then
# rescaled in place from an element count to a byte count and reused for
# all three pointer bumps, so the subtraction from a0 has to come before
# the shift.
vvaddint32:
    vsetvli t0, a0, e32, m1, ta, ma  # Set vector length based on 32-bit vectors
    vle32.v v0, (a1)         # Get first vector (x)
      sub a0, a0, t0         # Subtract elements handled from the remaining count
      slli t0, t0, 2         # Elements handled -> bytes handled (4 bytes each)
      add a1, a1, t0         # Bump x pointer
    vle32.v v1, (a2)         # Get second vector (y)
      add a2, a2, t0         # Bump y pointer
    vadd.vv v2, v0, v1       # Sum vectors
    vse32.v v2, (a3)         # Store result to z
      add a3, a3, t0         # Bump z pointer
      bnez a0, vvaddint32    # Loop back while elements remain
      ret                    # Finished
Example with mixed-width mask and compute.
# Code using one width for predicate and different width for masked
# compute.
#   int8_t a[]; int32_t b[], c[];
#   for (i=0; i<n; i++) { b[i] = (a[i] < 5) ? c[i] : 1; }
#
# Register use, as read from the loop body:
#   a0 = elements remaining
#   a1 = a (int8 input), a2 = b (int32 output), a3 = c (int32 input)
#   a4 = elements handled this pass, t1 = the same count scaled to bytes
#
# Mixed-width code that keeps SEW/LMUL=8: e8/m1 for the predicate and
# e32/m4 for the data, so both vsetvli instructions are configured with
# the same element-to-register ratio.
loop:
    vsetvli a4, a0, e8, m1, ta, ma   # Byte vector for predicate calc
    vle8.v v1, (a1)               # Load a[i]
      add a1, a1, a4              # Bump pointer (1 byte per element).
    vmslt.vi v0, v1, 5            # a[i] < 5?  Result is the mask in v0

    vsetvli x0, a0, e32, m4, ta, mu  # Vector of 32-bit values; reads a0, so it
                                  #   must precede the decrement below.
      sub a0, a0, a4              # Decrement count
    vmv.v.i v4, 1                 # Splat immediate to destination
    vle32.v v4, (a3), v0.t        # Load requested elements of C, others undisturbed
      slli t1, a4, 2              # Element count -> byte count (4 bytes per int32)
      add a3, a3, t1              # Bump pointer.
    vse32.v v4, (a2)              # Store b[i].
      add a2, a2, t1              # Bump pointer.
      bnez a0, loop               # Any more?
Memcpy example
# void *memcpy(void* dest, const void* src, size_t n)
# a0=dest, a1=src, a2=n
#
# a0 is never written inside the routine, so it still holds dest at the
# ret; the running store pointer lives in a3 instead.
memcpy:
      mv a3, a0                   # Copy destination
loop:
    vsetvli t0, a2, e8, m8, ta, ma   # Vectors of 8b; t0 = bytes handled this pass
    vle8.v v0, (a1)               # Load bytes
      add a1, a1, t0              # Bump source pointer
      sub a2, a2, t0              # Decrement count
    vse8.v v0, (a3)               # Store bytes
      add a3, a3, t0              # Bump destination pointer
      bnez a2, loop               # Any more?
      ret                         # Return
Conditional example
# (int16) z[i] = ((int8) x[i] < 5) ? (int16) a[i] : (int16) b[i];
#
# Register use, as read from the loop body:
#   a0 = elements remaining
#   a1 = x (int8), a2 = a (int16), a3 = b (int16), a4 = z (int16 output)
#   t0 = elements handled this pass, later doubled into a byte count
#
# v2 is filled by two masked loads under complementary masks (v0, then
# its inverse), so the order mask -> load a -> invert -> load b matters.
loop:
    vsetvli t0, a0, e8, m1, ta, ma # Use 8b elements.
    vle8.v v0, (a1)         # Get x[i]
      sub a0, a0, t0        # Decrement element count
      add a1, a1, t0        # x[i] Bump pointer
    vmslt.vi v0, v0, 5      # Set mask in v0
    vsetvli x0, x0, e16, m2, ta, mu  # Use 16b elements (x0, x0 form: vl is not re-requested).
      slli t0, t0, 1        # Multiply by 2 bytes
    vle16.v v2, (a2), v0.t  # z[i] = a[i] case
    vmnot.m v0, v0          # Invert v0
      add a2, a2, t0        # a[i] bump pointer
    vle16.v v2, (a3), v0.t  # z[i] = b[i] case
      add a3, a3, t0        # b[i] bump pointer
    vse16.v v2, (a4)        # Store z
      add a4, a4, t0        # z[i] bump pointer
      bnez a0, loop         # Repeat while elements remain
SAXPY example
# void
# saxpy(size_t n, const float a, const float *x, float *y)
# {
#   size_t i;
#   for (i=0; i<n; i++)
#     y[i] = a * x[i] + y[i];
# }
#
# register arguments:
#     a0      n
#     fa0     a
#     a1      x
#     a2      y
#
# a4 holds the number of elements handled in the current pass and is then
# rescaled in place to a byte count for the two pointer bumps.

saxpy:
    vsetvli a4, a0, e32, m8, ta, ma   # 32-bit elements, LMUL=8; a4 = elements this pass
    vle32.v v0, (a1)                  # Load x[i]
    sub a0, a0, a4                    # Decrement remaining element count
    slli a4, a4, 2                    # Elements -> bytes (4 bytes per float)
    add a1, a1, a4                    # Bump x pointer
    vle32.v v8, (a2)                  # Load y[i]
    vfmacc.vf v8, fa0, v0             # v8 = a * x[i] + y[i]
    vse32.v v8, (a2)                  # Store y[i] back in place
    add a2, a2, a4                    # Bump y pointer (after the store to the old address)
    bnez a0, saxpy                    # Repeat while elements remain
    ret
SGEMM example
# RV64IDV system
#
# void
# sgemm_nn(size_t n,
#          size_t m,
#          size_t k,
#          const float*a,   // m * k matrix
#          size_t lda,
#          const float*b,   // k * n matrix
#          size_t ldb,
#          float*c,         // m * n matrix
#          size_t ldc)
#
#  c += a*b (alpha=1, no transpose on input matrices)
#  matrices stored in C row-major order
#
# FRAMESIZE and OFFSET are left symbolic in this example.
# NOTE(review): every save/restore below is written with the same OFFSET
# symbol; presumably each needs its own slot in a real frame -- confirm.

#define n a0
#define m a1
#define k a2
#define ap a3
#define astride a4
#define bp a5
#define bstride a6
#define cp a7
#define cstride t0
#define kt t1
#define nt t2
#define bnp t3
#define cnp t4
#define akp t5
#define bkp s0
#define nvl s1
#define ccp s2
#define amp s3

# Use args as additional temporaries
#define ft12 fa0
#define ft13 fa1
#define ft14 fa2
#define ft15 fa3

# This version holds a 16*VLMAX block of C matrix in vector registers
# in inner loop, but otherwise does not cache or TLB tiling.

sgemm_nn:
    addi sp, sp, -FRAMESIZE
    sd s0, OFFSET(sp)
    sd s1, OFFSET(sp)
    sd s2, OFFSET(sp)
    sd s3, OFFSET(sp)        # amp lives in s3, which is callee-saved like s0-s2

    # Check for zero size matrices
    beqz n, exit
    beqz m, exit
    beqz k, exit

    # Convert elements strides to byte strides.
    ld cstride, OFFSET(sp)   # Get arg from stack frame
    slli astride, astride, 2
    slli bstride, bstride, 2
    slli cstride, cstride, 2

    slti t6, m, 16           # Fewer than 16 rows left?
    bnez t6, end_rows        #   then skip the 16-row blocked loop

c_row_loop: # Loop across rows of C blocks

    mv nt, n  # Initialize n counter for next row of C blocks

    mv bnp, bp # Initialize B n-loop pointer to start
    mv cnp, cp # Initialize C n-loop pointer

c_col_loop: # Loop across one row of C blocks
    vsetvli nvl, nt, e32, m1, ta, ma  # 32-bit vectors, LMUL=1

    mv akp, ap   # reset pointer into A to beginning
    mv bkp, bnp  # step to next column in B matrix

    # Initialize current C submatrix block from memory.
    # One vector register per row: v0..v15 hold 16 consecutive rows.
    vle32.v  v0, (cnp); add ccp, cnp, cstride;
    vle32.v  v1, (ccp); add ccp, ccp, cstride;
    vle32.v  v2, (ccp); add ccp, ccp, cstride;
    vle32.v  v3, (ccp); add ccp, ccp, cstride;
    vle32.v  v4, (ccp); add ccp, ccp, cstride;
    vle32.v  v5, (ccp); add ccp, ccp, cstride;
    vle32.v  v6, (ccp); add ccp, ccp, cstride;
    vle32.v  v7, (ccp); add ccp, ccp, cstride;
    vle32.v  v8, (ccp); add ccp, ccp, cstride;
    vle32.v  v9, (ccp); add ccp, ccp, cstride;
    vle32.v v10, (ccp); add ccp, ccp, cstride;
    vle32.v v11, (ccp); add ccp, ccp, cstride;
    vle32.v v12, (ccp); add ccp, ccp, cstride;
    vle32.v v13, (ccp); add ccp, ccp, cstride;
    vle32.v v14, (ccp); add ccp, ccp, cstride;
    vle32.v v15, (ccp)

    mv kt, k # Initialize inner loop counter

    # Inner loop scheduled assuming 4-clock occupancy of vfmacc instruction and single-issue pipeline
    # Software pipeline loads: the first four A scalars are fetched ahead
    # of the loop, and refetched for the next pass at the bottom of it.
    flw ft0, (akp); add amp, akp, astride;
    flw ft1, (amp); add amp, amp, astride;
    flw ft2, (amp); add amp, amp, astride;
    flw ft3, (amp); add amp, amp, astride;
    # Get vector from B matrix
    vle32.v v16, (bkp)

    # Loop on inner dimension for current C block
k_loop:
    vfmacc.vf v0, ft0, v16
    add bkp, bkp, bstride
    flw ft4, (amp)
    add amp, amp, astride
    vfmacc.vf v1, ft1, v16
    addi kt, kt, -1    # Decrement k counter
    flw ft5, (amp)
    add amp, amp, astride
    vfmacc.vf v2, ft2, v16
    flw ft6, (amp)
    add amp, amp, astride
    flw ft7, (amp)
    vfmacc.vf v3, ft3, v16
    add amp, amp, astride
    flw ft8, (amp)
    add amp, amp, astride
    vfmacc.vf v4, ft4, v16
    flw ft9, (amp)
    add amp, amp, astride
    vfmacc.vf v5, ft5, v16
    flw ft10, (amp)
    add amp, amp, astride
    vfmacc.vf v6, ft6, v16
    flw ft11, (amp)
    add amp, amp, astride
    vfmacc.vf v7, ft7, v16
    flw ft12, (amp)
    add amp, amp, astride
    vfmacc.vf v8, ft8, v16
    flw ft13, (amp)
    add amp, amp, astride
    vfmacc.vf v9, ft9, v16
    flw ft14, (amp)
    add amp, amp, astride
    vfmacc.vf v10, ft10, v16
    flw ft15, (amp)
    add amp, amp, astride
    addi akp, akp, 4            # Move to next column of a
    vfmacc.vf v11, ft11, v16
    beqz kt, 1f                 # Don't load past end of matrix
    flw ft0, (akp)
    add amp, akp, astride
1:  vfmacc.vf v12, ft12, v16
    beqz kt, 1f
    flw ft1, (amp)
    add amp, amp, astride
1:  vfmacc.vf v13, ft13, v16
    beqz kt, 1f
    flw ft2, (amp)
    add amp, amp, astride
1:  vfmacc.vf v14, ft14, v16
    beqz kt, 1f                 # Exit out of loop
    flw ft3, (amp)
    add amp, amp, astride
    vfmacc.vf v15, ft15, v16
    vle32.v v16, (bkp)            # Get next vector from B matrix, overlap loads with jump stalls
    j k_loop

1:  vfmacc.vf v15, ft15, v16    # Final accumulate on the exit path

    # Save C matrix block back to memory
    vse32.v  v0, (cnp); add ccp, cnp, cstride;
    vse32.v  v1, (ccp); add ccp, ccp, cstride;
    vse32.v  v2, (ccp); add ccp, ccp, cstride;
    vse32.v  v3, (ccp); add ccp, ccp, cstride;
    vse32.v  v4, (ccp); add ccp, ccp, cstride;
    vse32.v  v5, (ccp); add ccp, ccp, cstride;
    vse32.v  v6, (ccp); add ccp, ccp, cstride;
    vse32.v  v7, (ccp); add ccp, ccp, cstride;
    vse32.v  v8, (ccp); add ccp, ccp, cstride;
    vse32.v  v9, (ccp); add ccp, ccp, cstride;
    vse32.v v10, (ccp); add ccp, ccp, cstride;
    vse32.v v11, (ccp); add ccp, ccp, cstride;
    vse32.v v12, (ccp); add ccp, ccp, cstride;
    vse32.v v13, (ccp); add ccp, ccp, cstride;
    vse32.v v14, (ccp); add ccp, ccp, cstride;
    vse32.v v15, (ccp)

    # Following tail instructions should be scheduled earlier in free slots during C block save.
    # Leaving here for clarity.

    # Bump pointers for loop across blocks in one row
    slli t6, nvl, 2         # Columns handled -> bytes (4 bytes per float)
    add cnp, cnp, t6        # Move C block pointer over
    add bnp, bnp, t6        # Move B block pointer over
    sub nt, nt, nvl         # Decrement element count in n dimension
    bnez nt, c_col_loop     # Any more to do?

    # Move to next set of rows
    addi m, m, -16          # Did 16 rows above
    slli t6, astride, 4     # Multiply astride by 16
    add ap, ap, t6          # Move A matrix pointer down 16 rows
    slli t6, cstride, 4     # Multiply cstride by 16
    add cp, cp, t6          # Move C matrix pointer down 16 rows

    slti t6, m, 16
    beqz t6, c_row_loop     # At least 16 rows left: do another block row

    # Handle end of matrix with fewer than 16 rows.
    # Can use smaller versions of above decreasing in powers-of-2 depending on code-size concerns.
end_rows:
    # Not done.

exit:
    ld s0, OFFSET(sp)
    ld s1, OFFSET(sp)
    ld s2, OFFSET(sp)
    ld s3, OFFSET(sp)        # Restore amp's register, saved in the prologue
    addi sp, sp, FRAMESIZE
    ret
Division approximation example
# v1 = v1 / v2 to almost 23 bits of precision.
#
# Starts from the vfrec7.v estimate of 1/v2 and refines it twice; each
# refinement computes e = 1.0 - v2*est and then est = est + est*e.
# v3 holds the running estimate, v4 is scratch, t0 holds the 1.0 bit pattern.
# NOTE(review): no vsetvli is shown; the 0x3f800000 constant implies the
# caller has already selected 32-bit elements -- confirm.

    vfrec7.v v3, v2             # Estimate 1/v2
      li t0, 0x3f800000         # Bit pattern of 1.0 (single precision)
    vmv.v.x v4, t0              # Splat 1.0
    vfnmsac.vv v4, v2, v3       # 1.0 - v2 * est(1/v2)
    vfmadd.vv v3, v4, v3        # Better estimate of 1/v2
    vmv.v.x v4, t0              # Splat 1.0
    vfnmsac.vv v4, v2, v3       # 1.0 - v2 * est(1/v2)
    vfmadd.vv v3, v4, v3        # Better estimate of 1/v2
    vfmul.vv v1, v1, v3         # Estimate of v1/v2
Square root approximation example
# v1 = sqrt(v1) to more than 23 bits of precision.
#
# Works on an estimate r of 1/sqrt(v1), refines r once, then refines
# t = v1*r as the square root itself.  Every arithmetic step runs under
# the mask in v0, which is narrowed twice at the top.
# ft0 is used first as 0.0 for the mask compares, then reloaded with 0.5.
# NOTE(review): no vsetvli is shown; the 0x3f800000 constant implies
# 32-bit elements are already selected -- confirm.
# NOTE(review): fli.s is not a base F-extension instruction (presumably
# Zfa) -- confirm the target provides it.

      fmv.w.x ft0, x0           # ft0 = 0.0; used to mask off zero inputs
    vmfne.vf v0, v1, ft0        #   to avoid DZ exception
    vfrsqrt7.v v2, v1, v0.t     # Estimate r ~= 1/sqrt(v1)
    vmfne.vf v0, v2, ft0, v0.t  # Mask off +inf to avoid NV
      li t0, 0x3f800000         # Bit pattern of 1.0 (single precision)
      fli.s ft0, 0.5            # ft0 now holds 0.5
    vmv.v.x v5, t0              # Splat 1.0
    vfmul.vv v3, v1, v2, v0.t   # t = v1 r
    vfmul.vf v4, v2, ft0, v0.t  # 0.5 r
    vfmsub.vv v3, v2, v5, v0.t  # t r - 1
    vfnmsac.vv v2, v3, v4, v0.t # r - (0.5 r) (t r - 1)
                                # Better estimate of 1/sqrt(v1)
    vfmul.vv v1, v1, v2, v0.t   # t = v1 r
    vfmsub.vv v2, v1, v5, v0.t  # t r - 1
    vfmul.vf v3, v1, ft0, v0.t  # 0.5 t
    vfnmsac.vv v1, v2, v3, v0.t # t - (0.5 t) (t r - 1)
                                # ~ sqrt(v1) to about 23.3 bits
C standard library strcmp example
# int strcmp(const char *src1, const char* src2)
#
# a0 = src1, a1 = src2 on entry; a0 = (src1 byte) - (src2 byte) at the
# first stopping position on return, with both bytes loaded by lbu.
# t1 carries the number of bytes covered by the previous pass and is added
# to both pointers at the top of the next one (0 on the first pass).
strcmp:
    ##  Using LMUL=2, but same register names work for larger LMULs
    li t1, 0                # Initial pointer bump
loop:
    vsetvli t0, x0, e8, m2, ta, ma  # Max length vectors of bytes
    add a0, a0, t1          # Bump src1 pointer
    vle8ff.v v8, (a0)       # Get src1 bytes
    add a1, a1, t1          # Bump src2 pointer
    vle8ff.v v16, (a1)      # Get src2 bytes

    vmseq.vi v0, v8, 0      # Flag zero bytes in src1
    vmsne.vv v1, v8, v16    # Flag if src1 != src2
    vmor.mm v0, v0, v1      # Combine exit conditions

    vfirst.m a2, v0         # ==0 or != ?  a2 = index of first flagged byte
    csrr t1, vl             # Get number of bytes fetched

    bltz a2, loop           # Loop if all same and no zero byte

    add a0, a0, a2          # Get src1 element address
    lbu a3, (a0)            # Get src1 byte from memory

    add a1, a1, a2          # Get src2 element address
    lbu a4, (a1)            # Get src2 byte from memory

    sub a0, a3, a4          # Return value.

    ret
Fractional LMUL example
This appendix presents a non-normative example to help explain where compilers can make good use of the fractional LMUL feature.
Consider the following (admittedly contrived) loop written in C:
/*
 * Reference loop for the fractional-LMUL discussion.
 *
 * For each index below N it performs one 8-bit addition (c_c = c_a + c_b)
 * and four independent 64-bit additions (l_c, l_f, l_i, l_l), then folds
 * the four 64-bit sums into l_m as
 *     l_m[i] = l_m[i] + (l_m[i] + l_c[i] + l_f[i] + l_i[i] + l_l[i]).
 * All pointers are restrict-qualified, so the arrays must not overlap.
 */
void add_ref(long N,
             signed char *restrict c_c, signed char *restrict c_a, signed char *restrict c_b,
             long *restrict l_c, long *restrict l_a, long *restrict l_b,
             long *restrict l_d, long *restrict l_e, long *restrict l_f,
             long *restrict l_g, long *restrict l_h, long *restrict l_i,
             long *restrict l_j, long *restrict l_k, long *restrict l_l,
             long *restrict l_m) {
    for (long idx = 0; idx < N; ++idx) {
        /* Note this 'char' addition that creates a mixed type situation */
        c_c[idx] = c_a[idx] + c_b[idx];

        /* The four 64-bit pairwise sums, each also stored to its own array. */
        const long sum_ab = l_a[idx] + l_b[idx];
        const long sum_de = l_d[idx] + l_e[idx];
        const long sum_gh = l_g[idx] + l_h[idx];
        const long sum_kj = l_k[idx] + l_j[idx];

        l_c[idx] = sum_ab;
        l_f[idx] = sum_de;
        l_i[idx] = sum_gh;
        l_l[idx] = sum_kj;

        /* l_m appears on both sides: the old value is counted twice. */
        l_m[idx] += l_m[idx] + sum_ab + sum_de + sum_gh + sum_kj;
    }
}
The example loop has high register pressure due to the many input variables and temporaries required. The compiler realizes there are two datatypes within the loop: an 8-bit 'char' and a 64-bit 'long'. Without fractional LMUL, the compiler would be forced to use LMUL=1 for the 8-bit computation and LMUL=8 for the 64-bit computation(s), to have an equal number of elements in all computations within the same loop iteration. Under LMUL=8, only 4 vector register groups are available to the register allocator. Given the large number of 64-bit variables and temporaries required in this loop, the compiler ends up generating a lot of spill code. The code below demonstrates this effect:
# Compiler output for the loop body using LMUL=1 for the 8-bit add and
# LMUL=8 for the 64-bit adds (see the two vsetvli instructions).
# The vs8r.v / vl8r.v instructions are whole-register-group spills and
# reloads; their addresses are themselves loaded from s0-relative slots.
# This listing contains six such spills and six reloads per iteration.
# s6 and s10 are the running offsets added to the 8-bit and 64-bit array
# bases respectively; they advance by s7 and s8 at the bottom of the loop.
.LBB0_4:                              # %vector.body
                                      # =>This Inner Loop Header: Depth=1
    # ---- 8-bit part: load two byte vectors, add, store ----
    add s9, a2, s6
    vsetvli s1, zero, e8,m1,ta,mu
    vle8.v v25, (s9)
    add s1, a3, s6
    vle8.v v26, (s1)
    vadd.vv v25, v26, v25
    add s1, a1, s6
    vse8.v v25, (s1)
    # ---- 64-bit part: only v0, v8, v16, v24 exist as LMUL=8 groups ----
    add s9, a5, s10
    vsetvli s1, zero, e64,m8,ta,mu
    vle64.v v8, (s9)
    add s1, a6, s10
    vle64.v v16, (s1)
    add s1, a7, s10
    vle64.v v24, (s1)
    add s1, s3, s10
    vle64.v v0, (s1)
    sd a0, -112(s0)
    ld a0, -128(s0)
    vs8r.v v0, (a0) # Spill LMUL=8
    add s9, t6, s10
    add s11, t5, s10
    add ra, t2, s10
    add s1, t3, s10
    vle64.v v0, (s9)
    ld s9, -136(s0)
    vs8r.v v0, (s9) # Spill LMUL=8
    vle64.v v0, (s11)
    ld s9, -144(s0)
    vs8r.v v0, (s9) # Spill LMUL=8
    vle64.v v0, (ra)
    ld s9, -160(s0)
    vs8r.v v0, (s9) # Spill LMUL=8
    vle64.v v0, (s1)
    ld s1, -152(s0)
    vs8r.v v0, (s1) # Spill LMUL=8
    vadd.vv v16, v16, v8
    ld s1, -128(s0)
    vl8r.v v8, (s1) # Reload LMUL=8
    vadd.vv v8, v8, v24
    ld s1, -136(s0)
    vl8r.v v24, (s1) # Reload LMUL=8
    ld s1, -144(s0)
    vl8r.v v0, (s1) # Reload LMUL=8
    vadd.vv v24, v0, v24
    ld s1, -128(s0)
    vs8r.v v24, (s1) # Spill LMUL=8
    ld s1, -152(s0)
    vl8r.v v0, (s1) # Reload LMUL=8
    ld s1, -160(s0)
    vl8r.v v24, (s1) # Reload LMUL=8
    vadd.vv v0, v0, v24
    add s1, a4, s10
    vse64.v v16, (s1)
    add s1, s2, s10
    vse64.v v8, (s1)
    vadd.vv v8, v8, v16
    add s1, t4, s10
    ld s9, -128(s0)
    vl8r.v v16, (s9) # Reload LMUL=8
    vse64.v v16, (s1)
    add s9, t0, s10
    vadd.vv v8, v8, v16
    vle64.v v16, (s9)
    add s1, t1, s10
    vse64.v v0, (s1)
    vadd.vv v8, v8, v0
    vsll.vi v16, v16, 1               # Double the value just loaded from (s9)
    vadd.vv v8, v8, v16
    vse64.v v8, (s9)                  # Store back to the address it was loaded from
    # ---- advance both offsets and loop until s6 reaches s4 ----
    add s6, s6, s7
    add s10, s10, s8
    bne s6, s4, .LBB0_4
If, instead of using LMUL=1 for the 8-bit computation, the compiler is allowed to use a fractional LMUL=1/2, then the 64-bit computations can be performed using LMUL=4 (note that the same ratio of 64-bit elements and 8-bit elements is preserved as in the previous example). Now the compiler has 8 vector register groups available for register allocation, resulting in no spill code, as shown in the loop below:
# Compiler output for the same loop body using LMUL=1/2 for the 8-bit add
# and LMUL=4 for the 64-bit adds (see the two vsetvli instructions).
# The 64-bit part works in the register groups v8, v12, v16, v20 and v28,
# and the listing contains no vs*r.v / vl*r.v spill or reload instructions.
# s6 and s10 are the running offsets added to the 8-bit and 64-bit array
# bases respectively; they advance by s7 and s8 at the bottom of the loop.
.LBB0_4:                              # %vector.body
                                      # =>This Inner Loop Header: Depth=1
    # ---- 8-bit part: load two byte vectors, add, store ----
    add s9, a2, s6
    vsetvli s1, zero, e8,mf2,ta,mu // LMUL=1/2 !
    vle8.v v25, (s9)
    add s1, a3, s6
    vle8.v v26, (s1)
    vadd.vv v25, v26, v25
    add s1, a1, s6
    vse8.v v25, (s1)
    # ---- 64-bit part ----
    add s9, a5, s10
    vsetvli s1, zero, e64,m4,ta,mu // LMUL=4
    vle64.v v28, (s9)
    add s1, a6, s10
    vle64.v v8, (s1)
    vadd.vv v28, v8, v28
    add s1, a7, s10
    vle64.v v8, (s1)
    add s1, s3, s10
    vle64.v v12, (s1)
    add s1, t6, s10
    vle64.v v16, (s1)
    add s1, t5, s10
    vle64.v v20, (s1)
    add s1, a4, s10
    vse64.v v28, (s1)
    vadd.vv v8, v12, v8
    vadd.vv v12, v20, v16
    add s1, t2, s10
    vle64.v v16, (s1)
    add s1, t3, s10
    vle64.v v20, (s1)
    add s1, s2, s10
    vse64.v v8, (s1)
    add s9, t4, s10
    vadd.vv v16, v20, v16
    add s11, t0, s10
    vle64.v v20, (s11)
    vse64.v v12, (s9)
    add s1, t1, s10
    vse64.v v16, (s1)
    vsll.vi v20, v20, 1               # Double the value just loaded from (s11)
    vadd.vv v28, v8, v28
    vadd.vv v28, v28, v12
    vadd.vv v28, v28, v16
    vadd.vv v28, v28, v20
    vse64.v v28, (s11)                # Store back to the address it was loaded from
    # ---- advance both offsets and loop until s6 reaches s4 ----
    add s6, s6, s7
    add s10, s10, s8
    bne s6, s4, .LBB0_4