for (i = 0, j = 0; i < (128 / TBITS); i++, j += 2) { \
int##TBITS##_t a = (int##BITS##_t)s390_vec_read_element##BITS(v2, j); \
int##TBITS##_t b = (int##BITS##_t)s390_vec_read_element##BITS(v3, j); \
- int##TBITS##_t c = (int##BITS##_t)s390_vec_read_element##BITS(v4, j); \
+ int##TBITS##_t c = s390_vec_read_element##TBITS(v4, i); \
\
s390_vec_write_element##TBITS(v1, i, a * b + c); \
} \
for (i = 0, j = 0; i < (128 / TBITS); i++, j += 2) { \
uint##TBITS##_t a = s390_vec_read_element##BITS(v2, j); \
uint##TBITS##_t b = s390_vec_read_element##BITS(v3, j); \
- uint##TBITS##_t c = s390_vec_read_element##BITS(v4, j); \
+ uint##TBITS##_t c = s390_vec_read_element##TBITS(v4, i); \
\
s390_vec_write_element##TBITS(v1, i, a * b + c); \
} \
for (i = 0, j = 1; i < (128 / TBITS); i++, j += 2) { \
int##TBITS##_t a = (int##BITS##_t)s390_vec_read_element##BITS(v2, j); \
int##TBITS##_t b = (int##BITS##_t)s390_vec_read_element##BITS(v3, j); \
- int##TBITS##_t c = (int##BITS##_t)s390_vec_read_element##BITS(v4, j); \
+ int##TBITS##_t c = s390_vec_read_element##TBITS(v4, i); \
\
s390_vec_write_element##TBITS(v1, i, a * b + c); \
} \
for (i = 0, j = 1; i < (128 / TBITS); i++, j += 2) { \
uint##TBITS##_t a = s390_vec_read_element##BITS(v2, j); \
uint##TBITS##_t b = s390_vec_read_element##BITS(v3, j); \
- uint##TBITS##_t c = s390_vec_read_element##BITS(v4, j); \
+ uint##TBITS##_t c = s390_vec_read_element##TBITS(v4, i); \
\
s390_vec_write_element##TBITS(v1, i, a * b + c); \
} \