内联指令 int _abs (int src); int _labs (__int40_t src); 汇编指令 ABS 简要描述 返回src的绝对值 图示 int _add2 (int src1, int src2); ADD2 把src1的高、低16位和src2的高、低16位分别相加,放入结果的高、低16位 ushort &_amem2 (void *ptr); LDHU STHU 从内存中加载一个halfword到dst里,必须2byte对齐(读或存) const ushort &_amem2_const (const void *ptr); unsigned &_amem4 (void *ptr); LDHU LDW STW 必须2byte对齐(读) 必须4byte对齐(读或存) 必须4byte对齐(读) const unsigned &_amem4_const (const void *ptr); LDW double &_amemd8 (void *ptr); LDW/LDW STW/STW const double &_amemd8_const (const void *ptr); unsigned _clr (unsigned src2, unsigned csta,unsigned cstb); LDDW CLR 必须8byte对齐(读或存) 必须8byte对齐(读) csta和cstb指定了需要清0的首位和末位 unsigned _clrr (unsigned src2, int src1); CLR __int40_t _dtol (double src); 将src2中指定位清0,清0的首位和末位由src1的低10位指定 将一个double寄存器重新解释成一个__int40_t long long _dtoll (double src); 将一个double寄存器重新解释成一个long long int _ext (int src2, unsigned csta, unsigned cstb); EXT 从src2里提取csta和cstb指定的区域且符号扩展到32位。提取出的区域先符号左移再右移。 int _extr (int src2, int src1); EXT 同上,区别:左右移的位数由src1的低10位指定 同上上,区别最后是0扩展到32位。 unsigned _extu (unsigned src2, unsigned csta , unsigned cstb); EXTU unsigned _extur (unsigned src2, int src1); EXTU 同上,区别:左右移的位数由src1的低10位指定 例: _ftoi (1.0) == 1065353216U unsigned _ftoi (float src); 将float的比特位解释成unsigned unsigned _hi (double src); unsigned _hill (long long src); 返回double寄存器的高位(奇数位) 返回longlong寄存器的高位(奇数位) 创建一个新的double寄存器来解释2个unsigned的值,其中src2是高(奇数)寄存器,src1是低(偶数)寄存器 将unsigned中的比特位解释成float 例: _itof (0x3f800000) = 1.0 创建一个新的longlong寄存器来解释2个unsigned的值,其中src2是高(奇数)寄存器,src1是低(偶数)寄存器 double _itod (unsigned src2, unsigned src1); float _itof (unsigned src); long long _itoll (unsigned src2, unsigned src1); unsigned _lmbd (unsigned src1, unsigned src2); LMBD 搜索src2里面的1或0,1或0是由src1的LSB决定的,返回比特位变化的位数 unsigned _lo (double src); 返回double寄存器的低(偶数)寄存器 返回longlong寄存器的低(偶数)寄存器 unsigned _loll (long long src); double _ltod (__int40_t src); 把一个__int40_t寄存器解释成一个double寄存器 把一个longlong寄存器解释成一个double寄存器 Src1和src2相乘,操作数默认为有符号的 double _lltod (long long src); int _mpy (int src1, int src2); MPY
int _mpyus (unsigned src1, int src2); MPYUS int _mpysu (int src1, unsigned src2); unsigned _mpyu (unsigned src1, unsigned src2); MPYSU MPYU 无符号src1和有符号src2相乘,助记符中的S用来标明哪个操作数是有符号的(当有符号和无符号操作数同时使用时) 同上 同上上上,默认为无符号 同上,区别见图示 同上,区别见图示 int _mpyh (int src1, int src2); MPYH int _mpyhus (unsigned src1, int src2); int _mpyhsu (int src1, unsigned src2); unsigned _mpyhu (unsigned src1, unsigned src2); int _mpyhl (int src1, int src2); MPYHUS MPYHSU MPYHU MPYHL int _mpyhuls (unsigned src1, int src2); MPYHULS int _mpyhslu (int src1, unsigned src2); unsigned _mpyhlu (unsigned src1, unsigned src2); int _mpylh (int src1, int src2); int _mpyluhs (unsigned src1, int src2); int _mpylshu (int src1, unsigned src2); unsigned _mpylhu (unsigned src1, unsigned src2); void _nassert (int src); MPYHSLU MPYHLU MPYLH MPYLUHS MPYLSHU MPYLHU 不生成代码,告诉优化器一些事情 返回src2的冗余的符号比特位的个数,具体见图示 NORM unsigned _norm (int src); unsigned _lnorm (__int40_t src); int _sadd (int src1, int src2); long _lsadd (int src1, __int40_t src2); SADD 将src1和src2相加,且饱和其结果 int _sat (__int40_t src2); SAT 将一个40比特的long转换为一个32比特的有符号int,如有需要,对结果进行饱和 unsigned _set (unsigned src2, unsigned csta , unsigned cstb); SET 将src2中指定的区域置为1,指定的区域由csta和cstb指定 unsigned _setr (unsigned src2, int src1); int _smpy (int src1, int src2); SET SMPY 把src1的低16位和src2的低16位相乘 int _smpyh (int src1, int src2); SMPYH 高16位 int _smpyhl (int src1, int src2); SMPYHL int _smpylh (int src1, int src2); SMPYLH int _sshl (int src2, unsigned src1); SSHL 以src1操作数将src2左移,并且将结果饱和在32位 从src1中减去src2,并饱和结果(src1-src2) int _ssub (int src1, int src2); __int40_t _lssub (int src1, __int40_t src2); SSUB unsigned _subc (unsigned src1, unsigned src2); SUBC 有条件的减和左移(常用于除法) 把src1的高低16位分别减去src2的高低16位。任何低16位的借位不会影响高16位。 int _sub2 (int src1, int src2); SUB2 int _abs2 (int src); ABS2 计算16位的绝对值 int _add4 (int src1, int src2); ADD4 把src1和src2的4对8位数相加。不会进行饱和,进位不会影响其他的8位数 long long &_amem8 (void *ptr); LDDW STDW 加载和存储8bytes,指针必须8byte对齐 const long long &_amem8_const (const void *ptr); LDDW __float2_t 
&_amem8_f2(void * ptr); LDDW STDW 加载8bytes,指针必须8byte对齐 加载和存储8bytes,指针必须8byte对齐,必须包含c6x.h 加载8bytes,指针必 const __float2_t &_amem8_f2_const(void * ptr); LDDW double &_amemd8 (void *ptr); LDDW STDW 须8byte对齐,必须包含c6x.h 计算每对有符号16位值的平均值 const double &_amemd8_const (const void *ptr); int _avg2 (int src1, int src2); LDDW AVG2 unsigned _avgu4 (unsigned, unsigned); AVGU4 计算每对无符号8位数的平均值 unsigned _bitc4 (unsigned src); BITC4 统计每个8位的比特位是1的个数,写入结果对应位置 unsigned _bitr (unsigned src); BITR 翻转比特位的顺序 int _cmpeq2 (int src1, int src2); CMPEQ2 比较每16位的值是否相等,结果放入dst的最低2位 int _cmpeq4 (int src1, int src2); CMPEQ4 比较每8位的值是否相等,结果放入dst的最低4位,相等置1,否则为0 int _cmpgt2 (int src1, int src2); CMPGT2 每16位有符号比较,src1>src2,置为1;否则置为0。结果放入dst的最低2位 unsigned _cmpgtu4 (unsigned src1, unsigned src2); CMPGTU4 每8位无符号比较,src1>src2,置为1;否则置为0。结果放入dst的最低4位 unsigned _deal (unsigned src ); DEAL 将src中的比特位的奇数位和偶数位抽出来进行重组,偶数位放在低的16位,奇数位放在高的16位 int _dotp2 (int src1, int src2); __int40_t _ldotp2 (int src1, int src2); DOTP2 DOTP2 将src1中的和src2中的16位有符号对进行点积,结果被写成有符号32位int或者符号扩展为64位 int _dotpn2 (int src1, int src2); DOTPN2 将src1和src2中的16位有符号数进行点积相减 int _dotpnrsu2 (int src1, unsigned src2); DOTPNRSU2 Src1和src2的高16位的点积减去低16位的点积。Src1中的数被当做有符号,src2中的数被当做无符号,再加上2^15,结果再符号右移16位 Src1和src2的高16位的点积加上低16位的点积。Src1中的数被当做有符号,src2中的数被当做无符号,再加上2^15,结果再符号右移16位 int _dotprsu2 (int src1, unsigned src2); DOTPRSU2 int _dotpsu4 (int src1, unsigned src2); DOTPSU4 将src1和src2的每8位进行相乘再求和,src1的每8位数被当做有符号,src2的每8位数被当做无符号 unsigned _dotpu4 (unsigned src1, unsigned src2); DOTPU4 都被当做无符号的 int _gmpy4 (int src1, int src2); GMPY4 将src1和src2的4个无符号进行伽罗瓦域的乘法 int _max2 (int src1, int src2); MAX2 将src1和src2的2个有符号16位整数比较,取较大值 int _min2 (int src1, int src2); MIN2 将src1和src2的2个有符号16位整数比较,取较小值 unsigned _maxu4 (unsigned src1, unsigned src2); MAXU4 将src1和src2的4个无符号8位整数比较,取较大值 unsigned _minu4 (unsigned src1, unsigned src2); MINU4 将src1和src2的4个无符号8位整数比较,取较小值 ushort &_mem2 (void * ptr); LDB/LDB STB/STB const ushort &_mem2_const (const void * ptr); LDB/LDB 加载和存储2byte,不需要对齐 
加载2byte,不需要对 齐 加载和存储4byte,不需要对齐 unsigned &_mem4 (void * ptr); LDNW STNW const unsigned &_mem4_const (const void * ptr); LDNW long long &_mem8 (void * ptr); LDNDW STNDW 加载4byte,不需要对 齐 加载和存储8byte,不 需要对齐 加载8byte,不需要对 齐 加载和存储8byte,不 需要对齐 加载8byte,不需要对 齐 const long long &_mem8_const (const void * ptr); LDNDW double &_memd8 (void * ptr); LDNDW STNDW const double &_memd8_const (const void * ptr); LDNDW long long _mpy2ll (int src1, int src2); MPY2 将src1和src2中的2个有符号16位分别相乘,将2个32位的结果写入longlong中 long long _mpyhill (int src1, int src2); MPYHI 将src1中高16位作为1个有符号16位乘以src2的有符号32位,结果写入longlong的低48位 将src1中低16位作为1个有符号16位乘以src2的有符号32位,结果写入longlong的低48位 将src1的高16位作为一个16位有符号乘以src2的有符号32位。乘积利用round模式通过加2^14转成32位,最后再右移15位 long long _mpylill (int src1, int src2); MPYLI int _mpyhir (int src1, int src2); MPYHIR int _mpylir (int src1, int src2); MPYLIR 将src1的低16位作为一个16位有符号乘以src2的有符号32位。乘积利用round模式通过加2^14转成32位,最后再右移15位 将src1的4个8位有符号乘src2的4个8位无符号,得到4个16位有符号,组成一个64位 long long _mpysu4ll (int src1, unsigned src2); MPYSU4 long long _mpyu4ll (unsigned src1, unsigned src2); MPYU4 将src1和src2的4个无符号8位相乘,得到4个无符号16位组成一个64位的数 int _mvd (int src2 ); MVD 将src2的数据移入返 unsigned _pack2 (unsigned src1, unsigned src2); PACK2 回值中,利用了乘法流水线(延迟) unsigned _packh2 (unsigned src1, unsigned src2); PACKH2 unsigned _packh4 (unsigned src1, unsigned src2); PACKH4 unsigned _packl4 (unsigned src1, unsigned src2); PACKL4 unsigned _packhl2 (unsigned src1, unsigned src2); PACKHL2 unsigned _packlh2 (unsigned src1, unsigned src2); PACKLH2 unsigned _rotl (unsigned src1, unsigned src2); ROTL 按照src1的最低5位的数去左移src2的32位,src1中剩下的高的5-31位被忽略 int _sadd2 (int src1, int src2); SADD2 将src1和src2中的2个16位有符号数相加,生成2个16有符号数并且是饱和过的。 int _saddus2 (unsigned src1, int src2); SADDUS2 将src1中的2个无符号16位数和src中的2个16位有符号数相加,得到2个无符号16位数 unsigned _saddu4 (unsigned src1, unsigned src2); SADDU4 将src1和src2中的4个无符号8位数相加 unsigned _shfl (unsigned src2); SHFL 将src2的高16和低16位进行交织 unsigned _shlmb (unsigned src1, unsigned src2); SHLMB 将src2左移1byte,然后将src1的最高位充入src2左移后多出来的位置 
unsigned _shrmb (unsigned src1, unsigned src2); SHRMB 将src2右移1byte,然后将src1的最低字节充入src2右移后多出来的位置 int _shr2 (int src1, unsigned src2); SHR2 将src2的2个16位有符号数分别右移,右移的位数由src1的低5位决定,多出的位置由符号位扩展 unsigned _shru2 (unsigned src1, unsigned src2); SHRU2 将src2的2个16位无符号数分别右移,右移的位数由src1的低5位决定,多出的位置由0扩展 long long _smpy2ll (int src1, int src2); SMPY2 将src1和src2中的2个有符号16位数相乘,然后左移1位,再进行饱和。 int _spack2 (int src1, int src2); SPACK2 将src1和src2中的1个有符号32位数进行饱和到有符号16位,然后把src1的饱和结果放入dst的高16位,src2的饱和结果放入dst的低16位 将src1和src2中的4个有符号16位数饱和成无符号8位数, unsigned _spacku4 (int src1 , int src2); SPACKU4 int _sshvl (int src2, int src1); SSHVL 将src2中的有符号32位数左移或右移,移位的数量由src1指定的比特数确定。 src1在[-31,31]之间, 如果src1为正,src2则左移;如果src1为负,src2右移|src1|且符号位扩展 int _sshvr (int src2, int src1); SSHVR 将src2中的有符号32位数左移或右移,移位的数量由src1指定的比特数确定。 src1在[-31,31]之间,如果src1为正,src2则右移且是符号扩展;如果src1为负,src2左移|src1| int _sub4 (int src1, int src2); SUB4 将src1和src2中的4个8位数相减,不进行饱和 int _subabs4 (int src1, int src2); SUBABS4 将src1和src2中的4个无符号8位相减求绝对值 unsigned _swap4 (unsigned src); SWAP4 将src的4个8位无符号数按图示换位置 unsigned _unpkhu4 (unsigned src); UNPKHU4 将src的高2个无符号8位数0扩展为2个无符号16位数 unsigned _unpklu4 (unsigned src); UNPKLU4 将src的低2个无符号8位数0扩展为2个无符号16位数 unsigned _xpnd2 (unsigned src); XPND2 按src的最低2位进行扩展,bit1扩展高16位,bit0扩展低16位 unsigned _xpnd4 (unsigned src); XPND4 按src的最低4位进行扩展 long long _addsub (int src1, int src2); ADDSUB 并行做2步: 1、src2+src1->dst_o 2、src1-src2->dst_e long long _addsub2 (int src1, int src2); ADDSUB2 16位有符号 ADD2:src2的高、低16位+src1的高、低16位->dst_o SUB2: src1的高、低16位-src2的高、低16位->dst_e 有符号16位 Src1和src2的高16位的点积-src1和src2的低16位点积->dst_o 饱和(src1和src2的高16位的点积+src1和src2的低16位点积)->dst_e long long _cmpy (unsigned src1, unsigned src2); CMPY unsigned _cmpyr (unsigned src1, unsigned src2); CMPYR 没有饱和 unsigned _cmpyr1 (unsigned src1, unsigned src2 ); CMPYR1 long long _ddotp4 (unsigned src1, unsigned src2); DDOTP4 long long _ddotph2 (long long src1, unsigned src2); DDOTPH2 long long _ddotpl2 (long long src1, unsigned src2); DDOTPL2 unsigned _ddotph2r (long long src1, unsigned src2); DDOTPH2R unsigned _ddotpl2r 
(long long src1, unsigned src2); DDOTPL2R long long _dmv (int src1, int src2); DMV 一次性将两个寄存器移入一个寄存器 long long _dpack2 (unsigned src1, unsigned src2); DPACK2 long long _dpackx2 (unsigned src1, unsigned src2); DPACKX2 __float2_t _fdmv_f2(float src1, float src2); unsigned _gmpy (unsigned src1, unsigned src2); long long _mpy2ir (int src1, int src2); DMV GMPY MPY2IR 伽罗瓦域上的乘法 进行16位乘32位。 将src1的高16位和低16位当做有符号16位;将src2的值当做有符号32位。 乘积通过加上2^14round到32位,然后结果右移15位。 2个结果的低32位写入dst_o:dst_e int _mpy32 (int src1, int src2); MPY32 进行32位乘32位。都是有符号的,64位结果中的低32位写入dst 32位有符号数×32位有符号数,有符号的64位结果被写入dst src1有符号32位×src2无符号32位=dst有符号64位 src1无符号32位×src2有符号32位=dst有符号64位 src1无符号32位×src2无符号32位=dst无符号64位 long long _mpy32ll (int src1, int src2); MPY32 long long _mpy32su (int src1, int src2); MPY32SU long long _mpy32us (unsigned src1, int src2); MPY32US long long _mpy32u (unsigned src1, unsigned src2); MPY32U int _rpack2 (int src1, int src2); RPACK2 long long _saddsub (unsigned src1, unsigned src2); SADDSUB 并行进行: 1、饱和(src1+src2)->dst_o 2、饱和(src1-src2)->dst_e long long _saddsub2 (unsigned src1, unsigned src2); long long _shfl3 (unsigned src1, unsigned src2); SHFL3 SADDSUB2 并行进行SADD2和SSUB2指令 如图,生成一个longlong int _smpy32 (int src1, int src2); SMPY32 32位有符号×32位有符号,64位的结果左移1位然后饱和,然后将之后的结果的高32位写入dst Src1中的2个16位有符号-src2中的2个有符号16位,结果进行饱和 int _ssub2 (unsigned src1, unsigned src2); SSUB2 unsigned _xormpy (unsigned src1, unsigned src2); int _dpint (double src); XORMPY DPINT 伽罗瓦域乘法 将double转成int(round) 将一个__float2_t解释成一个__int40 将一个__float2_t解释成一个longlong 将src的绝对值放入dst。 __int40_t _f2tol(__float2_t src); ABSDP long long _f2toll(__float2_t src); double _fabs (double src); float _fabsf (float src); ABSSP __float2_t _lltof2(long long src); LDNDW STNDW 将一个longlong解释成一个__float2_t __float2_t _ltof2(__int40_t src); 将一个__int40解释成一个__float2_t 从内存里加载一个64位值 Src1×src2->dst __float2_t &_mem8_f2(void * ptr); const __float2_t &_mem8_f2_const(void * ptr); LDNDW STNDW long long _mpyidll (int src1, int src2); MPYID double _mpysp2dp 
(float src1, float src2); double _mpyspdp (float src1, double src2); double _rcpdp (double src); MPYSP2DP MPYSPDP RCPDP Src1×src2->dst Src1×src2->dst float _rcpsp (float src); RCPSP 64位double倒数近似值放入dst 32位float的倒数近似值 64位double的平方根倒数近似值 32位float的平方根倒数近似值 Float转为int 2个double相加 2个float相加 位与 与后取反 double _rsqrdp (double src); RSQRDP float _rsqrsp (float src); RSQRSP int _spint (float); SPINT ADDDP ADDSP AND ANDN __x128_t _ccmatmpy (long long src1, __x128_t src2); MPYSP OR SUBDP SUBSP XOR CCMATMPY 2个float相乘 位或 2个double相减 2个float相减 异或 long long _ccmatmpyr1 (long long src1, __x128_t src2); CCMATMPYR1 long long _ccmpy32r1 (long long src1, long long src2); CCMPY32R1 __x128_t _cmatmpy (long long src1, __x128_t src2); CMATMPY long long _cmatmpyr1 (long long src1, __x128_t src2); CMATMPYR1 long long _cmpy32r1 (long long src1, long long src2); __x128_t _cmpysp (__float2_t src1, __float2_t src2); double _complex_conjugate_mpysp (double src1, double src2); double _complex_mpysp (double src1, double src2); int _crot90 (int src); int _crot270 (int src); CMPYSP DSUBSP CMPYSP DADDSP CROT90 CROT270 CMPYSP CMPY32R1 复数的90度旋转 复数的270度旋转 long long _dadd (long long src1, long long src2); DADD Src1的2个32位有符号数+src2的2个32位有符号数 long long _dadd2 (long long src1, long long src2); DADD2 4路有符号16位相加 __float2_t _daddsp (__float2_t src1, __float2_t src2); DADDSP long long _dadd_c (scst5 immediate src1, long long src2); DADD 2路float加法 long long _dapys2 (long long src1, long long src2); long long _davg2 (long long src1, long long src2); DAPYS2 DAVG2 有符号16位
long long _davgnr2 (long long src1, long long src2); DAVGNR2 对有符号16位数求平均,不进行舍入(无round模式)
long long _davgnru4 (long long src1, long long src2); DAVGNRU4 无符号8位,无round模式 long long _davgu4 (long long src1, long long src2); DAVGU4 无符号8位 long long _dccmpyr1 (long long src1, long long src2); unsigned _dcmpeq2 (long long src1, long long src2); DCCMPYR1 16位比较,相等返回1,不等返回0 DCMPEQ2 unsigned _dcmpeq4 (long long src1, long long src2); DCMPEQ4 8位比较,相等返回1,不等返回0 unsigned _dcmpgt2 (long long src1, long long src2); DCMPGT2 16位比较,src1>src2返回1,否则返回0 unsigned _dcmpgtu4 (long long src1, long long src2); DCMPGTU4 8位比较,src1>src2返回1,否则返回0 __x128_t _dccmpy (long long src1, long long src2); __x128_t _dcmpy (long long src1, long long src2); long long _dcmpyr1 (long long src1, long long src2); long long _dcrot90 (long long src); long long _dcrot270 (long long src); DCCMPY DCMPY DCMPYR1 DCROT90 DCROT270 long long _ddotp4h (__x128_t src1, __x128_t src2 ); DDOTP4H 执行2个dotp4h,都是有符号的 long long _ddotpsu4h (__x128_t src1, __x128_t src2 ); DDOTPSU4H 执行2个dotpsu4h,一个有符号,一个无符号 __float2_t _dinthsp (int src); DINTHSP Src中的16位有符号数转成单精度浮点放入dst_e和dst_o中 Src中的16位无符号数转成单精度浮点放入dst_e和dst_o中 Src中的有符号32位转成单精度浮点,放入dst_e和dst_o中 __float2_t _dinthspu (unsigned src); DINTHSPU __float2_t _dintsp(long long src); DINTSP __float2_t _dintspu(long long src); DINTSPU Src中的无符号32位转成单精度浮点,放入dst_e和dst_o中 对src1和src2中的16位有符号数比大小,将大的放入dst中 long long _dmax2 (long long src1, long long src2); DMAX2 long long _dmaxu4 (long long src1, long long src2); DMAXU4 对src1和src2中的8位无符号数比大小,将大的放入dst中 对src1和src2中的16位有符号数比大小,将小的放入dst中 对src1和src2中的8位无符号数比大小,将小的放入dst中 将src1和src2中的16位有符号数相乘,得到32位有符号数放入128位寄存器中 long long _dmin2 (long long src1, long long src2); DMIN2 long long _dminu4 (long long src1, long long src2); DMINU4 __x128_t _dmpy2 (long long src1, long long src2); DMPY2 __float2_t _dmpysp (__float2_t src1, __float2_t src2); __x128_t _dmpysu4 (long long src1, long long src2); DMPYSP DMPYSU4 将src1中的8位有符号数乘以src2中的无符号8位,得到有符号16位 16位无符号数相乘,得到32位数放入128位寄存器中 __x128_t _dmpyu2 (long long src1, long long src2); DMPYU2 __x128_t _dmpyu4 (long 
long src1, long long src2); DMPYU4 8位无符号数相乘,得到无符号16位结果 long long _dmvd (long long src1, unsigned src2 ); DMVD 将2个寄存器移入一个寄存器中。依次进行2次移动,当处理很多的double word时很有用。减轻寄存器压力 进行两个系列的16位值的点积 返回值不同 Src1中被当做有符号16位,src2被当做无符号16位,得到32位结果 Src1中被当做有符号16位,src2被当做无符号16位,得到64位结果 int _dotp4h (long long src1, long long src2 ); DOTP4H long long _dotp4hll (long long src1, long long src2 ); int _dotpsu4h (long long src1, long long src2); DOTP4H DOTPSU4H long long _dotpsu4hll (long long src1, long long src2); DOTPSU4H long long _dpackh2 (long long src1, long long src2); DPACKH2 long long _dpackh4 (long long src1, long long src2); long long _dpacklh2 (long long src1, long long src2); DPACKH4 DPACKLH2 并行执行2个PACKH4 long long _dpacklh4 (unsigned src1, unsigned src2); DPACKLH4 并行执行PACKH4和PACKL4 long long _dpackl2 (long long src1, long long src2); DPACKL2 long long _dpackl4 (long long src1, long long src2); DPACKL4 并行执行2个PACKL4 long long _dsadd (long long src1, long long src2); DSADD 将src1中的2个有符号32位数加上src2中的2个有符号32位数,结果进行饱和 结果饱和到[-2^15 (2^15)-1] long long _dsadd2 (long long src1, long long src2); DSADD2 long long _dshl (long long src1, unsigned src2); DSHL 将longlong中的2个32位左移,用0补位(有符号32位) long long _dshl2 (long long src1, unsigned src2); DSHL2 将longlong中的4个16位左移,用0补位(有符号16位) long long _dshr (long long src1, unsigned src2); DSHR 右移,符号位补位(有符号32位) long long _dshr2 (long long src1, unsigned src2); DSHR2 右移,符号位补位(有符号16位) long long _dshru (long long src1, unsigned src2); DSHRU 右移,0补位(无符号32位) long long _dshru2 (long long src1, unsigned src2); DSHRU2 右移,0补位(无符号16位) __x128_t _dsmpy2 (long long src1, long long src2); DSMPY2 见图示 long long _dspacku4 (long long src1, long long src2); long long _dspint (__float2_t src); DSPACKU4 并行进行2个SPACKU4 将src中的2个单精度数转成2个整型 将src_e和src_o的两个单精度浮点数转成两个有符号的16位整数 DSPINT unsigned _dspinth (__float2_t src); DSPINTH long long _dssub (long long src1, long long src2); DSSUB 将src1中的2个32位有符号数减src2中的2个32位有符号数,得到的结果进行饱和[-2^31 (2^31)-1] long long _dssub2 (long long src1, long long src2); 
DSSUB2 4个16位有符号数相减,结果进行饱和[-2^15 (2^15)-1] long long _dsub (long long src1, long long src2); DSUB 不饱和 long long _dsub2 (long long src1, long long src2); DSUB2 不饱和 __float2_t _dsubsp (__float2_t src1, __float2_t src2); long long _dxpnd2 (unsigned src); DXPND2 DSUBSP 32位单精度数相减 long long _dxpnd4 (unsigned src); DXPND4 __float2_t _fdmvd_f2(float src1, float src2); DMVD 见MVD int _land (int src1, int src2); LAND 逻辑与 int _landn (int src1, int src2); LANDN int _lor (int src1, int src2); LOR 逻辑或 void _mfence(); MFENCE 暂停取指令流水线,直到内存系统的busy标志变低 double _mpysp2dp (float src1, float src2); MPYSP2DP 将2个float相乘得到1个double结果 double _mpyspdp (float src1, double src2); MPYSPDP 1个float×1个double得到1个double 2个无符号16位数×2个无符号16位数得到2个无符号32位数 long long _mpyu2 (unsigned src1, unsigned src2 ); MPYU2 __x128_t _qmpy32 (__x128_t src1, __x128_t src2); QMPY32 4路:32位有符号×32位有符号,结果的低32位放入dst __x128_t _qmpysp (__x128_t src1, __x128_t src2); QMPYSP __x128_t _qsmpy32r1 (__x128_t src1, __x128_t src2); QSMPY32R1 4路:有符号32位×有符号32位,得到32位。和QMPY32的区别是饱和round unsigned _shl2 (unsigned src1, unsigned src2); SHL2 2个有符号16位,左移。Src2的低4位是移动的位数。结果也是当做有符号16位 long long _unpkbu4 (unsigned src); UNPKBU4 将无符号8位扩成无符号16位 long long _unpkh2 (unsigned src); UNPKH2 有符号16位符号扩展 long long _unpkhu2 (unsigned src); UNPKHU2 无符号16位进行0扩展 long long _xorll_c (scst5 immediate src1, long long src2); XOR 按位异或