8 bitli MCU uchun tezroq 16bit ko'paytirish algoritmi

Men quyidagi algoritmni izlayapman, bu ikkita raqamni quyidagi raqamdan yaxshiroq ko'paytirish uchun. Bu borada yaxshi fikringiz bormi? (MCU - AT Tiny 84/85 yoki shunga o'xshash - bu kod ishlaydigan mul/div operatori yo'q)

uint16_t umul16_(uint16_t a, uint16_t b)
{
    uint16_t res=0;

    while (b) {
        if ( (b & 1) )
            res+=a;
        b>>=1;
        a+=a;
    }

    return res;
}

Ushbu algoritm AT-Tiny 85/84 uchun avr-gcc kompilyatoridan foydalanib to'planganda, avr-gcc algoritmining __mulhi3 bilan deyarli bir xil bo'ladi.

avr-gcc algoritmi:

00000106 <__mulhi3>:
 106:   00 24           eor r0, r0
 108:   55 27           eor r21, r21
 10a:   04 c0           rjmp    .+8         ; 0x114 <__mulhi3+0xe>
 10c:   08 0e           add r0, r24
 10e:   59 1f           adc r21, r25
 110:   88 0f           add r24, r24
 112:   99 1f           adc r25, r25
 114:   00 97           sbiw    r24, 0x00   ; 0
 116:   29 f0           breq    .+10        ; 0x122 <__mulhi3+0x1c>
 118:   76 95           lsr r23
 11a:   67 95           ror r22
 11c:   b8 f3           brcs    .-18        ; 0x10c <__mulhi3+0x6>
 11e:   71 05           cpc r23, r1
 120:   b9 f7           brne    .-18        ; 0x110 <__mulhi3+0xa>
 122:   80 2d           mov r24, r0
 124:   95 2f           mov r25, r21
 126:   08 95           ret

umul16_ algoritmi:

00000044 :
  44:   20 e0           ldi r18, 0x00   ; 0
  46:   30 e0           ldi r19, 0x00   ; 0
  48:   61 15           cp  r22, r1
  4a:   71 05           cpc r23, r1
  4c:   49 f0           breq    .+18        ; 0x60 
  4e:   60 ff           sbrs    r22, 0
  50:   02 c0           rjmp    .+4         ; 0x56 
  52:   28 0f           add r18, r24
  54:   39 1f           adc r19, r25
  56:   76 95           lsr r23
  58:   67 95           ror r22
  5a:   88 0f           add r24, r24
  5c:   99 1f           adc r25, r25
  5e:   f4 cf           rjmp    .-24        ; 0x48 
  60:   c9 01           movw    r24, r18
  62:   08 95           ret

Tahrirlash: The instruction set is available bu yerda.
21
Siz ham xuddi shu (to'g'ri) natija hosil qilganligini tasdiqladingizmi?
qo'shib qo'ydi muallif Antonio, manba
Siz ham xuddi shu (to'g'ri) natija hosil qilganligini tasdiqladingizmi?
qo'shib qo'ydi muallif Antonio, manba
Siz ham xuddi shu (to'g'ri) natija hosil qilganligini tasdiqladingizmi?
qo'shib qo'ydi muallif Antonio, manba
a + = a; bilan a << = 1 kodini o'zgartirish o'rniga, ishlab chiqarilgan kodda hech narsa o'zgarmaydi deb o'ylaymanmi?
qo'shib qo'ydi muallif Antonio, manba
a + = a; bilan a << = 1 kodini o'zgartirish o'rniga, ishlab chiqarilgan kodda hech narsa o'zgarmaydi deb o'ylaymanmi?
qo'shib qo'ydi muallif Antonio, manba
a + = a; bilan a << = 1 kodini o'zgartirish o'rniga, ishlab chiqarilgan kodda hech narsa o'zgarmaydi deb o'ylaymanmi?
qo'shib qo'ydi muallif Antonio, manba
@SergioFormiggini Birinchidan, a va b ni solishtirib, ularni a : b sifatida ishlatishingiz kerak bo'lsa, ularni almashtiring. eng kam döngü soni bo'lishi kerak.
qo'shib qo'ydi muallif Antonio, manba
@SergioFormiggini Birinchidan, a va b ni solishtirib, ularni a : b sifatida ishlatishingiz kerak bo'lsa, ularni almashtiring. eng kam döngü soni bo'lishi kerak.
qo'shib qo'ydi muallif Antonio, manba
Nazariy 72 tsikldan tushundim, ammo ... Menimcha, imkonsiz! Shunday qilib, 4.5 usec. Sizning savolingizga 32 yoki 48 ga cheklanganligi sababini tushuntirish uchun qo'shimcha tafsilotlarni kiritishingiz kerak deb o'ylayman. Balki, biz ikkita qo'ng'iroqda bo'linish usulini topishimiz mumkin?
qo'shib qo'ydi muallif Antonio, manba
Nazariy 72 tsikldan tushundim, ammo ... Menimcha, imkonsiz! Shunday qilib, 4.5 usec. Sizning savolingizga 32 yoki 48 ga cheklanganligi sababini tushuntirish uchun qo'shimcha tafsilotlarni kiritishingiz kerak deb o'ylayman. Balki, biz ikkita qo'ng'iroqda bo'linish usulini topishimiz mumkin?
qo'shib qo'ydi muallif Antonio, manba
Nazariy 72 tsikldan tushundim, ammo ... Menimcha, imkonsiz! Shunday qilib, 4.5 usec. Sizning savolingizga 32 yoki 48 ga cheklanganligi sababini tushuntirish uchun qo'shimcha tafsilotlarni kiritishingiz kerak deb o'ylayman. Balki, biz ikkita qo'ng'iroqda bo'linish usulini topishimiz mumkin?
qo'shib qo'ydi muallif Antonio, manba
Men "imkonsiz" deyapmanmi? :) Men 64 tsiklga o'tishim kerak, men sizga xabar beraman.
qo'shib qo'ydi muallif Antonio, manba
Men "imkonsiz" deyapmanmi? :) Men 64 tsiklga o'tishim kerak, men sizga xabar beraman.
qo'shib qo'ydi muallif Antonio, manba
Men "imkonsiz" deyapmanmi? :) Men 64 tsiklga o'tishim kerak, men sizga xabar beraman.
qo'shib qo'ydi muallif Antonio, manba
16x16 ko'paytmasi 32bit natijaga olib kelishi kerak, aks holda ularning yuqori buyurtma bitlari yo'qolishi ehtimoli. I.E. 65535 * 65535 = 4294967295, bu vakili uchun 32 bit talab qiladi
qo'shib qo'ydi muallif user3629249, manba
16x16 ko'paytmasi 32bit natijaga olib kelishi kerak, aks holda ularning yuqori buyurtma bitlari yo'qolishi ehtimoli. I.E. 65535 * 65535 = 4294967295, bu vakili uchun 32 bit talab qiladi
qo'shib qo'ydi muallif user3629249, manba
16x16 ko'paytmasi 32bit natijaga olib kelishi kerak, aks holda ularning yuqori buyurtma bitlari yo'qolishi ehtimoli. I.E. 65535 * 65535 = 4294967295, bu vakili uchun 32 bit talab qiladi
qo'shib qo'ydi muallif user3629249, manba
uint8_t mul (uint8_t, uint8_t) va uint16_t mul_wide (uint8_t, uint8_t) kodlarini yozishingiz va undan so'ng uint16_t ikkita 8-bit so'z.
qo'shib qo'ydi muallif Hurkyl, manba
uint8_t mul (uint8_t, uint8_t) va uint16_t mul_wide (uint8_t, uint8_t) kodlarini yozishingiz va undan so'ng uint16_t ikkita 8-bit so'z.
qo'shib qo'ydi muallif Hurkyl, manba
@Sergio: Ehtimol, eng katta narsa shundaki, sekin-ko'plik algoritmiga ega bo'lganingiz sababli, Karatsuba afzalliklarga ega bo'lishi mumkin. Agar bo'lmasa ham, sizning arxitekturangizning tafsilotlarini bilmayman, lekin 8-bitli qismlar bilan ishlash yanada samaraliroq bo'lishi mumkin.
qo'shib qo'ydi muallif Hurkyl, manba
@Sergio: Ehtimol, eng katta narsa shundaki, sekin-ko'plik algoritmiga ega bo'lganingiz sababli, Karatsuba afzalliklarga ega bo'lishi mumkin. Agar bo'lmasa ham, sizning arxitekturangizning tafsilotlarini bilmayman, lekin 8-bitli qismlar bilan ishlash yanada samaraliroq bo'lishi mumkin.
qo'shib qo'ydi muallif Hurkyl, manba
@Sergio: Ehtimol, eng katta narsa shundaki, sekin-ko'plik algoritmiga ega bo'lganingiz sababli, Karatsuba afzalliklarga ega bo'lishi mumkin. Agar bo'lmasa ham, sizning arxitekturangizning tafsilotlarini bilmayman, lekin 8-bitli qismlar bilan ishlash yanada samaraliroq bo'lishi mumkin.
qo'shib qo'ydi muallif Hurkyl, manba
"^" Belgisi bilan nima demoqchisiz? Men "kuch" degan ma'noni anglatadi deb o'ylayman! How to calculate a pow (x, 2) ni qanday hisoblash mumkin; agar ko'payishim bo'lmasa?
qo'shib qo'ydi muallif Sir Jo Black, manba
"^" Belgisi bilan nima demoqchisiz? Men "kuch" degan ma'noni anglatadi deb o'ylayman! How to calculate a pow (x, 2) ni qanday hisoblash mumkin; agar ko'payishim bo'lmasa?
qo'shib qo'ydi muallif Sir Jo Black, manba
"^" Belgisi bilan nima demoqchisiz? Men "kuch" degan ma'noni anglatadi deb o'ylayman! How to calculate a pow (x, 2) ni qanday hisoblash mumkin; agar ko'payishim bo'lmasa?
qo'shib qo'ydi muallif Sir Jo Black, manba
@Antonio, u ishlaydi! Men buni sinov uchun ishlataman!
qo'shib qo'ydi muallif Sir Jo Black, manba
@Antonio, u ishlaydi! Men buni sinov uchun ishlataman!
qo'shib qo'ydi muallif Sir Jo Black, manba
@Antonio, u ishlaydi! Men buni sinov uchun ishlataman!
qo'shib qo'ydi muallif Sir Jo Black, manba
Men sinab ko'rdim, lekin derleyici foydalanishni afzal ko'rdi + = a !!! :) Biroq, AVR Tiny 85 ustida, qo'shing yoki lsl xuddi shu vaqtni o'z ichiga oladi!
qo'shib qo'ydi muallif Sir Jo Black, manba
Men sinab ko'rdim, lekin derleyici foydalanishni afzal ko'rdi + = a !!! :) Biroq, AVR Tiny 85 ustida, qo'shing yoki lsl xuddi shu vaqtni o'z ichiga oladi!
qo'shib qo'ydi muallif Sir Jo Black, manba
Men sinab ko'rdim, lekin derleyici foydalanishni afzal ko'rdi + = a !!! :) Biroq, AVR Tiny 85 ustida, qo'shing yoki lsl xuddi shu vaqtni o'z ichiga oladi!
qo'shib qo'ydi muallif Sir Jo Black, manba
O'ylaymanki, yaxshi algoritm boshqa yondashuvga ega bo'lardi! Lekin men uni topolmayapman! ... Menga kengaytirilgan tam hisoblash uchun kerak! (Bundan tashqari, aqlni ishlatish kerak ...)
qo'shib qo'ydi muallif Sir Jo Black, manba
O'ylaymanki, yaxshi algoritm boshqa yondashuvga ega bo'lardi! Lekin men uni topolmayapman! ... Menga kengaytirilgan tam hisoblash uchun kerak! (Bundan tashqari, aqlni ishlatish kerak ...)
qo'shib qo'ydi muallif Sir Jo Black, manba
O'ylaymanki, yaxshi algoritm boshqa yondashuvga ega bo'lardi! Lekin men uni topolmayapman! ... Menga kengaytirilgan tam hisoblash uchun kerak! (Bundan tashqari, aqlni ishlatish kerak ...)
qo'shib qo'ydi muallif Sir Jo Black, manba
@ Xurkil. Avantajini tushunmayapman! Undan tashqari, yuk tashishni ham hisobga olishim kerak!
qo'shib qo'ydi muallif Sir Jo Black, manba
@ Xurkil. Avantajini tushunmayapman! Undan tashqari, yuk tashishni ham hisobga olishim kerak!
qo'shib qo'ydi muallif Sir Jo Black, manba
@Antonio, almashtirish yaxshi fikr edi! :)
qo'shib qo'ydi muallif Sir Jo Black, manba
@Antonio, almashtirish yaxshi fikr edi! :)
qo'shib qo'ydi muallif Sir Jo Black, manba
Almashtirish yaxshilanishini algoritm tezligini oshirishi kerakligini tekshirish kerak ... Men biroz o'ylayman, lekin umumiy hisoblash tezligini kamaytirishi mumkin bo'lgan boshqa taqqoslashni joriy qilishi mumkin.
qo'shib qo'ydi muallif Sir Jo Black, manba
Almashtirish yaxshilanishini algoritm tezligini oshirishi kerakligini tekshirish kerak ... Men biroz o'ylayman, lekin umumiy hisoblash tezligini kamaytirishi mumkin bo'lgan boshqa taqqoslashni joriy qilishi mumkin.
qo'shib qo'ydi muallif Sir Jo Black, manba
Almashtirish yaxshilanishini algoritm tezligini oshirishi kerakligini tekshirish kerak ... Men biroz o'ylayman, lekin umumiy hisoblash tezligini kamaytirishi mumkin bo'lgan boshqa taqqoslashni joriy qilishi mumkin.
qo'shib qo'ydi muallif Sir Jo Black, manba
Xurkil! Foydalanadigan MCU 8 bitli so'zlarga ega bo'lsa, u holda algoritm allaqachon 8 bitli registrlar bilan boshqariladi!
qo'shib qo'ydi muallif Sir Jo Black, manba
Xurkil! Foydalanadigan MCU 8 bitli so'zlarga ega bo'lsa, u holda algoritm allaqachon 8 bitli registrlar bilan boshqariladi!
qo'shib qo'ydi muallif Sir Jo Black, manba
Xurkil! Foydalanadigan MCU 8 bitli so'zlarga ega bo'lsa, u holda algoritm allaqachon 8 bitli registrlar bilan boshqariladi!
qo'shib qo'ydi muallif Sir Jo Black, manba
@greybeard! O'ylaymanki, tashqi qurilmalar qulay emas, chunki ularning kirish vaqti! Yuqoridagi algoritm yaxshi deb o'ylayman! :)
qo'shib qo'ydi muallif Sir Jo Black, manba
@greybeard! O'ylaymanki, tashqi qurilmalar qulay emas, chunki ularning kirish vaqti! Yuqoridagi algoritm yaxshi deb o'ylayman! :)
qo'shib qo'ydi muallif Sir Jo Black, manba
@greybeard! O'ylaymanki, tashqi qurilmalar qulay emas, chunki ularning kirish vaqti! Yuqoridagi algoritm yaxshi deb o'ylayman! :)
qo'shib qo'ydi muallif Sir Jo Black, manba
MCU o'zining ichki xotirasiga ega va u foydalanishi mumkin bo'lgan yagona avtobus I2C yoki SPI! Bundan tashqari, ichki EEPROM ham bor, lekin juda kichik va kirish RAMdan ancha past
qo'shib qo'ydi muallif Sir Jo Black, manba
MCU o'zining ichki xotirasiga ega va u foydalanishi mumkin bo'lgan yagona avtobus I2C yoki SPI! Bundan tashqari, ichki EEPROM ham bor, lekin juda kichik va kirish RAMdan ancha past
qo'shib qo'ydi muallif Sir Jo Black, manba
MCU o'zining ichki xotirasiga ega va u foydalanishi mumkin bo'lgan yagona avtobus I2C yoki SPI! Bundan tashqari, ichki EEPROM ham bor, lekin juda kichik va kirish RAMdan ancha past
qo'shib qo'ydi muallif Sir Jo Black, manba
Ushbu MCU-larga ulangan tezkor ketma-ket navlarni ishlatdim! Buni ishlatish usuli axborotni olish uchun ISR (yoki tartib-qoidalar) ni o'z ichiga olgan aloqa protokolidan foydalanishdir.
qo'shib qo'ydi muallif Sir Jo Black, manba
Ushbu MCU-larga ulangan tezkor ketma-ket navlarni ishlatdim! Buni ishlatish usuli axborotni olish uchun ISR (yoki tartib-qoidalar) ni o'z ichiga olgan aloqa protokolidan foydalanishdir.
qo'shib qo'ydi muallif Sir Jo Black, manba
Ushbu MCU-larga ulangan tezkor ketma-ket navlarni ishlatdim! Buni ishlatish usuli axborotni olish uchun ISR (yoki tartib-qoidalar) ni o'z ichiga olgan aloqa protokolidan foydalanishdir.
qo'shib qo'ydi muallif Sir Jo Black, manba
Bilaman, gunohni hisoblash va yaxshi yo'lni topish juda ko'p holatlarda qidirishdan foydalanishdir. Bu birinchi 8086-yillarda yaxshi yo'l edi!
qo'shib qo'ydi muallif Sir Jo Black, manba
Bilaman, gunohni hisoblash va yaxshi yo'lni topish juda ko'p holatlarda qidirishdan foydalanishdir. Bu birinchi 8086-yillarda yaxshi yo'l edi!
qo'shib qo'ydi muallif Sir Jo Black, manba
Bilaman, gunohni hisoblash va yaxshi yo'lni topish juda ko'p holatlarda qidirishdan foydalanishdir. Bu birinchi 8086-yillarda yaxshi yo'l edi!
qo'shib qo'ydi muallif Sir Jo Black, manba
@ user3629249. Bilaman, bu algoritmni osonlik bilan o'zgartirish mumkin. ... Qaytarilishimdan foydalanadigan versiya ham 32 bitni, lekin fikrni ko'paytirish algo-ni yaxshilashdir!
qo'shib qo'ydi muallif Sir Jo Black, manba
@ user3629249. Bilaman, bu algoritmni osonlik bilan o'zgartirish mumkin. ... Qaytarilishimdan foydalanadigan versiya ham 32 bitni, lekin fikrni ko'paytirish algo-ni yaxshilashdir!
qo'shib qo'ydi muallif Sir Jo Black, manba
Juda yaxshi! ... :)
qo'shib qo'ydi muallif Sir Jo Black, manba
Juda yaxshi! ... :)
qo'shib qo'ydi muallif Sir Jo Black, manba
Juda yaxshi! ... :)
qo'shib qo'ydi muallif Sir Jo Black, manba
Sifat uchun qanday farqlar bor? Agar bo'shliq kamroq bo'lsa, ((a + b) ^ 2 - (a-b) ^ 2)/4 ni sinab ko'ring. Operandlar haqida nima ma'lum? 32-bitli natijalar ko'proq mos keladimi?
qo'shib qo'ydi muallif greybeard, manba
Sifat uchun qanday farqlar bor? Agar bo'shliq kamroq bo'lsa, ((a + b) ^ 2 - (a-b) ^ 2)/4 ni sinab ko'ring. Operandlar haqida nima ma'lum? 32-bitli natijalar ko'proq mos keladimi?
qo'shib qo'ydi muallif greybeard, manba
Jadvalni qidirish orqali quvvatning qiymatini aniqlashingiz mumkin (har qanday funktsiya, aslida). To'liq "17 * 16" bitli jadvalni bajarish, tashqi xotiradan foydalanib, 8 * 8-> 16 bitli 8 bitli MCU bilan ishlatiladigan 9 * 16 bitli jadvalga qaraganda qimmatroq ko'rinadi. Shunga qaramay, siz 8 * 8-> 16 ni amalga oshirishingiz mumkin, lekin Karatsuba 16 * 16 uchun 18 bit qisman mahsulotsiz murakkab bo'lishi mumkin.
qo'shib qo'ydi muallif greybeard, manba
Jadvalni qidirish orqali quvvatning qiymatini aniqlashingiz mumkin (har qanday funktsiya, aslida). To'liq "17 * 16" bitli jadvalni bajarish, tashqi xotiradan foydalanib, 8 * 8-> 16 bitli 8 bitli MCU bilan ishlatiladigan 9 * 16 bitli jadvalga qaraganda qimmatroq ko'rinadi. Shunga qaramay, siz 8 * 8-> 16 ni amalga oshirishingiz mumkin, lekin Karatsuba 16 * 16 uchun 18 bit qisman mahsulotsiz murakkab bo'lishi mumkin.
qo'shib qo'ydi muallif greybeard, manba
Jadvalni qidirish orqali quvvatning qiymatini aniqlashingiz mumkin (har qanday funktsiya, aslida). To'liq "17 * 16" bitli jadvalni bajarish, tashqi xotiradan foydalanib, 8 * 8-> 16 bitli 8 bitli MCU bilan ishlatiladigan 9 * 16 bitli jadvalga qaraganda qimmatroq ko'rinadi. Shunga qaramay, siz 8 * 8-> 16 ni amalga oshirishingiz mumkin, lekin Karatsuba 16 * 16 uchun 18 bit qisman mahsulotsiz murakkab bo'lishi mumkin.
qo'shib qo'ydi muallif greybeard, manba
Mexanik qurilmaga nisbatan (ketma-ket) EEPROM/Fleshliga bog'liq.
qo'shib qo'ydi muallif greybeard, manba
Mexanik qurilmaga nisbatan (ketma-ket) EEPROM/Fleshliga bog'liq.
qo'shib qo'ydi muallif greybeard, manba
Mexanik qurilmaga nisbatan (ketma-ket) EEPROM/Fleshliga bog'liq.
qo'shib qo'ydi muallif greybeard, manba
ELEKTRİK TEXNIKASI ni bilasizmi?
qo'shib qo'ydi muallif greybeard, manba
ELEKTRİK TEXNIKASI ni bilasizmi?
qo'shib qo'ydi muallif greybeard, manba
ELEKTRİK TEXNIKASI ni bilasizmi?
qo'shib qo'ydi muallif greybeard, manba
Iltimos, izohda aytib o'tilgan vaqtni cheklashni janimning muammoning echimiga javobini qo'shing.
qo'shib qo'ydi muallif greybeard, manba
Iltimos, izohda aytib o'tilgan vaqtni cheklashni janimning muammoning echimiga javobini qo'shing.
qo'shib qo'ydi muallif greybeard, manba

11 javoblar

Xulosa

  1. a va b (original taklif) almashtirishni ko'rib chiqing
  2. Shartli otlar (yaxshi optimallash emas) oldini olishga urinish
  3. Kirish formulasini qayta tiklash (taxminan 35% daromad)
  4. Ikkilamchi siljishni olib tashlash
  5. Loopni ochish: "maqbul" montaj
  6. Kompilyatorni maqbul yig'ish uchun ishonch hosil qiling.

1. Consider swapping a and b

One improvement would be to first compare a and b, and swap them if a: you should use as b the smaller of the two, so that you have the minimum number of cycles. Note that you can avoid swapping by duplicating the code (if (a then jump to a mirrored code section), AmmoI doubt it's worth.


2. Shartli o'tishlardan qochishga urinib ko'rish (muvaffaqiyatli optimallash emas)

Sinash:

uint16_t umul16_(uint16_t a, uint16_t b)
{
    ///Here swap if necessary
    uint16_t accum=0;

    while (b) {
        accum += ((b&1) * uint16_t(0xffff)) & a; //Hopefully this multiplication is optimized away
        b>>=1;
        a+=a;
    }

    return accum;
}

Serxionning fikr-mulohazasidan kelib chiqqan holda, bu yaxshilanishlarni keltirib chiqarmadi.


3. Kirish formulasini qayta ishlash

Maqsad me'morchiligi asosan 8bit yo'riqnomaga ega ekanligini hisobga olsak, kirish va o'zgaruvchining yuqori va quyi 8 bitini ajratsangiz, quyidagilarni yozishingiz mumkin:

a = a1 * 0xff + a0;
b = b1 * 0xff + b0;

a * b = a1 * b1 * 0xffff + a0 * b1 * 0xff + a1 * b0 * 0xff + a0 * b0

Endi esa, 0xffff sizning registringizdan ga yuborishi sababli, a1 * b1 * 0xffff so'zini tashlay olamiz.

(16bit) a * b = a0 * b1 * 0xff + a1 * b0 * 0xff + a0 * b0

Bundan tashqari, a0 * b1 va a1 * b0 atamasi 0xff tufayli 8bitli ko'paytma sifatida ko'rib chiqilishi mumkin: 256 registrdan chiqarilgan .

So far exciting! ... But, here comes reality striking: a0 * b0 has to be treated as a 16 bit multiplication, as you'll have to keep all resulting bits. a0 will have to be kept on 16 bit to allow shift lefts. This multiplication has half of the iterations of a * b, it is in part 8bit (because of b0) Ammoyou still have to take into account the 2 8bit multiplications mentioned before, and the final result composition. We need further reshaping!

Shuning uchun endi b0 ni yig'aman.

(16bit) a * b = a0 * b1 * 0xff + b0 * (a0 + a1 * 0xff)

Ammo

(a0 + a1 * 0xff) = a

Shunday qilib:

(16bit) a * b = a0 * b1 * 0xff + b0 * a

N = asl nusxada a * b ning aylanishlari bo'lsa, hozirda birinchi atama N/2 aylanishi bilan 8bitli ko'paytma, ikkinchisi N/2 aylanishi bilan 16bit * 8bitlik ko'paytirish. M kodi a * b ga o'zgartirilganda ko'rsatmalarning sonini hisobga olgan holda, 8bit * 8bit iteratsiya yo'riqnomaning yarmiga ega, va taxminan 80% dan 16bit * 8bit (bir o'zgarish buyrug'i b0 b) ga nisbatan. Birgalikda N/2 * M/2 + N/2 * M * 0.8 = N * M * 0.65 murakkabligi bor, shuning uchun <35% original N * M ga nisbatan hurmat ko'rsatiladi. promising so'zlari tovushlar.

Bu kod:

uint16_t umul16_(uint16_t a, uint16_t b)
{
    uint8_t res1 = 0;

    uint8_t a0 = a & 0xff; //This effectively needs to copy the data
    uint8_t b0 = b & 0xff; //This should be optimized away
    uint8_t b1 = b >>8; //This should be optimized away

    //Here a0 and b1 could be swapped (to have b1 < a0)
    while (b1) {///Maximum 8 cycles
        if ( (b1 & 1) )
            res1+=a0;
        b1>>=1;
        a0+=a0;
    }

    uint16_t res = (uint16_t) res1 * 256; //Should be optimized away, it's not even a copy!

    //Here swapping wouldn't make much sense
    while (b0) {///Maximum 8 cycles
        if ( (b0 & 1) )
            res+=a;
        b0>>=1;
        a+=a;
    }

    return res;
}

Bundan tashqari, 2 tsikldagi bo'linish, nazariy jihatdan, bir nechta davrlarni o'tkazish imkoniyatini ikki baravar oshirish kerak: N/2 engil ortiqcha bo'lishi mumkin.

a o'zgaruvchilari uchun keraksiz, keyingi kaymalardan saqlanish uchun kichikroq qo'shimcha yaxshilanish. Kichik yon eslatma: agar b0 yoki b1 nol bo'lsa, u holda qo'shimcha 2 ta ko'rsatma bo'ladi. Ammo , shuningdek, eng qimmat bo'lgan b0 va b1 ning birinchi tekshiruvini saqlab qoladi, chunki u nolinchi bayroq holati uchun shartli o'tish uchun o'tish ishidan tekshirolmaydi. pastadir.

uint16_t umul16_(uint16_t a, uint16_t b)
{
    uint8_t res1 = 0;

    uint8_t a0 = a & 0xff; //This effectively needs to copy the data
    uint8_t b0 = b & 0xff; //This should be optimized away
    uint8_t b1 = b >>8; //This should be optimized away

    //Here a0 and b1 could be swapped (to have b1 < a0)
    if ( (b1 & 1) )
        res1+=a0;
    b1>>=1;
    while (b1) {///Maximum 7 cycles
        a0+=a0;
        if ( (b1 & 1) )
            res1+=a0;
        b1>>=1;
    }

    uint16_t res = (uint16_t) res1 * 256; //Should be optimized away, it's not even a copy!

    //Here swapping wouldn't make much sense
    if ( (b0 & 1) )
        res+=a;
    b0>>=1;
    while (b0) {///Maximum 7 cycles
        a+=a;
        if ( (b0 & 1) )
            res+=a;
        b0>>=1;
    }

    return res;
}

4. Ikkilamchi shovqinni olib tashlash

Rivojlanish uchun bo'sh joy bormi? Ha , a0 da baytlar ikki marta </​​em> kaydırılır. Shunday qilib, ikkita ildizni birlashtirishda foyda bo'lishi kerak. Compilerni, biz xohlagan narsani qilish uchun, ayniqsa natija qaydnomasi bilan bajarish uchun ishontirish uchun biroz qiyin bo'lishi mumkin.

Shunday qilib, biz bir xil aylanish jarayonida b0 va b1 . Dastlabki narsa, loop chiqish holati qanday? Hozircha b0 / b1 yordamida vaziyatni o'chirib qo'yish foydalidir, chunki u taymerdan foydalanishni taqiqlaydi. Bundan tashqari, o'ngga siljishdan so'ng, operatsiya natijasi nolga teng bo'lsa, bayroq belgilanishi mumkin va bu bayroq qo'shimcha baholashsiz shartli o'tishga ruxsat berishi mumkin.

Now the loop exit condition could be the failure of (b0 || b1). However this could require expensive computation. One solution is to compare b0 and b1 and jump to 2 different code sections: if b1 > b0 I test the condition on b1, else I test the condition on b0. I prefer another solution, with 2 loops, the first exit when b0 is zero, the second when b1 is zero. There will be cases in which I will do zero iterations in b1. The point is that in the second loop I know b0 is zero, so I can reduce the number of operations performed.

Keling, exit holatini unutamiz va avvalgi bo'limning 2 ta qismiga qo'shilishga harakat qilaylik.

uint16_t umul16_(uint16_t a, uint16_t b)
{
    uint16_t res = 0;

    uint8_t b0 = b & 0xff; //This should be optimized away
    uint8_t b1 = b >>8; //This should be optimized away

    //Swapping probably doesn't make much sense anymore
    if ( (b1 & 1) )
        res+=(uint16_t)((uint8_t)(a && 0xff))*256;
    //Hopefully the compiler understands it has simply to add the low 8bit register of a to the high 8bit register of res

    if ( (b0 & 1) )
        res+=a;

    b1>>=1;
    b0>>=1;
    while (b0) {///N cycles, maximum 7
        a+=a;
        if ( (b1 & 1) )
            res+=(uint16_t)((uint8_t)(a & 0xff))*256;
        if ( (b0 & 1) )
            res+=a;
        b1>>=1;
        b0>>=1; //I try to put as last the one that will leave the carry flag in the desired state
    }

    uint8_t a0 = a & 0xff; //Again, not a real copy Ammoa register selection

    while (b1) {///P cycles, maximum 7 - N cycles
        a0+=a0;
        if ( (b1 & 1) )
            res+=(uint16_t) a0 * 256;
        b1>>=1;
    }
    return res;
}

Thanks Sergio for providing the assembly generated (-Ofast). At first glance, considering the outrageous amount of mov in the code, it seems the compiler did not interpret as I wanted the hints I gave to him to interpret the registers.

Inputs are: r22,r23 and r24,25.
AVR Instruction Set: Quick reference, Detailed documentation

sbrs //Tests a single bit in a register and skips the next instruction if the bit is set. Skip takes 2 clocks. 
ldi//Load immediate, 1 clock
sbiw//Subtracts immediate to *word*, 2 clocks

    00000010 :
      10:    70 ff           sbrs    r23, 0
      12:    39 c0           rjmp    .+114        ; 0x86 <__SREG__+0x47>
      14:    41 e0           ldi    r20, 0x01    ; 1
      16:    00 97           sbiw    r24, 0x00    ; 0
      18:    c9 f1           breq    .+114        ; 0x8c <__SREG__+0x4d>
      1a:    34 2f           mov    r19, r20
      1c:    20 e0           ldi    r18, 0x00    ; 0
      1e:    60 ff           sbrs    r22, 0
      20:    07 c0           rjmp    .+14         ; 0x30 
      22:    28 0f           add    r18, r24
      24:    39 1f           adc    r19, r25
      26:    04 c0           rjmp    .+8          ; 0x30 
      28:    e4 2f           mov    r30, r20
      2a:    45 2f           mov    r20, r21
      2c:    2e 2f           mov    r18, r30
      2e:    34 2f           mov    r19, r20
      30:    76 95           lsr    r23
      32:    66 95           lsr    r22
      34:    b9 f0           breq    .+46         ; 0x64 <__SREG__+0x25>
      36:    88 0f           add    r24, r24
      38:    99 1f           adc    r25, r25
      3a:    58 2f           mov    r21, r24
      3c:    44 27           eor    r20, r20
      3e:    42 0f           add    r20, r18
      40:    53 1f           adc    r21, r19
      42:    70 ff           sbrs    r23, 0
      44:    02 c0           rjmp    .+4          ; 0x4a <__SREG__+0xb>
      46:    24 2f           mov    r18, r20
      48:    35 2f           mov    r19, r21
      4a:    42 2f           mov    r20, r18
      4c:    53 2f           mov    r21, r19
      4e:    48 0f           add    r20, r24
      50:    59 1f           adc    r21, r25
      52:    60 fd           sbrc    r22, 0
      54:    e9 cf           rjmp    .-46         ; 0x28 
      56:    e2 2f           mov    r30, r18
      58:    43 2f           mov    r20, r19
      5a:    e8 cf           rjmp    .-48         ; 0x2c 
      5c:    95 2f           mov    r25, r21
      5e:    24 2f           mov    r18, r20
      60:    39 2f           mov    r19, r25
      62:    76 95           lsr    r23
      64:    77 23           and    r23, r23
      66:    61 f0           breq    .+24         ; 0x80 <__SREG__+0x41>
      68:    88 0f           add    r24, r24
      6a:    48 2f           mov    r20, r24
      6c:    50 e0           ldi    r21, 0x00    ; 0
      6e:    54 2f           mov    r21, r20
      70:    44 27           eor    r20, r20
      72:    42 0f           add    r20, r18
      74:    53 1f           adc    r21, r19
      76:    70 fd           sbrc    r23, 0
      78:    f1 cf           rjmp    .-30         ; 0x5c <__SREG__+0x1d>
      7a:    42 2f           mov    r20, r18
      7c:    93 2f           mov    r25, r19
      7e:    ef cf           rjmp    .-34         ; 0x5e <__SREG__+0x1f>
      80:    82 2f           mov    r24, r18
      82:    93 2f           mov    r25, r19
      84:    08 95           ret
      86:    20 e0           ldi    r18, 0x00    ; 0
      88:    30 e0           ldi    r19, 0x00    ; 0
      8a:    c9 cf           rjmp    .-110        ; 0x1e 
      8c:    40 e0           ldi    r20, 0x00    ; 0
      8e:    c5 cf           rjmp    .-118        ; 0x1a 

5. Loopni ochish: "maqbul" montaj

Ushbu ma'lumotlarning barchasida, keling, me'moriy cheklovlarni hisobga olgan holda, "maqbul" echim qanday bo'lishini tushunishga harakat qilaylik. "Optimal" so'zi keltirilgan, chunki "maqbul" narsa kirish ma'lumotlariga va optimallashtirishni xohlayotgan narsalarga bog'liqdir. Eng yomon holatlarda davriy sonlar sonini optimallashtirishni xohlaymiz deb taxmin qilamiz. . Agar biz eng yomon vaziyatga boradigan bo'lsak, loop unrolling o'rtacha tanlovdir: 8 tsikl borligini bilamiz va biz tugalladik (agar b0 va b1 nol bo'lsa). Hozircha biz "pastga siljiymiz va biz nol bayroqni tekshiramiz" deb ishlatdik. Ushbu talabni bekor qilsak, boshqa hiylalarni ishlatishimiz mumkin: biz o`zimizni o`zgartiramiz va natijani yangilash kerakmi yoki yo'qligini bilish uchun uskuna bitini tekshiramiz . Buyruqlar to'plamini hisobga olgan holda, "tushuntirish" kodi yordamida ko'rsatmalar quyidagi holga aylanadi.

//Input: a = a1 * 256 + a0, b = b1 * 256 + b0
//Output: r = r1 * 256 + r0

Preliminary:
P0 r0 = 0 (CLR)
P1 r1 = 0 (CLR)

Main block:
0 Shift right b0 (LSR)
1 If carry is not set skip 2 instructions = jump to 4 (BRCC)
2 r0 = r0 + a0 (ADD)
3 r1 = r1 + a1 + carry from prev. (ADC)
4 Shift right b1 (LSR)
5 If carry is not set skip 1 instruction = jump to 7 (BRCC)
6 r1 = r1 + a0 (ADD)
7 a0 = a0 + a0 (ADD)  
8 a1 = a1 + a1 + carry from prev. (ADC)

[Repeat same instructions for another 7 times]

Filialda hech qanday sakrash bo'lmasa, 1 ta ko'rsatma talab etiladi. Boshqa barcha ko'rsatmalar 1 ta sikl. Shunday qilib, b1 holati cycler soniga ta'sir qilmaydi, b0 = 1 bo'lsa, bizda 9 tsikl bor va b0 = 0 bo'lsa, 8 tsikl bor. Initializatsiyani hisoblash, 8 yineleme va a0 va a1 ning oxirgi yangilanishini atash, (b0 = 11111111b), bizda 8 * 9 + 2 - 2 = 72 tsiklli mavjud. Qaysi C ++ dasturini ishlab chiqaruvchi derazani ishlab chiqarishga ishontirishini bilmasdim. Balki:

 void iterate(uint8_t& b0,uint8_t& b1,uint16_t& a, uint16_t& r) {
     const uint8_t temp0 = b0;
     b0 >>=1;
     if (temp0 & 0x01) {//Will this convince him to use the carry flag?
         r += a;
     }
     const uint8_t temp1 = b1;
     b1 >>=1;
     if (temp1 & 0x01) {
         r+=(uint16_t)((uint8_t)(a & 0xff))*256;
     }
     a += a;
 }

 uint16_t umul16_(uint16_t a, uint16_t b) {
     uint16_t r = 0;
     uint8_t b0 = b & 0xff;
     uint8_t b1 = b >>8;

     iterate(b0,b1,a,r);
     iterate(b0,b1,a,r);
     iterate(b0,b1,a,r);
     iterate(b0,b1,a,r);
     iterate(b0,b1,a,r);
     iterate(b0,b1,a,r);
     iterate(b0,b1,a,r);
     iterate(b0,b1,a,r); //Hopefully he understands he doesn't need the last update for variable a
     return r;
 }

Ammo, oldingi natijaga ko'ra, kerakli kodni olish uchun chindan ham montajga o'tish kerak!


Nihoyat, aylanish jarayonini yanada chuqurroq talqin qilish ham mumkin: sbrc/sbrs yo'riqnomalari ma'lum bir bitikni tekshirishga imkon beradi. Shuning uchun biz b0 va b1-ni almashtirishdan qochishimiz mumkin, va har bir tsikldagi boshqa bitni tekshirib ko'rishimiz mumkin. Muammo shundaki, ushbu yo'riqnomalar faqat maxsus o'tish uchun emas, balki keyingi ko'rsatmaga o'tishga imkon beradi. Shunday qilib, "tushuntirish kodi" da shunday bo'ladi:

Main block:
0 Test Nth bit of b0 (SBRS). If set jump to 2 (+ 1cycle) otherwise continue with 1
1 Jump to 4 (RJMP)
2 r0 = r0 + a0 (ADD)
3 r1 = r1 + a1 + carry from prev. (ADC)
4 Test Nth bit of (SBRC). If cleared jump to 6 (+ 1cycle) otherwise continue with 5
5 r1 = r1 + a0 (ADD)
6 a0 = a0 + a0 (ADD)  
7 a1 = a1 + a1 + carry from prev. (ADC)

Ikkinchi o'zgarish 1 ta siklni saqlab qolishga imkon beradi, ikkinchidan ikkinchisida esa aniq ustunlik yo'q. Biroq, menimcha, C ++ kodi derleyici uchun sharhni osonroq bo'lishi mumkin. A0 va a1 ning so'nggi yangilanishlarini ishga tushirish va ishga tushirish 8 tsiklni hisobga olgan holda, endi biz 64 döngü ga egamiz.

C ++ kodi:

 template
 void iterateWithMask(const uint8_t& b0,const uint8_t& b1, uint16_t& a, uint16_t& r) {
     if (b0 & mask)
         r += a;
     if (b1 & mask)
         r+=(uint16_t)((uint8_t)(a & 0xff))*256;
     a += a;
 }

 uint16_t umul16_(uint16_t a, const uint16_t b) {
     uint16_t r = 0;
     const uint8_t b0 = b & 0xff;
     const uint8_t b1 = b >>8;

     iterateWithMask<0x01>(b0,b1,a,r);
     iterateWithMask<0x02>(b0,b1,a,r);
     iterateWithMask<0x04>(b0,b1,a,r);
     iterateWithMask<0x08>(b0,b1,a,r);
     iterateWithMask<0x10>(b0,b1,a,r);
     iterateWithMask<0x20>(b0,b1,a,r);
     iterateWithMask<0x40>(b0,b1,a,r);
     iterateWithMask<0x80>(b0,b1,a,r);

     //Hopefully he understands he doesn't need the last update for a
     return r;
 }

Note that in this implementation the 0x01, 0x02 are not a real value, Ammojust a hint to the compiler to know which bit to test. Therefore, the mask cannot be obtained by shifting right: differently from all other functions seen so far, this has really no equivalent loop version.

Bu katta muammo

r+=(uint16_t)((uint8_t)(a & 0xff))*256;

r pastki registri bilan a pastki registri bilan yuqori reestrning yig'indisi bo'lishi kerak. Istaganim kabi talqin qilinmaydi. Boshqa variant:

r+=(uint16_t) 256 *((uint8_t)(a & 0xff));

6. Kompilyatorni maqbul yig'ish uchun ishonch hosil qilish

We can also keep a constant, and shift instead the result r. In this case we process b starting from the most significant bit. The complexity is equivalent, Ammoit might be easier for the compiler to digest. Also, this time we have to be careful to write explicitly the last loop, which must not do a further shift right for r.

 template
 void inverseIterateWithMask(const uint8_t& b0,const uint8_t& b1,const uint16_t& a, const uint8_t& a0, uint16_t& r) {
     if (b0 & mask)
         r += a;
     if (b1 & mask)
         r+=(uint16_t)256*a0; //Hopefully easier to understand for the compiler?
     r += r;
 }

 uint16_t umul16_(const uint16_t a, const uint16_t b) {
     uint16_t r = 0;
     const uint8_t b0 = b & 0xff;
     const uint8_t b1 = b >>8;
     const uint8_t a0 = a & 0xff;

     inverseIterateWithMask<0x80>(b0,b1,a,r);
     inverseIterateWithMask<0x40>(b0,b1,a,r);
     inverseIterateWithMask<0x20>(b0,b1,a,r);
     inverseIterateWithMask<0x10>(b0,b1,a,r);
     inverseIterateWithMask<0x08>(b0,b1,a,r);
     inverseIterateWithMask<0x04>(b0,b1,a,r);
     inverseIterateWithMask<0x02>(b0,b1,a,r);

     //Last iteration:
     if (b0 & 0x01)
         r += a;
     if (b1 & 0x01)
         r+=(uint16_t)256*a0;

     return r;
 }
17
qo'shib qo'ydi
@Flexo Men so'nggi qismni qo'shib oldingi tahrirlarni biroz o'zgartirdim. Keyinchalik yaxshilanish uchun joyni ko'rsangiz, menga xabar bering.
qo'shib qo'ydi muallif Antonio, manba
@SergioFormiggini Menimcha, bu yerda topilgan narsalar - juda yaxshi. Mening javobim ushbu versiyasi ga qarang: Siz taklifingizdan nusxa ko'chirishga ega bo'lganman. Keyin esa - versiyasini olib tashladim. Shu bois, bu erda nimalar samarali ekanligiga ishonaman. Men javobimni ajratish haqida o'ylardim, lekin uni indeks bilan saqlashni afzal ko'raman, aks holda u faqatgina yig'ishni istayman (= to'plash) obro'siga o'xshash bo'ladi, men izlayotgan narsa esa mukammal va to'liq javobdir :).
qo'shib qo'ydi muallif Antonio, manba
@SergioFormiggini Yuqori darajali kod (yoki pastroq bo'lsa-da, pastroq bo'lsa-da, pastroq bo'lsa-da), yoki tezkor kodni (bu holda loopni ochish usullaridan biridir) aniqlay olasizmi?
qo'shib qo'ydi muallif Antonio, manba
qo'shib qo'ydi muallif Antonio, manba
@greybeard Bundan tashqari sbrc dan oldin yana lsr kerak. Izoh: bcc bu holda brcc .
qo'shib qo'ydi muallif Antonio, manba
Javobingizni "tahrirlash uchun sarlavha" va "tahrirlash" ni yozmasdan yoza olasizmi? O'qishni juda yaxshi biladi, uni birovni noldan o'qish uchun ishlaydigan va boshqalarni tahrir qilish tarixini/sharhini kerakli tarzda sharhlab berishga imkon beradigan hikoya bilan yozishni juda yaxshi biladi. (Ehtimol, matnli kitobdagi bobni tayyorlash kabi).
qo'shib qo'ydi muallif Flexo, manba
@Antonio kodi bilan emas, balki-emas. Tahrir qilishga harakat qildim, lekin o'zgartirish juda kichik! :)
qo'shib qo'ydi muallif Sir Jo Black, manba
@Antonio, sizning javobingiz ko'proq javob bo'lishi mumkin deb o'ylayman, shuning uchun o'qiganlarning barchasi turli xil algoritmlarni va aniq javobda tegishli sharhlarni ko'rishlari mumkin!
qo'shib qo'ydi muallif Sir Jo Black, manba
Ha, men tezkor kodni talab qilaman. Kompilyatsiya qilmoqchi bo'lgan kodni kompilyatsiya qilish kerak - chunki AVR xotirasi faqat 8 Kb Flash. Kodni mening muhitimda tuzdim. Endi men urinib ko'rmoqchi edim-ku, lekin menimcha, bu farq minimal bo'ladi! Men faqat ushbu kodni va "ISR" ni yozmoqchi deb o'ylayman.
qo'shib qo'ydi muallif Sir Jo Black, manba
@Antonio, mening muammom ko'payishning tezkor usuliga ega bo'lish edi, balki kichik kodni ishlab chiqarishdir (men sizning savolingizga javob sifatida o'qib chiqsangiz)! Men aytganimdek, uni tarqatadigan ko'paytirgichlarni hal qildim, lekin ko'paytirish algoritmi haqidagi munozarani to'xtatib qo'ydim va keyin uni sinovdan o'tkazaman!
qo'shib qo'ydi muallif Sir Jo Black, manba
lsr yashirinligini saqlash nimani anglatadi adc sbrc qo'shing lsl roli ?
qo'shib qo'ydi muallif greybeard, manba

Xulosa

  1. a va b (original taklif) almashtirishni ko'rib chiqing
  2. Shartli otlar (yaxshi optimallash emas) oldini olishga urinish
  3. Kirish formulasini qayta tiklash (taxminan 35% daromad)
  4. Ikkilamchi siljishni olib tashlash
  5. Loopni ochish: "maqbul" montaj
  6. Kompilyatorni maqbul yig'ish uchun ishonch hosil qiling.

1. Consider swapping a and b

One improvement would be to first compare a and b, and swap them if a: you should use as b the smaller of the two, so that you have the minimum number of cycles. Note that you can avoid swapping by duplicating the code (if (a then jump to a mirrored code section), AmmoI doubt it's worth.


2. Shartli o'tishlardan qochishga urinib ko'rish (muvaffaqiyatli optimallash emas)

Sinash:

uint16_t umul16_(uint16_t a, uint16_t b)
{
    ///Here swap if necessary
    uint16_t accum=0;

    while (b) {
        accum += ((b&1) * uint16_t(0xffff)) & a; //Hopefully this multiplication is optimized away
        b>>=1;
        a+=a;
    }

    return accum;
}

Serxionning fikr-mulohazasidan kelib chiqqan holda, bu yaxshilanishlarni keltirib chiqarmadi.


3. Kirish formulasini qayta ishlash

Maqsad me'morchiligi asosan 8bit yo'riqnomaga ega ekanligini hisobga olsak, kirish va o'zgaruvchining yuqori va quyi 8 bitini ajratsangiz, quyidagilarni yozishingiz mumkin:

a = a1 * 0xff + a0;
b = b1 * 0xff + b0;

a * b = a1 * b1 * 0xffff + a0 * b1 * 0xff + a1 * b0 * 0xff + a0 * b0

Endi esa, 0xffff sizning registringizdan ga yuborishi sababli, a1 * b1 * 0xffff so'zini tashlay olamiz.

(16bit) a * b = a0 * b1 * 0xff + a1 * b0 * 0xff + a0 * b0

Bundan tashqari, a0 * b1 va a1 * b0 atamasi 0xff tufayli 8bitli ko'paytma sifatida ko'rib chiqilishi mumkin: 256 registrdan chiqarilgan .

So far exciting! ... But, here comes reality striking: a0 * b0 has to be treated as a 16 bit multiplication, as you'll have to keep all resulting bits. a0 will have to be kept on 16 bit to allow shift lefts. This multiplication has half of the iterations of a * b, it is in part 8bit (because of b0) Ammoyou still have to take into account the 2 8bit multiplications mentioned before, and the final result composition. We need further reshaping!

Shuning uchun endi b0 ni yig'aman.

(16bit) a * b = a0 * b1 * 0xff + b0 * (a0 + a1 * 0xff)

Ammo

(a0 + a1 * 0xff) = a

Shunday qilib:

(16bit) a * b = a0 * b1 * 0xff + b0 * a

N = asl nusxada a * b ning aylanishlari bo'lsa, hozirda birinchi atama N/2 aylanishi bilan 8bitli ko'paytma, ikkinchisi N/2 aylanishi bilan 16bit * 8bitlik ko'paytirish. M kodi a * b ga o'zgartirilganda ko'rsatmalarning sonini hisobga olgan holda, 8bit * 8bit iteratsiya yo'riqnomaning yarmiga ega, va taxminan 80% dan 16bit * 8bit (bir o'zgarish buyrug'i b0 b) ga nisbatan. Birgalikda N/2 * M/2 + N/2 * M * 0.8 = N * M * 0.65 murakkabligi bor, shuning uchun <35% original N * M ga nisbatan hurmat ko'rsatiladi. promising so'zlari tovushlar.

Bu kod:

uint16_t umul16_(uint16_t a, uint16_t b)
{
    uint8_t res1 = 0;

    uint8_t a0 = a & 0xff; //This effectively needs to copy the data
    uint8_t b0 = b & 0xff; //This should be optimized away
    uint8_t b1 = b >>8; //This should be optimized away

    //Here a0 and b1 could be swapped (to have b1 < a0)
    while (b1) {///Maximum 8 cycles
        if ( (b1 & 1) )
            res1+=a0;
        b1>>=1;
        a0+=a0;
    }

    uint16_t res = (uint16_t) res1 * 256; //Should be optimized away, it's not even a copy!

    //Here swapping wouldn't make much sense
    while (b0) {///Maximum 8 cycles
        if ( (b0 & 1) )
            res+=a;
        b0>>=1;
        a+=a;
    }

    return res;
}

Bundan tashqari, 2 tsikldagi bo'linish, nazariy jihatdan, bir nechta davrlarni o'tkazish imkoniyatini ikki baravar oshirish kerak: N/2 engil ortiqcha bo'lishi mumkin.

a o'zgaruvchilari uchun keraksiz, keyingi kaymalardan saqlanish uchun kichikroq qo'shimcha yaxshilanish. Kichik yon eslatma: agar b0 yoki b1 nol bo'lsa, u holda qo'shimcha 2 ta ko'rsatma bo'ladi. Ammo , shuningdek, eng qimmat bo'lgan b0 va b1 ning birinchi tekshiruvini saqlab qoladi, chunki u nolinchi bayroq holati uchun shartli o'tish uchun o'tish ishidan tekshirolmaydi. pastadir.

uint16_t umul16_(uint16_t a, uint16_t b)
{
    uint8_t res1 = 0;

    uint8_t a0 = a & 0xff; //This effectively needs to copy the data
    uint8_t b0 = b & 0xff; //This should be optimized away
    uint8_t b1 = b >>8; //This should be optimized away

    //Here a0 and b1 could be swapped (to have b1 < a0)
    if ( (b1 & 1) )
        res1+=a0;
    b1>>=1;
    while (b1) {///Maximum 7 cycles
        a0+=a0;
        if ( (b1 & 1) )
            res1+=a0;
        b1>>=1;
    }

    uint16_t res = (uint16_t) res1 * 256; //Should be optimized away, it's not even a copy!

    //Here swapping wouldn't make much sense
    if ( (b0 & 1) )
        res+=a;
    b0>>=1;
    while (b0) {///Maximum 7 cycles
        a+=a;
        if ( (b0 & 1) )
            res+=a;
        b0>>=1;
    }

    return res;
}

4. Ikkilamchi shovqinni olib tashlash

Rivojlanish uchun bo'sh joy bormi? Ha , a0 da baytlar ikki marta </​​em> kaydırılır. Shunday qilib, ikkita ildizni birlashtirishda foyda bo'lishi kerak. Compilerni, biz xohlagan narsani qilish uchun, ayniqsa natija qaydnomasi bilan bajarish uchun ishontirish uchun biroz qiyin bo'lishi mumkin.

Shunday qilib, biz bir xil aylanish jarayonida b0 va b1 . Dastlabki narsa, loop chiqish holati qanday? Hozircha b0 / b1 yordamida vaziyatni o'chirib qo'yish foydalidir, chunki u taymerdan foydalanishni taqiqlaydi. Bundan tashqari, o'ngga siljishdan so'ng, operatsiya natijasi nolga teng bo'lsa, bayroq belgilanishi mumkin va bu bayroq qo'shimcha baholashsiz shartli o'tishga ruxsat berishi mumkin.

Now the loop exit condition could be the failure of (b0 || b1). However this could require expensive computation. One solution is to compare b0 and b1 and jump to 2 different code sections: if b1 > b0 I test the condition on b1, else I test the condition on b0. I prefer another solution, with 2 loops, the first exit when b0 is zero, the second when b1 is zero. There will be cases in which I will do zero iterations in b1. The point is that in the second loop I know b0 is zero, so I can reduce the number of operations performed.

Keling, exit holatini unutamiz va avvalgi bo'limning 2 ta qismiga qo'shilishga harakat qilaylik.

uint16_t umul16_(uint16_t a, uint16_t b)
{
    uint16_t res = 0;

    uint8_t b0 = b & 0xff; //This should be optimized away
    uint8_t b1 = b >>8; //This should be optimized away

    //Swapping probably doesn't make much sense anymore
    if ( (b1 & 1) )
        res+=(uint16_t)((uint8_t)(a && 0xff))*256;
    //Hopefully the compiler understands it has simply to add the low 8bit register of a to the high 8bit register of res

    if ( (b0 & 1) )
        res+=a;

    b1>>=1;
    b0>>=1;
    while (b0) {///N cycles, maximum 7
        a+=a;
        if ( (b1 & 1) )
            res+=(uint16_t)((uint8_t)(a & 0xff))*256;
        if ( (b0 & 1) )
            res+=a;
        b1>>=1;
        b0>>=1; //I try to put as last the one that will leave the carry flag in the desired state
    }

    uint8_t a0 = a & 0xff; //Again, not a real copy Ammoa register selection

    while (b1) {///P cycles, maximum 7 - N cycles
        a0+=a0;
        if ( (b1 & 1) )
            res+=(uint16_t) a0 * 256;
        b1>>=1;
    }
    return res;
}

Thanks Sergio for providing the assembly generated (-Ofast). At first glance, considering the outrageous amount of mov in the code, it seems the compiler did not interpret as I wanted the hints I gave to him to interpret the registers.

Inputs are: r22,r23 and r24,25.
AVR Instruction Set: Quick reference, Detailed documentation

sbrs //Tests a single bit in a register and skips the next instruction if the bit is set. Skip takes 2 clocks. 
ldi//Load immediate, 1 clock
sbiw//Subtracts immediate to *word*, 2 clocks

    00000010 :
      10:    70 ff           sbrs    r23, 0
      12:    39 c0           rjmp    .+114        ; 0x86 <__SREG__+0x47>
      14:    41 e0           ldi    r20, 0x01    ; 1
      16:    00 97           sbiw    r24, 0x00    ; 0
      18:    c9 f1           breq    .+114        ; 0x8c <__SREG__+0x4d>
      1a:    34 2f           mov    r19, r20
      1c:    20 e0           ldi    r18, 0x00    ; 0
      1e:    60 ff           sbrs    r22, 0
      20:    07 c0           rjmp    .+14         ; 0x30 
      22:    28 0f           add    r18, r24
      24:    39 1f           adc    r19, r25
      26:    04 c0           rjmp    .+8          ; 0x30 
      28:    e4 2f           mov    r30, r20
      2a:    45 2f           mov    r20, r21
      2c:    2e 2f           mov    r18, r30
      2e:    34 2f           mov    r19, r20
      30:    76 95           lsr    r23
      32:    66 95           lsr    r22
      34:    b9 f0           breq    .+46         ; 0x64 <__SREG__+0x25>
      36:    88 0f           add    r24, r24
      38:    99 1f           adc    r25, r25
      3a:    58 2f           mov    r21, r24
      3c:    44 27           eor    r20, r20
      3e:    42 0f           add    r20, r18
      40:    53 1f           adc    r21, r19
      42:    70 ff           sbrs    r23, 0
      44:    02 c0           rjmp    .+4          ; 0x4a <__SREG__+0xb>
      46:    24 2f           mov    r18, r20
      48:    35 2f           mov    r19, r21
      4a:    42 2f           mov    r20, r18
      4c:    53 2f           mov    r21, r19
      4e:    48 0f           add    r20, r24
      50:    59 1f           adc    r21, r25
      52:    60 fd           sbrc    r22, 0
      54:    e9 cf           rjmp    .-46         ; 0x28 
      56:    e2 2f           mov    r30, r18
      58:    43 2f           mov    r20, r19
      5a:    e8 cf           rjmp    .-48         ; 0x2c 
      5c:    95 2f           mov    r25, r21
      5e:    24 2f           mov    r18, r20
      60:    39 2f           mov    r19, r25
      62:    76 95           lsr    r23
      64:    77 23           and    r23, r23
      66:    61 f0           breq    .+24         ; 0x80 <__SREG__+0x41>
      68:    88 0f           add    r24, r24
      6a:    48 2f           mov    r20, r24
      6c:    50 e0           ldi    r21, 0x00    ; 0
      6e:    54 2f           mov    r21, r20
      70:    44 27           eor    r20, r20
      72:    42 0f           add    r20, r18
      74:    53 1f           adc    r21, r19
      76:    70 fd           sbrc    r23, 0
      78:    f1 cf           rjmp    .-30         ; 0x5c <__SREG__+0x1d>
      7a:    42 2f           mov    r20, r18
      7c:    93 2f           mov    r25, r19
      7e:    ef cf           rjmp    .-34         ; 0x5e <__SREG__+0x1f>
      80:    82 2f           mov    r24, r18
      82:    93 2f           mov    r25, r19
      84:    08 95           ret
      86:    20 e0           ldi    r18, 0x00    ; 0
      88:    30 e0           ldi    r19, 0x00    ; 0
      8a:    c9 cf           rjmp    .-110        ; 0x1e 
      8c:    40 e0           ldi    r20, 0x00    ; 0
      8e:    c5 cf           rjmp    .-118        ; 0x1a 

5. Loopni ochish: "maqbul" montaj

Ushbu ma'lumotlarning barchasida, keling, me'moriy cheklovlarni hisobga olgan holda, "maqbul" echim qanday bo'lishini tushunishga harakat qilaylik. "Optimal" so'zi keltirilgan, chunki "maqbul" narsa kirish ma'lumotlariga va optimallashtirishni xohlayotgan narsalarga bog'liqdir. Eng yomon holatlarda davriy sonlar sonini optimallashtirishni xohlaymiz deb taxmin qilamiz. . Agar biz eng yomon vaziyatga boradigan bo'lsak, loop unrolling o'rtacha tanlovdir: 8 tsikl borligini bilamiz va biz tugalladik (agar b0 va b1 nol bo'lsa). Hozircha biz "pastga siljiymiz va biz nol bayroqni tekshiramiz" deb ishlatdik. Ushbu talabni bekor qilsak, boshqa hiylalarni ishlatishimiz mumkin: biz o`zimizni o`zgartiramiz va natijani yangilash kerakmi yoki yo'qligini bilish uchun uskuna bitini tekshiramiz . Buyruqlar to'plamini hisobga olgan holda, "tushuntirish" kodi yordamida ko'rsatmalar quyidagi holga aylanadi.

//Input: a = a1 * 256 + a0, b = b1 * 256 + b0
//Output: r = r1 * 256 + r0

Preliminary:
P0 r0 = 0 (CLR)
P1 r1 = 0 (CLR)

Main block:
0 Shift right b0 (LSR)
1 If carry is not set skip 2 instructions = jump to 4 (BRCC)
2 r0 = r0 + a0 (ADD)
3 r1 = r1 + a1 + carry from prev. (ADC)
4 Shift right b1 (LSR)
5 If carry is not set skip 1 instruction = jump to 7 (BRCC)
6 r1 = r1 + a0 (ADD)
7 a0 = a0 + a0 (ADD)  
8 a1 = a1 + a1 + carry from prev. (ADC)

[Repeat same instructions for another 7 times]

Filialda hech qanday sakrash bo'lmasa, 1 ta ko'rsatma talab etiladi. Boshqa barcha ko'rsatmalar 1 ta sikl. Shunday qilib, b1 holati cycler soniga ta'sir qilmaydi, b0 = 1 bo'lsa, bizda 9 tsikl bor va b0 = 0 bo'lsa, 8 tsikl bor. Initializatsiyani hisoblash, 8 yineleme va a0 va a1 ning oxirgi yangilanishini atash, (b0 = 11111111b), bizda 8 * 9 + 2 - 2 = 72 tsiklli mavjud. Qaysi C ++ dasturini ishlab chiqaruvchi derazani ishlab chiqarishga ishontirishini bilmasdim. Balki:

 void iterate(uint8_t& b0,uint8_t& b1,uint16_t& a, uint16_t& r) {
     const uint8_t temp0 = b0;
     b0 >>=1;
     if (temp0 & 0x01) {//Will this convince him to use the carry flag?
         r += a;
     }
     const uint8_t temp1 = b1;
     b1 >>=1;
     if (temp1 & 0x01) {
         r+=(uint16_t)((uint8_t)(a & 0xff))*256;
     }
     a += a;
 }

 uint16_t umul16_(uint16_t a, uint16_t b) {
     uint16_t r = 0;
     uint8_t b0 = b & 0xff;
     uint8_t b1 = b >>8;

     iterate(b0,b1,a,r);
     iterate(b0,b1,a,r);
     iterate(b0,b1,a,r);
     iterate(b0,b1,a,r);
     iterate(b0,b1,a,r);
     iterate(b0,b1,a,r);
     iterate(b0,b1,a,r);
     iterate(b0,b1,a,r); //Hopefully he understands he doesn't need the last update for variable a
     return r;
 }

Ammo, oldingi natijaga ko'ra, kerakli kodni olish uchun chindan ham montajga o'tish kerak!


Nihoyat, aylanish jarayonini yanada chuqurroq talqin qilish ham mumkin: sbrc/sbrs yo'riqnomalari ma'lum bir bitikni tekshirishga imkon beradi. Shuning uchun biz b0 va b1-ni almashtirishdan qochishimiz mumkin, va har bir tsikldagi boshqa bitni tekshirib ko'rishimiz mumkin. Muammo shundaki, ushbu yo'riqnomalar faqat maxsus o'tish uchun emas, balki keyingi ko'rsatmaga o'tishga imkon beradi. Shunday qilib, "tushuntirish kodi" da shunday bo'ladi:

Main block:
0 Test Nth bit of b0 (SBRS). If set jump to 2 (+ 1cycle) otherwise continue with 1
1 Jump to 4 (RJMP)
2 r0 = r0 + a0 (ADD)
3 r1 = r1 + a1 + carry from prev. (ADC)
4 Test Nth bit of (SBRC). If cleared jump to 6 (+ 1cycle) otherwise continue with 5
5 r1 = r1 + a0 (ADD)
6 a0 = a0 + a0 (ADD)  
7 a1 = a1 + a1 + carry from prev. (ADC)

Ikkinchi o'zgarish 1 ta siklni saqlab qolishga imkon beradi, ikkinchidan ikkinchisida esa aniq ustunlik yo'q. Biroq, menimcha, C ++ kodi derleyici uchun sharhni osonroq bo'lishi mumkin. A0 va a1 ning so'nggi yangilanishlarini ishga tushirish va ishga tushirish 8 tsiklni hisobga olgan holda, endi biz 64 döngü ga egamiz.

C ++ kodi:

 template
 void iterateWithMask(const uint8_t& b0,const uint8_t& b1, uint16_t& a, uint16_t& r) {
     if (b0 & mask)
         r += a;
     if (b1 & mask)
         r+=(uint16_t)((uint8_t)(a & 0xff))*256;
     a += a;
 }

 uint16_t umul16_(uint16_t a, const uint16_t b) {
     uint16_t r = 0;
     const uint8_t b0 = b & 0xff;
     const uint8_t b1 = b >>8;

     iterateWithMask<0x01>(b0,b1,a,r);
     iterateWithMask<0x02>(b0,b1,a,r);
     iterateWithMask<0x04>(b0,b1,a,r);
     iterateWithMask<0x08>(b0,b1,a,r);
     iterateWithMask<0x10>(b0,b1,a,r);
     iterateWithMask<0x20>(b0,b1,a,r);
     iterateWithMask<0x40>(b0,b1,a,r);
     iterateWithMask<0x80>(b0,b1,a,r);

     //Hopefully he understands he doesn't need the last update for a
     return r;
 }

Note that in this implementation the 0x01, 0x02 are not a real value, Ammojust a hint to the compiler to know which bit to test. Therefore, the mask cannot be obtained by shifting right: differently from all other functions seen so far, this has really no equivalent loop version.

Bu katta muammo

r+=(uint16_t)((uint8_t)(a & 0xff))*256;

r pastki registri bilan a pastki registri bilan yuqori reestrning yig'indisi bo'lishi kerak. Istaganim kabi talqin qilinmaydi. Boshqa variant:

r+=(uint16_t) 256 *((uint8_t)(a & 0xff));

6. Kompilyatorni maqbul yig'ish uchun ishonch hosil qilish

We can also keep a constant, and shift instead the result r. In this case we process b starting from the most significant bit. The complexity is equivalent, Ammoit might be easier for the compiler to digest. Also, this time we have to be careful to write explicitly the last loop, which must not do a further shift right for r.

 template
 void inverseIterateWithMask(const uint8_t& b0,const uint8_t& b1,const uint16_t& a, const uint8_t& a0, uint16_t& r) {
     if (b0 & mask)
         r += a;
     if (b1 & mask)
         r+=(uint16_t)256*a0; //Hopefully easier to understand for the compiler?
     r += r;
 }

 uint16_t umul16_(const uint16_t a, const uint16_t b) {
     uint16_t r = 0;
     const uint8_t b0 = b & 0xff;
     const uint8_t b1 = b >>8;
     const uint8_t a0 = a & 0xff;

     inverseIterateWithMask<0x80>(b0,b1,a,r);
     inverseIterateWithMask<0x40>(b0,b1,a,r);
     inverseIterateWithMask<0x20>(b0,b1,a,r);
     inverseIterateWithMask<0x10>(b0,b1,a,r);
     inverseIterateWithMask<0x08>(b0,b1,a,r);
     inverseIterateWithMask<0x04>(b0,b1,a,r);
     inverseIterateWithMask<0x02>(b0,b1,a,r);

     //Last iteration:
     if (b0 & 0x01)
         r += a;
     if (b1 & 0x01)
         r+=(uint16_t)256*a0;

     return r;
 }
17
qo'shib qo'ydi
@Flexo Men so'nggi qismni qo'shib oldingi tahrirlarni biroz o'zgartirdim. Keyinchalik yaxshilanish uchun joyni ko'rsangiz, menga xabar bering.
qo'shib qo'ydi muallif Antonio, manba
@SergioFormiggini Menimcha, bu yerda topilgan narsalar - juda yaxshi. Mening javobim ushbu versiyasi ga qarang: Siz taklifingizdan nusxa ko'chirishga ega bo'lganman. Keyin esa - versiyasini olib tashladim. Shu bois, bu erda nimalar samarali ekanligiga ishonaman. Men javobimni ajratish haqida o'ylardim, lekin uni indeks bilan saqlashni afzal ko'raman, aks holda u faqatgina yig'ishni istayman (= to'plash) obro'siga o'xshash bo'ladi, men izlayotgan narsa esa mukammal va to'liq javobdir :).
qo'shib qo'ydi muallif Antonio, manba
@SergioFormiggini Yuqori darajali kod (yoki pastroq bo'lsa-da, pastroq bo'lsa-da, pastroq bo'lsa-da), yoki tezkor kodni (bu holda loopni ochish usullaridan biridir) aniqlay olasizmi?
qo'shib qo'ydi muallif Antonio, manba
qo'shib qo'ydi muallif Antonio, manba
@greybeard Bundan tashqari sbrc dan oldin yana lsr kerak. Izoh: bcc bu holda brcc .
qo'shib qo'ydi muallif Antonio, manba
Javobingizni "tahrirlash uchun sarlavha" va "tahrirlash" ni yozmasdan yoza olasizmi? O'qishni juda yaxshi biladi, uni birovni noldan o'qish uchun ishlaydigan va boshqalarni tahrir qilish tarixini/sharhini kerakli tarzda sharhlab berishga imkon beradigan hikoya bilan yozishni juda yaxshi biladi. (Ehtimol, matnli kitobdagi bobni tayyorlash kabi).
qo'shib qo'ydi muallif Flexo, manba
@Antonio kodi bilan emas, balki-emas. Tahrir qilishga harakat qildim, lekin o'zgartirish juda kichik! :)
qo'shib qo'ydi muallif Sir Jo Black, manba
@Antonio, sizning javobingiz ko'proq javob bo'lishi mumkin deb o'ylayman, shuning uchun o'qiganlarning barchasi turli xil algoritmlarni va aniq javobda tegishli sharhlarni ko'rishlari mumkin!
qo'shib qo'ydi muallif Sir Jo Black, manba
Ha, men tezkor kodni talab qilaman. Kompilyatsiya qilmoqchi bo'lgan kodni kompilyatsiya qilish kerak - chunki AVR xotirasi faqat 8 Kb Flash. Kodni mening muhitimda tuzdim. Endi men urinib ko'rmoqchi edim-ku, lekin menimcha, bu farq minimal bo'ladi! Men faqat ushbu kodni va "ISR" ni yozmoqchi deb o'ylayman.
qo'shib qo'ydi muallif Sir Jo Black, manba
@Antonio, mening muammom ko'payishning tezkor usuliga ega bo'lish edi, balki kichik kodni ishlab chiqarishdir (men sizning savolingizga javob sifatida o'qib chiqsangiz)! Men aytganimdek, uni tarqatadigan ko'paytirgichlarni hal qildim, lekin ko'paytirish algoritmi haqidagi munozarani to'xtatib qo'ydim va keyin uni sinovdan o'tkazaman!
qo'shib qo'ydi muallif Sir Jo Black, manba
lsr yashirinligini saqlash nimani anglatadi adc sbrc qo'shing lsl roli ?
qo'shib qo'ydi muallif greybeard, manba

Xulosa

  1. a va b (original taklif) almashtirishni ko'rib chiqing
  2. Shartli otlar (yaxshi optimallash emas) oldini olishga urinish
  3. Kirish formulasini qayta tiklash (taxminan 35% daromad)
  4. Ikkilamchi siljishni olib tashlash
  5. Loopni ochish: "maqbul" montaj
  6. Kompilyatorni maqbul yig'ish uchun ishonch hosil qiling.

1. Consider swapping a and b

One improvement would be to first compare a and b, and swap them if a: you should use as b the smaller of the two, so that you have the minimum number of cycles. Note that you can avoid swapping by duplicating the code (if (a then jump to a mirrored code section), AmmoI doubt it's worth.


2. Shartli o'tishlardan qochishga urinib ko'rish (muvaffaqiyatli optimallash emas)

Sinash:

uint16_t umul16_(uint16_t a, uint16_t b)
{
    ///Here swap if necessary
    uint16_t accum=0;

    while (b) {
        accum += ((b&1) * uint16_t(0xffff)) & a; //Hopefully this multiplication is optimized away
        b>>=1;
        a+=a;
    }

    return accum;
}

Serxionning fikr-mulohazasidan kelib chiqqan holda, bu yaxshilanishlarni keltirib chiqarmadi.


3. Kirish formulasini qayta ishlash

Maqsad me'morchiligi asosan 8bit yo'riqnomaga ega ekanligini hisobga olsak, kirish va o'zgaruvchining yuqori va quyi 8 bitini ajratsangiz, quyidagilarni yozishingiz mumkin:

a = a1 * 0xff + a0;
b = b1 * 0xff + b0;

a * b = a1 * b1 * 0xffff + a0 * b1 * 0xff + a1 * b0 * 0xff + a0 * b0

Endi esa, 0xffff sizning registringizdan ga yuborishi sababli, a1 * b1 * 0xffff so'zini tashlay olamiz.

(16bit) a * b = a0 * b1 * 0xff + a1 * b0 * 0xff + a0 * b0

Bundan tashqari, a0 * b1 va a1 * b0 atamasi 0xff tufayli 8bitli ko'paytma sifatida ko'rib chiqilishi mumkin: 256 registrdan chiqarilgan .

So far exciting! ... But, here comes reality striking: a0 * b0 has to be treated as a 16 bit multiplication, as you'll have to keep all resulting bits. a0 will have to be kept on 16 bit to allow shift lefts. This multiplication has half of the iterations of a * b, it is in part 8bit (because of b0) Ammoyou still have to take into account the 2 8bit multiplications mentioned before, and the final result composition. We need further reshaping!

Shuning uchun endi b0 ni yig'aman.

(16bit) a * b = a0 * b1 * 0xff + b0 * (a0 + a1 * 0xff)

Ammo

(a0 + a1 * 0xff) = a

Shunday qilib:

(16bit) a * b = a0 * b1 * 0xff + b0 * a

N = asl nusxada a * b ning aylanishlari bo'lsa, hozirda birinchi atama N/2 aylanishi bilan 8bitli ko'paytma, ikkinchisi N/2 aylanishi bilan 16bit * 8bitlik ko'paytirish. M kodi a * b ga o'zgartirilganda ko'rsatmalarning sonini hisobga olgan holda, 8bit * 8bit iteratsiya yo'riqnomaning yarmiga ega, va taxminan 80% dan 16bit * 8bit (bir o'zgarish buyrug'i b0 b) ga nisbatan. Birgalikda N/2 * M/2 + N/2 * M * 0.8 = N * M * 0.65 murakkabligi bor, shuning uchun <35% original N * M ga nisbatan hurmat ko'rsatiladi. promising so'zlari tovushlar.

Bu kod:

uint16_t umul16_(uint16_t a, uint16_t b)
{
    uint8_t res1 = 0;

    uint8_t a0 = a & 0xff; //This effectively needs to copy the data
    uint8_t b0 = b & 0xff; //This should be optimized away
    uint8_t b1 = b >>8; //This should be optimized away

    //Here a0 and b1 could be swapped (to have b1 < a0)
    while (b1) {///Maximum 8 cycles
        if ( (b1 & 1) )
            res1+=a0;
        b1>>=1;
        a0+=a0;
    }

    uint16_t res = (uint16_t) res1 * 256; //Should be optimized away, it's not even a copy!

    //Here swapping wouldn't make much sense
    while (b0) {///Maximum 8 cycles
        if ( (b0 & 1) )
            res+=a;
        b0>>=1;
        a+=a;
    }

    return res;
}

Bundan tashqari, 2 tsikldagi bo'linish, nazariy jihatdan, bir nechta davrlarni o'tkazish imkoniyatini ikki baravar oshirish kerak: N/2 engil ortiqcha bo'lishi mumkin.

a o'zgaruvchilari uchun keraksiz, keyingi kaymalardan saqlanish uchun kichikroq qo'shimcha yaxshilanish. Kichik yon eslatma: agar b0 yoki b1 nol bo'lsa, u holda qo'shimcha 2 ta ko'rsatma bo'ladi. Ammo , shuningdek, eng qimmat bo'lgan b0 va b1 ning birinchi tekshiruvini saqlab qoladi, chunki u nolinchi bayroq holati uchun shartli o'tish uchun o'tish ishidan tekshirolmaydi. pastadir.

uint16_t umul16_(uint16_t a, uint16_t b)
{
    uint8_t res1 = 0;

    uint8_t a0 = a & 0xff; //This effectively needs to copy the data
    uint8_t b0 = b & 0xff; //This should be optimized away
    uint8_t b1 = b >>8; //This should be optimized away

    //Here a0 and b1 could be swapped (to have b1 < a0)
    if ( (b1 & 1) )
        res1+=a0;
    b1>>=1;
    while (b1) {///Maximum 7 cycles
        a0+=a0;
        if ( (b1 & 1) )
            res1+=a0;
        b1>>=1;
    }

    uint16_t res = (uint16_t) res1 * 256; //Should be optimized away, it's not even a copy!

    //Here swapping wouldn't make much sense
    if ( (b0 & 1) )
        res+=a;
    b0>>=1;
    while (b0) {///Maximum 7 cycles
        a+=a;
        if ( (b0 & 1) )
            res+=a;
        b0>>=1;
    }

    return res;
}

4. Ikkilamchi shovqinni olib tashlash

Rivojlanish uchun bo'sh joy bormi? Ha , a0 da baytlar ikki marta </​​em> kaydırılır. Shunday qilib, ikkita ildizni birlashtirishda foyda bo'lishi kerak. Compilerni, biz xohlagan narsani qilish uchun, ayniqsa natija qaydnomasi bilan bajarish uchun ishontirish uchun biroz qiyin bo'lishi mumkin.

Shunday qilib, biz bir xil aylanish jarayonida b0 va b1 . Dastlabki narsa, loop chiqish holati qanday? Hozircha b0 / b1 yordamida vaziyatni o'chirib qo'yish foydalidir, chunki u taymerdan foydalanishni taqiqlaydi. Bundan tashqari, o'ngga siljishdan so'ng, operatsiya natijasi nolga teng bo'lsa, bayroq belgilanishi mumkin va bu bayroq qo'shimcha baholashsiz shartli o'tishga ruxsat berishi mumkin.

Now the loop exit condition could be the failure of (b0 || b1). However this could require expensive computation. One solution is to compare b0 and b1 and jump to 2 different code sections: if b1 > b0 I test the condition on b1, else I test the condition on b0. I prefer another solution, with 2 loops, the first exit when b0 is zero, the second when b1 is zero. There will be cases in which I will do zero iterations in b1. The point is that in the second loop I know b0 is zero, so I can reduce the number of operations performed.

Keling, exit holatini unutamiz va avvalgi bo'limning 2 ta qismiga qo'shilishga harakat qilaylik.

uint16_t umul16_(uint16_t a, uint16_t b)
{
    uint16_t res = 0;

    uint8_t b0 = b & 0xff; //This should be optimized away
    uint8_t b1 = b >>8; //This should be optimized away

    //Swapping probably doesn't make much sense anymore
    if ( (b1 & 1) )
        res+=(uint16_t)((uint8_t)(a && 0xff))*256;
    //Hopefully the compiler understands it has simply to add the low 8bit register of a to the high 8bit register of res

    if ( (b0 & 1) )
        res+=a;

    b1>>=1;
    b0>>=1;
    while (b0) {///N cycles, maximum 7
        a+=a;
        if ( (b1 & 1) )
            res+=(uint16_t)((uint8_t)(a & 0xff))*256;
        if ( (b0 & 1) )
            res+=a;
        b1>>=1;
        b0>>=1; //I try to put as last the one that will leave the carry flag in the desired state
    }

    uint8_t a0 = a & 0xff; //Again, not a real copy Ammoa register selection

    while (b1) {///P cycles, maximum 7 - N cycles
        a0+=a0;
        if ( (b1 & 1) )
            res+=(uint16_t) a0 * 256;
        b1>>=1;
    }
    return res;
}

Thanks Sergio for providing the assembly generated (-Ofast). At first glance, considering the outrageous amount of mov in the code, it seems the compiler did not interpret as I wanted the hints I gave to him to interpret the registers.

Inputs are: r22,r23 and r24,25.
AVR Instruction Set: Quick reference, Detailed documentation

sbrs //Tests a single bit in a register and skips the next instruction if the bit is set. Skip takes 2 clocks. 
ldi//Load immediate, 1 clock
sbiw//Subtracts immediate to *word*, 2 clocks

    00000010 :
      10:    70 ff           sbrs    r23, 0
      12:    39 c0           rjmp    .+114        ; 0x86 <__SREG__+0x47>
      14:    41 e0           ldi    r20, 0x01    ; 1
      16:    00 97           sbiw    r24, 0x00    ; 0
      18:    c9 f1           breq    .+114        ; 0x8c <__SREG__+0x4d>
      1a:    34 2f           mov    r19, r20
      1c:    20 e0           ldi    r18, 0x00    ; 0
      1e:    60 ff           sbrs    r22, 0
      20:    07 c0           rjmp    .+14         ; 0x30 
      22:    28 0f           add    r18, r24
      24:    39 1f           adc    r19, r25
      26:    04 c0           rjmp    .+8          ; 0x30 
      28:    e4 2f           mov    r30, r20
      2a:    45 2f           mov    r20, r21
      2c:    2e 2f           mov    r18, r30
      2e:    34 2f           mov    r19, r20
      30:    76 95           lsr    r23
      32:    66 95           lsr    r22
      34:    b9 f0           breq    .+46         ; 0x64 <__SREG__+0x25>
      36:    88 0f           add    r24, r24
      38:    99 1f           adc    r25, r25
      3a:    58 2f           mov    r21, r24
      3c:    44 27           eor    r20, r20
      3e:    42 0f           add    r20, r18
      40:    53 1f           adc    r21, r19
      42:    70 ff           sbrs    r23, 0
      44:    02 c0           rjmp    .+4          ; 0x4a <__SREG__+0xb>
      46:    24 2f           mov    r18, r20
      48:    35 2f           mov    r19, r21
      4a:    42 2f           mov    r20, r18
      4c:    53 2f           mov    r21, r19
      4e:    48 0f           add    r20, r24
      50:    59 1f           adc    r21, r25
      52:    60 fd           sbrc    r22, 0
      54:    e9 cf           rjmp    .-46         ; 0x28 
      56:    e2 2f           mov    r30, r18
      58:    43 2f           mov    r20, r19
      5a:    e8 cf           rjmp    .-48         ; 0x2c 
      5c:    95 2f           mov    r25, r21
      5e:    24 2f           mov    r18, r20
      60:    39 2f           mov    r19, r25
      62:    76 95           lsr    r23
      64:    77 23           and    r23, r23
      66:    61 f0           breq    .+24         ; 0x80 <__SREG__+0x41>
      68:    88 0f           add    r24, r24
      6a:    48 2f           mov    r20, r24
      6c:    50 e0           ldi    r21, 0x00    ; 0
      6e:    54 2f           mov    r21, r20
      70:    44 27           eor    r20, r20
      72:    42 0f           add    r20, r18
      74:    53 1f           adc    r21, r19
      76:    70 fd           sbrc    r23, 0
      78:    f1 cf           rjmp    .-30         ; 0x5c <__SREG__+0x1d>
      7a:    42 2f           mov    r20, r18
      7c:    93 2f           mov    r25, r19
      7e:    ef cf           rjmp    .-34         ; 0x5e <__SREG__+0x1f>
      80:    82 2f           mov    r24, r18
      82:    93 2f           mov    r25, r19
      84:    08 95           ret
      86:    20 e0           ldi    r18, 0x00    ; 0
      88:    30 e0           ldi    r19, 0x00    ; 0
      8a:    c9 cf           rjmp    .-110        ; 0x1e 
      8c:    40 e0           ldi    r20, 0x00    ; 0
      8e:    c5 cf           rjmp    .-118        ; 0x1a 

5. Loopni ochish: "maqbul" montaj

Ushbu ma'lumotlarning barchasida, keling, me'moriy cheklovlarni hisobga olgan holda, "maqbul" echim qanday bo'lishini tushunishga harakat qilaylik. "Optimal" so'zi keltirilgan, chunki "maqbul" narsa kirish ma'lumotlariga va optimallashtirishni xohlayotgan narsalarga bog'liqdir. Eng yomon holatlarda davriy sonlar sonini optimallashtirishni xohlaymiz deb taxmin qilamiz. . Agar biz eng yomon vaziyatga boradigan bo'lsak, loop unrolling o'rtacha tanlovdir: 8 tsikl borligini bilamiz va biz tugalladik (agar b0 va b1 nol bo'lsa). Hozircha biz "pastga siljiymiz va biz nol bayroqni tekshiramiz" deb ishlatdik. Ushbu talabni bekor qilsak, boshqa hiylalarni ishlatishimiz mumkin: biz o`zimizni o`zgartiramiz va natijani yangilash kerakmi yoki yo'qligini bilish uchun uskuna bitini tekshiramiz . Buyruqlar to'plamini hisobga olgan holda, "tushuntirish" kodi yordamida ko'rsatmalar quyidagi holga aylanadi.

//Input: a = a1 * 256 + a0, b = b1 * 256 + b0
//Output: r = r1 * 256 + r0

Preliminary:
P0 r0 = 0 (CLR)
P1 r1 = 0 (CLR)

Main block:
0 Shift right b0 (LSR)
1 If carry is not set skip 2 instructions = jump to 4 (BRCC)
2 r0 = r0 + a0 (ADD)
3 r1 = r1 + a1 + carry from prev. (ADC)
4 Shift right b1 (LSR)
5 If carry is not set skip 1 instruction = jump to 7 (BRCC)
6 r1 = r1 + a0 (ADD)
7 a0 = a0 + a0 (ADD)  
8 a1 = a1 + a1 + carry from prev. (ADC)

[Repeat same instructions for another 7 times]

Filialda hech qanday sakrash bo'lmasa, 1 ta ko'rsatma talab etiladi. Boshqa barcha ko'rsatmalar 1 ta sikl. Shunday qilib, b1 holati cycler soniga ta'sir qilmaydi, b0 = 1 bo'lsa, bizda 9 tsikl bor va b0 = 0 bo'lsa, 8 tsikl bor. Initializatsiyani hisoblash, 8 yineleme va a0 va a1 ning oxirgi yangilanishini atash, (b0 = 11111111b), bizda 8 * 9 + 2 - 2 = 72 tsiklli mavjud. Qaysi C ++ dasturini ishlab chiqaruvchi derazani ishlab chiqarishga ishontirishini bilmasdim. Balki:

 void iterate(uint8_t& b0,uint8_t& b1,uint16_t& a, uint16_t& r) {
     const uint8_t temp0 = b0;
     b0 >>=1;
     if (temp0 & 0x01) {//Will this convince him to use the carry flag?
         r += a;
     }
     const uint8_t temp1 = b1;
     b1 >>=1;
     if (temp1 & 0x01) {
         r+=(uint16_t)((uint8_t)(a & 0xff))*256;
     }
     a += a;
 }

 uint16_t umul16_(uint16_t a, uint16_t b) {
     uint16_t r = 0;
     uint8_t b0 = b & 0xff;
     uint8_t b1 = b >>8;

     iterate(b0,b1,a,r);
     iterate(b0,b1,a,r);
     iterate(b0,b1,a,r);
     iterate(b0,b1,a,r);
     iterate(b0,b1,a,r);
     iterate(b0,b1,a,r);
     iterate(b0,b1,a,r);
     iterate(b0,b1,a,r); //Hopefully he understands he doesn't need the last update for variable a
     return r;
 }

Ammo, oldingi natijaga ko'ra, kerakli kodni olish uchun chindan ham montajga o'tish kerak!


Nihoyat, aylanish jarayonini yanada chuqurroq talqin qilish ham mumkin: sbrc/sbrs yo'riqnomalari ma'lum bir bitikni tekshirishga imkon beradi. Shuning uchun biz b0 va b1-ni almashtirishdan qochishimiz mumkin, va har bir tsikldagi boshqa bitni tekshirib ko'rishimiz mumkin. Muammo shundaki, ushbu yo'riqnomalar faqat maxsus o'tish uchun emas, balki keyingi ko'rsatmaga o'tishga imkon beradi. Shunday qilib, "tushuntirish kodi" da shunday bo'ladi:

Main block:
0 Test Nth bit of b0 (SBRS). If set jump to 2 (+ 1cycle) otherwise continue with 1
1 Jump to 4 (RJMP)
2 r0 = r0 + a0 (ADD)
3 r1 = r1 + a1 + carry from prev. (ADC)
4 Test Nth bit of (SBRC). If cleared jump to 6 (+ 1cycle) otherwise continue with 5
5 r1 = r1 + a0 (ADD)
6 a0 = a0 + a0 (ADD)  
7 a1 = a1 + a1 + carry from prev. (ADC)

Ikkinchi o'zgarish 1 ta siklni saqlab qolishga imkon beradi, ikkinchidan ikkinchisida esa aniq ustunlik yo'q. Biroq, menimcha, C ++ kodi derleyici uchun sharhni osonroq bo'lishi mumkin. A0 va a1 ning so'nggi yangilanishlarini ishga tushirish va ishga tushirish 8 tsiklni hisobga olgan holda, endi biz 64 döngü ga egamiz.

C ++ kodi:

 template
 void iterateWithMask(const uint8_t& b0,const uint8_t& b1, uint16_t& a, uint16_t& r) {
     if (b0 & mask)
         r += a;
     if (b1 & mask)
         r+=(uint16_t)((uint8_t)(a & 0xff))*256;
     a += a;
 }

 uint16_t umul16_(uint16_t a, const uint16_t b) {
     uint16_t r = 0;
     const uint8_t b0 = b & 0xff;
     const uint8_t b1 = b >>8;

     iterateWithMask<0x01>(b0,b1,a,r);
     iterateWithMask<0x02>(b0,b1,a,r);
     iterateWithMask<0x04>(b0,b1,a,r);
     iterateWithMask<0x08>(b0,b1,a,r);
     iterateWithMask<0x10>(b0,b1,a,r);
     iterateWithMask<0x20>(b0,b1,a,r);
     iterateWithMask<0x40>(b0,b1,a,r);
     iterateWithMask<0x80>(b0,b1,a,r);

     //Hopefully he understands he doesn't need the last update for a
     return r;
 }

Note that in this implementation the 0x01, 0x02 are not a real value, Ammojust a hint to the compiler to know which bit to test. Therefore, the mask cannot be obtained by shifting right: differently from all other functions seen so far, this has really no equivalent loop version.

Bu katta muammo

r+=(uint16_t)((uint8_t)(a & 0xff))*256;

r pastki registri bilan a pastki registri bilan yuqori reestrning yig'indisi bo'lishi kerak. Istaganim kabi talqin qilinmaydi. Boshqa variant:

r+=(uint16_t) 256 *((uint8_t)(a & 0xff));

6. Kompilyatorni maqbul yig'ish uchun ishonch hosil qilish

We can also keep a constant, and shift instead the result r. In this case we process b starting from the most significant bit. The complexity is equivalent, Ammoit might be easier for the compiler to digest. Also, this time we have to be careful to write explicitly the last loop, which must not do a further shift right for r.

 template
 void inverseIterateWithMask(const uint8_t& b0,const uint8_t& b1,const uint16_t& a, const uint8_t& a0, uint16_t& r) {
     if (b0 & mask)
         r += a;
     if (b1 & mask)
         r+=(uint16_t)256*a0; //Hopefully easier to understand for the compiler?
     r += r;
 }

 uint16_t umul16_(const uint16_t a, const uint16_t b) {
     uint16_t r = 0;
     const uint8_t b0 = b & 0xff;
     const uint8_t b1 = b >>8;
     const uint8_t a0 = a & 0xff;

     inverseIterateWithMask<0x80>(b0,b1,a,r);
     inverseIterateWithMask<0x40>(b0,b1,a,r);
     inverseIterateWithMask<0x20>(b0,b1,a,r);
     inverseIterateWithMask<0x10>(b0,b1,a,r);
     inverseIterateWithMask<0x08>(b0,b1,a,r);
     inverseIterateWithMask<0x04>(b0,b1,a,r);
     inverseIterateWithMask<0x02>(b0,b1,a,r);

     //Last iteration:
     if (b0 & 0x01)
         r += a;
     if (b1 & 0x01)
         r+=(uint16_t)256*a0;

     return r;
 }
17
qo'shib qo'ydi
@Flexo Men so'nggi qismni qo'shib oldingi tahrirlarni biroz o'zgartirdim. Keyinchalik yaxshilanish uchun joyni ko'rsangiz, menga xabar bering.
qo'shib qo'ydi muallif Antonio, manba
@SergioFormiggini Menimcha, bu yerda topilgan narsalar - juda yaxshi. Mening javobim ushbu versiyasi ga qarang: Siz taklifingizdan nusxa ko'chirishga ega bo'lganman. Keyin esa - versiyasini olib tashladim. Shu bois, bu erda nimalar samarali ekanligiga ishonaman. Men javobimni ajratish haqida o'ylardim, lekin uni indeks bilan saqlashni afzal ko'raman, aks holda u faqatgina yig'ishni istayman (= to'plash) obro'siga o'xshash bo'ladi, men izlayotgan narsa esa mukammal va to'liq javobdir :).
qo'shib qo'ydi muallif Antonio, manba
@SergioFormiggini Yuqori darajali kod (yoki pastroq bo'lsa-da, pastroq bo'lsa-da, pastroq bo'lsa-da), yoki tezkor kodni (bu holda loopni ochish usullaridan biridir) aniqlay olasizmi?
qo'shib qo'ydi muallif Antonio, manba
qo'shib qo'ydi muallif Antonio, manba
@greybeard Bundan tashqari sbrc dan oldin yana lsr kerak. Izoh: bcc bu holda brcc .
qo'shib qo'ydi muallif Antonio, manba
Javobingizni "tahrirlash uchun sarlavha" va "tahrirlash" ni yozmasdan yoza olasizmi? O'qishni juda yaxshi biladi, uni birovni noldan o'qish uchun ishlaydigan va boshqalarni tahrir qilish tarixini/sharhini kerakli tarzda sharhlab berishga imkon beradigan hikoya bilan yozishni juda yaxshi biladi. (Ehtimol, matnli kitobdagi bobni tayyorlash kabi).
qo'shib qo'ydi muallif Flexo, manba
@Antonio kodi bilan emas, balki-emas. Tahrir qilishga harakat qildim, lekin o'zgartirish juda kichik! :)
qo'shib qo'ydi muallif Sir Jo Black, manba
@Antonio, sizning javobingiz ko'proq javob bo'lishi mumkin deb o'ylayman, shuning uchun o'qiganlarning barchasi turli xil algoritmlarni va aniq javobda tegishli sharhlarni ko'rishlari mumkin!
qo'shib qo'ydi muallif Sir Jo Black, manba
Ha, men tezkor kodni talab qilaman. Kompilyatsiya qilmoqchi bo'lgan kodni kompilyatsiya qilish kerak - chunki AVR xotirasi faqat 8 Kb Flash. Kodni mening muhitimda tuzdim. Endi men urinib ko'rmoqchi edim-ku, lekin menimcha, bu farq minimal bo'ladi! Men faqat ushbu kodni va "ISR" ni yozmoqchi deb o'ylayman.
qo'shib qo'ydi muallif Sir Jo Black, manba
@Antonio, mening muammom ko'payishning tezkor usuliga ega bo'lish edi, balki kichik kodni ishlab chiqarishdir (men sizning savolingizga javob sifatida o'qib chiqsangiz)! Men aytganimdek, uni tarqatadigan ko'paytirgichlarni hal qildim, lekin ko'paytirish algoritmi haqidagi munozarani to'xtatib qo'ydim va keyin uni sinovdan o'tkazaman!
qo'shib qo'ydi muallif Sir Jo Black, manba
lsr yashirinligini saqlash nimani anglatadi adc sbrc qo'shing lsl roli ?
qo'shib qo'ydi muallif greybeard, manba

Derleyici sizning akkumulyatoringiz uchun 'a' ni qo'shib qo'yishni tanlash uchun uchinchi operatordan foydalanib, sinov kodi va filialning narxiga va kompilyatoringiz kodini qanday hosil qilishiga bog'liq holda qisqa kod ishlab chiqishi mumkin.

Loop sonini kamaytirish uchun argumentlarni almashtiring.

uint16_t umul16_(uint16_t op1, uint16_t op2)
{
    uint16_t accum=0;
    uint16_t a, b;
    a=op1; b=op2;
    if( op1>=1;
        a+=a;
    }

    return accum;
}

Ko'p yillar oldin, men "oldinga" deb yozdim, bu filialni emas, hisob-kitoblarni ilgari surdi,

Sinovni butunlay olib tashlash uchun derazadan foydalanishingiz mumkin, bu sizning kompilyatoringiz ofsetdan yuk sifatida hosil qilish uchun foydalanishi mumkin. 0 va a qiymatlarini o'z ichiga olgan qatorni belgilang va pastki oxiridagi qiymatni yangilang,

uint16_t umul16_(uint16_t op1, uint16_t op2)
{
    uint16_t accum=0;
    uint16_t pick[2];
    uint16_t a, b;
    a=op1; b=op2;
    if( op1>=1;
        pick[1] += pick[1]; //(a+=a);
    }

    return accum;
}

Ha, yomon . Lekin odatdagidek kodni shu kabi yozmayman.

Tartibga solish - op1 yoki op2 kichkinagidan pastroqqa almashtirishni (kamroq o'tishni) qo'shish uchun yangilangan. Bu test uchun foydadan voz kechish = 0.

4
qo'shib qo'ydi
@SergioFormiggini Siz haqiqiy tizimda benchmark kerak. Siz test qilayotgan tizim butunlay boshqacha ko'rsatmalar to'plamidan foydalanmoqda.
qo'shib qo'ydi muallif Antonio, manba
Viktorinani nolga tenglashtiruvchi test bilan bog'liq: agar argumentlarni takliflarimga muvofiq o'zgartirsangiz, iteratsiya sharti birinchi tekshiruvda muvaffaqiyatsizlikka uchraydi, shuning uchun deyarli hech qanday foyda yo'q, bu funksiya chaqirilganda har safar amalga oshiriladigan tekshiruvni qo'shasiz.
qo'shib qo'ydi muallif Antonio, manba
Intersting! ;) Men sinov qilishim kerak ...;)
qo'shib qo'ydi muallif Sir Jo Black, manba
Men uni yozdim ... Uchinchi operator bilan biroz murakkab, vektor bilan katta prologiya va epilogga ta'sir qiladi! Lekin kod juda yoqimli! :)
qo'shib qo'ydi muallif Sir Jo Black, manba
Xulosa qilib aytadigan bo'lsak, g'alati, ammo undan foydalanadigan funktsiyani yaxshiroq tuzilgan ko'rinadi!
qo'shib qo'ydi muallif Sir Jo Black, manba
Vektor bilan bu derleyici vektorning tarkibini to'g'ridan-to'g'ri manipulyatsiya qilish uchun registrlardan foydalana olmaslik muammosiga ega.
qo'shib qo'ydi muallif Sir Jo Black, manba
Sizga yozib olingan kod haqida yozdim, men samarali tezlikni sinab ko'rmadim. Endi men yana Intel I3 bilan harakat qilishim mumkin, lekin men AVR bilan harakat qilishim kerak ...;)
qo'shib qo'ydi muallif Sir Jo Black, manba
Yuqorida aytib o'tganimdek, 16-bit operand ishlatilganda avr-kompilyatorni ko'raman, odatda chap tomondan foydalanganda qo'shilasiz! (Men nima bilmayman, chunki ular bir xil sonli döngüleri foydalanadi: 1)
qo'shib qo'ydi muallif Sir Jo Black, manba
Ba'zi mezonlarni bajarishim kerak ... lekin bugungi kun uchun vaqt yo'q ... :)
qo'shib qo'ydi muallif Sir Jo Black, manba
Vektorli yechimingiz (LUTdan tashqari, AVR MCUs me'morchiligi uchun yaxshi emas, deb o'ylayman, lekin men sinovdan o'taman) @Antonio taklifiga qo'shilgan tezroq biroz tezroq! Funktsiyalarni Intel I3 va 10 bittadan 16 bitli sonli juftliklarning ko'paytirilishi loopi bilan sinab ko'rdim.
qo'shib qo'ydi muallif Sir Jo Black, manba
Men kesh tufayli biror narsa mening testni buzganini tushunaman! Ushbu savolga boshlovchi muntazam bir e b almashtirishning foydasi oz bo'lsa-da, vektorlashtirilgan versiya almashtirishsiz ishlaydi! (Men Intel I3da sinab ko'rmoqdaman, men AVRda ko'rishim kerak)
qo'shib qo'ydi muallif Sir Jo Black, manba
Men Antonionni bilaman! Men umumiy fikrga ega bo'lishni sinab ko'rdim! I3 me'morchiligini bilmayman! Men AVR bilan test qilishim kerak!
qo'shib qo'ydi muallif Sir Jo Black, manba
Bilaman, men har bir chaqiriq uchun testlarni emas, balki katta sinovni tayyorlayman ... Muammo maqsad emas, balki biror narsaning haqiqiyligi! ... AVRlarning keshlari yo'q, ichki optimallashtirish, filialni prognoz qilish va h.k. yo'qligini bilaman. Ular sekin, lekin uni ishlatish MCU xatti-harakatlarini buzish orqali bilishimiz mumkin!
qo'shib qo'ydi muallif Sir Jo Black, manba
Sizning CPU buyumni [] elementlarini keshlashi mumkin va keshda kirishni ro'yxatdan o'tkazish uchun farqni hisobga olgan holda, filialning narxiga nisbatan, siz yaxshilanishlarni topishingiz mumkin. Lekin bu turli .
qo'shib qo'ydi muallif ChuckCottrill, manba
Va tanlov [1] + = [1] ni tanlang; tezroq yozish mumkin [1] << = 1;
qo'shib qo'ydi muallif ChuckCottrill, manba
Yaxshi echim uchun @Antonio eritmasini va ma'danni birlashtir.
qo'shib qo'ydi muallif ChuckCottrill, manba

Derleyici sizning akkumulyatoringiz uchun 'a' ni qo'shib qo'yishni tanlash uchun uchinchi operatordan foydalanib, sinov kodi va filialning narxiga va kompilyatoringiz kodini qanday hosil qilishiga bog'liq holda qisqa kod ishlab chiqishi mumkin.

Loop sonini kamaytirish uchun argumentlarni almashtiring.

uint16_t umul16_(uint16_t op1, uint16_t op2)
{
    uint16_t accum=0;
    uint16_t a, b;
    a=op1; b=op2;
    if( op1>=1;
        a+=a;
    }

    return accum;
}

Ko'p yillar oldin, men "oldinga" deb yozdim, bu filialni emas, hisob-kitoblarni ilgari surdi,

Sinovni butunlay olib tashlash uchun derazadan foydalanishingiz mumkin, bu sizning kompilyatoringiz ofsetdan yuk sifatida hosil qilish uchun foydalanishi mumkin. 0 va a qiymatlarini o'z ichiga olgan qatorni belgilang va pastki oxiridagi qiymatni yangilang,

uint16_t umul16_(uint16_t op1, uint16_t op2)
{
    uint16_t accum=0;
    uint16_t pick[2];
    uint16_t a, b;
    a=op1; b=op2;
    if( op1>=1;
        pick[1] += pick[1]; //(a+=a);
    }

    return accum;
}

Ha, yomon . Lekin odatdagidek kodni shu kabi yozmayman.

Tartibga solish - op1 yoki op2 kichkinagidan pastroqqa almashtirishni (kamroq o'tishni) qo'shish uchun yangilangan. Bu test uchun foydadan voz kechish = 0.

4
qo'shib qo'ydi
@SergioFormiggini Siz haqiqiy tizimda benchmark kerak. Siz test qilayotgan tizim butunlay boshqacha ko'rsatmalar to'plamidan foydalanmoqda.
qo'shib qo'ydi muallif Antonio, manba
Viktorinani nolga tenglashtiruvchi test bilan bog'liq: agar argumentlarni takliflarimga muvofiq o'zgartirsangiz, iteratsiya sharti birinchi tekshiruvda muvaffaqiyatsizlikka uchraydi, shuning uchun deyarli hech qanday foyda yo'q, bu funksiya chaqirilganda har safar amalga oshiriladigan tekshiruvni qo'shasiz.
qo'shib qo'ydi muallif Antonio, manba
Intersting! ;) Men sinov qilishim kerak ...;)
qo'shib qo'ydi muallif Sir Jo Black, manba
Men uni yozdim ... Uchinchi operator bilan biroz murakkab, vektor bilan katta prologiya va epilogga ta'sir qiladi! Lekin kod juda yoqimli! :)
qo'shib qo'ydi muallif Sir Jo Black, manba
Xulosa qilib aytadigan bo'lsak, g'alati, ammo undan foydalanadigan funktsiyani yaxshiroq tuzilgan ko'rinadi!
qo'shib qo'ydi muallif Sir Jo Black, manba
Vektor bilan bu derleyici vektorning tarkibini to'g'ridan-to'g'ri manipulyatsiya qilish uchun registrlardan foydalana olmaslik muammosiga ega.
qo'shib qo'ydi muallif Sir Jo Black, manba
Sizga yozib olingan kod haqida yozdim, men samarali tezlikni sinab ko'rmadim. Endi men yana Intel I3 bilan harakat qilishim mumkin, lekin men AVR bilan harakat qilishim kerak ...;)
qo'shib qo'ydi muallif Sir Jo Black, manba
Yuqorida aytib o'tganimdek, 16-bit operand ishlatilganda avr-kompilyatorni ko'raman, odatda chap tomondan foydalanganda qo'shilasiz! (Men nima bilmayman, chunki ular bir xil sonli döngüleri foydalanadi: 1)
qo'shib qo'ydi muallif Sir Jo Black, manba
Ba'zi mezonlarni bajarishim kerak ... lekin bugungi kun uchun vaqt yo'q ... :)
qo'shib qo'ydi muallif Sir Jo Black, manba
Vektorli yechimingiz (LUTdan tashqari, AVR MCUs me'morchiligi uchun yaxshi emas, deb o'ylayman, lekin men sinovdan o'taman) @Antonio taklifiga qo'shilgan tezroq biroz tezroq! Funktsiyalarni Intel I3 va 10 bittadan 16 bitli sonli juftliklarning ko'paytirilishi loopi bilan sinab ko'rdim.
qo'shib qo'ydi muallif Sir Jo Black, manba
Men kesh tufayli biror narsa mening testni buzganini tushunaman! Ushbu savolga boshlovchi muntazam bir e b almashtirishning foydasi oz bo'lsa-da, vektorlashtirilgan versiya almashtirishsiz ishlaydi! (Men Intel I3da sinab ko'rmoqdaman, men AVRda ko'rishim kerak)
qo'shib qo'ydi muallif Sir Jo Black, manba
Men Antonionni bilaman! Men umumiy fikrga ega bo'lishni sinab ko'rdim! I3 me'morchiligini bilmayman! Men AVR bilan test qilishim kerak!
qo'shib qo'ydi muallif Sir Jo Black, manba
Bilaman, men har bir chaqiriq uchun testlarni emas, balki katta sinovni tayyorlayman ... Muammo maqsad emas, balki biror narsaning haqiqiyligi! ... AVRlarning keshlari yo'q, ichki optimallashtirish, filialni prognoz qilish va h.k. yo'qligini bilaman. Ular sekin, lekin uni ishlatish MCU xatti-harakatlarini buzish orqali bilishimiz mumkin!
qo'shib qo'ydi muallif Sir Jo Black, manba
Sizning CPU buyumni [] elementlarini keshlashi mumkin va keshda kirishni ro'yxatdan o'tkazish uchun farqni hisobga olgan holda, filialning narxiga nisbatan, siz yaxshilanishlarni topishingiz mumkin. Lekin bu turli .
qo'shib qo'ydi muallif ChuckCottrill, manba
Va tanlov [1] + = [1] ni tanlang; tezroq yozish mumkin [1] << = 1;
qo'shib qo'ydi muallif ChuckCottrill, manba
Yaxshi echim uchun @Antonio eritmasini va ma'danni birlashtir.
qo'shib qo'ydi muallif ChuckCottrill, manba

Xo'sh, LUT va shiftning aralashmasi odatda ishlaydi

Chiziq bo'ylab bir narsa, 8 bitli narsalarni ko'paytirish. Ularni ikkita to'rtdan iborat deb hisoblang

uint4_t u1, l1, u2, l2;
uint8_t a = 16*u1 + l1;
uint8_t b = 16*u2 + l2;

product = 256*u1*u2 + 16*u1*l2 + 16*u2*l1 + l1*l1;

inline uint4_t hi( uint8_t v ) { return v >> 4; }
inline uint4_t lo( uint8_t v ) { return v & 15; }

inline uint8_t LUT( uint4_t x, uint4_t y ) {
    static uint8_t lut[256] = ...;
    return lut[x | y << 4]
}

uint16_t multiply(uint8_t a, uint8_t b) {
    return (uint16_t)LUT(hi(a), hi(b)) << 8 +
           ((uint16_t)LUT(hi(a), lo(b)) + (uint16_t)LUT(lo(a), hi(b)) << 4 +
           (uint16_t)LUT(lo(a), lo(b));
}

faqatgina lut [] ni ko'paytirish natijalari bilan to'ldiring. Sizning holatlaringizga ko'ra siz to'rtburchak (256 o'lchamli LUT) yoki bayt (65536 o'lchamli LUT) yoki biror narsa

3
qo'shib qo'ydi
@SergioFormiggini Yaxshi, fleshdan 256bytes, ehtimol, to'rtburchaklarga ko'ra 16bitni ko'paytirish, uzoqroq ifodalash, bir xil fikrni ishlatish mumkin
qo'shib qo'ydi muallif Severin Pappadeux, manba
AVRlarning bir nechta RAM bor (Tiny 85 faqat 512byte), lekin 8KByte flesh va 512 byte EEPROMga ega! ... Yana xotirasi bor AVR ham bor!
qo'shib qo'ydi muallif Sir Jo Black, manba
Men sinashga harakat qilaman! Muammo favqulodda emas va tez emas, chunki Garvard me'morchiligi AVRlardan foydalanadi.
qo'shib qo'ydi muallif Sir Jo Black, manba
qisman mahsulotni qidirish (4 × 4) yordamida 16 × 16 → 16 ga qarang va/yoki uni yaxshilab ko'ring ( wiki post ).
qo'shib qo'ydi muallif greybeard, manba

Bir yondashuv loopni ochishdir. Siz foydalanadigan platforma uchun derivatim yo'q, shuning uchun men ishlab chiqarilgan kodni ko'rib chiqa olmayman, lekin bunday yondashuv yordam berishi mumkin.

Ushbu kodning ishlashi ma'lumotlarga qaraganda kamroq bo'ladi - siz eng yaxshi holatda ekanligingizni tekshirmaguningizcha eng yomon holatda tezroq ketasiz. Kodning kattaligi biroz kattaroq, biroq qo'ng'iroqlar jadvali kattaligi emas.

(Izoh kodi tekshirilmagan, boshimdan o'chirildi, men ishlab chiqarilgan kod nimaga o'xshashligini bilib qolaman!)

#define UMUL16_STEP(a, b, shift) \
    if ((b) & (1U << (shift))) result += ((a) << (shift)));

uint16_t umul16(uint16_t a, uint16_t b)
{
    uint16_t result = 0;

    UMUL16_STEP(a, b, 0);
    UMUL16_STEP(a, b, 1);
    UMUL16_STEP(a, b, 2);
    UMUL16_STEP(a, b, 3);
    UMUL16_STEP(a, b, 4);
    UMUL16_STEP(a, b, 5);
    UMUL16_STEP(a, b, 6);
    UMUL16_STEP(a, b, 7);
    UMUL16_STEP(a, b, 8);
    UMUL16_STEP(a, b, 9);
    UMUL16_STEP(a, b, 10);
    UMUL16_STEP(a, b, 11);
    UMUL16_STEP(a, b, 12);
    UMUL16_STEP(a, b, 13);
    UMUL16_STEP(a, b, 14);
    UMUL16_STEP(a, b, 15);

    return result;
}

Yangilash:

Sizning derivoringiz nima qilayotganiga qarab, UMUL16_STEP makrosini o'zgartirishi mumkin. Shu bilan bir qatorda:

#define UMUL16_STEP(a, b, shift) \
    if ((b) & (1U << (shift))) result += (a); (a) << 1;

Ushbu yondashuv yordamida derleyici filiallarning oldini olish uchun sbrc ko'rsatmasini ishlatishi mumkin.

R1: r1 natijasi, r2: r3 a va r4: r5 b bo'lsa, unda assembler bir bitga qarash kerak deb o'ylayman:

sbrc r4, 0
add r0, r2
sbrc r4, 0
addc r1, r3
lsl r2
rol r3

Bu filialsiz doimo bajarilishi kerak. r4 da bitlarni sinab ko'ring va undan yuqori sakkiz bit uchun r5 da bitlarni sinab ko'ring. Ushbu buyruqlar ko'rsatmalarini o'qib chiqishga asoslanib 96 tsiklda ko'paytirishni bajarish kerak.

2
qo'shib qo'ydi
@janm Men hozir 64 tsikldan tushaman deb o'ylayman
qo'shib qo'ydi muallif Antonio, manba
Men sinovdan o'taman. Ammo menimcha: agar ((b) & (1U << (shift))) natijasi + = ((a) << (1U << (shift))); Agar shunday bo'lsa: ((b) & (1U << (shift))) natijasi + = (a shift). Ushbu kod jmp ko'pligini o'z ichiga oladi, biroq ba'zi o'zgarishlar bilan belgilangan vaqtni berishi mumkin.
qo'shib qo'ydi muallif Sir Jo Black, manba
Men otishma qilmaydigan echimni topdim va bu juda murakkab bo'lgan vaqtni belgilab beradi. O'zingizning echimingizni qo'shsangiz va maqsadingizni (AVR) assembleringizni tahrir qilsam bo'ladimi? UMUL16_STEP (a, b, shift) natijasi + = ((0 -! (! ((B & (1U << (shift))))) & (a < <(shift)));
qo'shib qo'ydi muallif Sir Jo Black, manba
Ha, mening yechimim ham sakrab chiqadi! CPU "C" deb ishlaydigan tuzilmaga ega emas! " o'zini tutadi!
qo'shib qo'ydi muallif Sir Jo Black, manba
Bugungi kunda men NRZI protokoli bilan ishlamaydigan muammoni hal qilishim kerak, lekin men kompyuterimda sinovdan o'tganman (bilaman, bu tarjimai hol emas). Kompyuteringizda sizning tezroq echimlaringizdan biri tezroq (kompyuterda) @ChuckCottrill ning vektorli echimi bo'lib qoladi. AVR bilan test qilsam, barchani AVR bilan sinab ko'rmoqchiman!
qo'shib qo'ydi muallif Sir Jo Black, manba
Kattalashtirish bilan bog'liq haqiqiy muammom - bu juda qattiq, vaqtida, 1/16 usec buyruqlar aylanishiga ega bo'lgan 2 ta max 3 usec dan foydalanishim mumkin, shuning uchun men o'zgacha qadriyatlardan foydalanishim kerak. Lekin ma'lum vaqt ichida bajariladigan funktsiyalarni bajarish uchun nuqta qo'yadi! :)
qo'shib qo'ydi muallif Sir Jo Black, manba
@janm! Sizning yordamingiz uchun tashakkur! Ha, buni tushunaman! Haqiqatdan ham men buni allaqachon bilib oldim. :) Lekin men bu savolni yaxshi dalil deb bilaman! O'ylaymanki, bizni o'qigan odamlar uchun bu foydali emas!
qo'shib qo'ydi muallif Sir Jo Black, manba
Hi @janm, men bu yerda ATmega168 yordamida algoritmni sinab ko'rmoqdaman va bir oz ajablantiradigan narsa bilan buni ko'rdim, althoug otpimization -O'q, siz unrolled algoritm MUL ko'rsatmasidan foydalanish bilan tuzilgan, n + + ((0 -! (! ((B & (1U << (shift))))) va (a (shift)) yordamida natija berildi;
qo'shib qo'ydi muallif Sir Jo Black, manba
Ha, xatoni tuzatdim! Bundan tashqari, kompilyator bir bit sinovdan foydalanishi va aniq filiallardan qochish uchun ko'rsatmalarni o'tkazib yuborishi mumkin bo'lgan alternativ bilan yangilanadi.
qo'shib qo'ydi muallif janm, manba
Sintaksischi nima qilayotganini ko'ring - agar siz filialga aylantirilsa, C kodini bekor qilsangiz, juda uzoqqa ketmoqdasiz. Bu sizning kompilyatoringiz qanchalik yaxshi ekanligiga juda bog'liq. Buyruqni ko'rib chiqsangiz, oldini olish kerak bo'lgan narsa, ehtimol a << shift va o'rniga a ni har bir qadamda qo'yib qo'yadi. Qarashlarimni oxiriga etkazish kerak deb o'ylayman. Derleyicinin, mening tartibga qo'shgan UMULT16_STEP so'lining ikkinchi versiyasi uchun bu assembler qanday chiqarishi mumkinligini istardim.
qo'shib qo'ydi muallif janm, manba
To'liq - menimcha, montajni eng yaxshi aks ettiradigan C - makroimning ikkinchi versiyasi (a) << 1 ifodasi. Ushbu makro xaritalardagi hamma narsa mashina ko'rsatmalariga o'tadi va filialsiz amalga oshirilishi mumkin. Savol shuki, derivat bir filial tashkil qiladimi yoki assemblerimda bo'lgani kabi ikkita sbrc ko'rsatmalarini ishlab chiqaradimi.
qo'shib qo'ydi muallif janm, manba
3 msec @ 1/16 msec sekningiz vaqti sizga 48 tsiklni beradi. Buning uchun siz 8x16 -> 16 bitni qanday olishingiz mumkinligini ko'ra olaman. O'ylaymanki, eng yaxshi 16 x 16 -> 16 uchun 96 tsikl (oxirgi lsl r2; rol r3 ) olib tashlanadi, chunki ular ishlatilmayapti, lekin natijani boshlash uchun ikkita xors kerak. ) Doimiy ravishda ko'payib ketish narsalarni o'zgartirishi mumkin, chunki siz testlarni olib tashlashingiz mumkin.
qo'shib qo'ydi muallif janm, manba
O'zaro kelishuv sifatida, operandlardan biri nol yuqori baytda ekanligini tekshirishingiz mumkin. Agar ha bo'lsa, u ishni bit-wise (qolgan sakkizta) uchun tekshiring. (Aks holda, mahsulot to'lib ketishi mumkin).
qo'shib qo'ydi muallif greybeard, manba

A non-answer, tinyARM assembler (web doc) instead of C++ or C. I modified a pretty generic multiply-by-squares-lookup for speed (< 50 cycles excluding call&return overhead) at the cost of only fitting into AVRs with no less than 1KByte of RAM, using 512 aligned bytes for a table of the lower half of squares. At 20 MHz, that would nicely meet the 2 max 3 usec time limit still not showing up in the question proper - but Sergio Formiggini wanted 16 MHz. As of 2015/04, there is just one ATtiny from Atmel with that much RAM, and that is specified up to 8 MHz … (Rolling your "own" (e.g., from OpenCores) your FPGA probably has a bunch of fast multipliers (18×18 bits seems popular), if not processor cores.)
For a stab at fast shift-and-add, have a look at shift and add, factor shifting left, unrolled 16×16→16 and/or improve on it (wiki post). (You might well create that community wiki answer begged for in the question.)

.def    a0  = r16   ; factor low byte
.def    a1  = r17
#warning two warnings about preceding definitions of
#warning  r16 and r17 are due and may as well be ignored
.def    a   = r16   ; 8-bit factor
.def    b   = r17   ; 8-bit factor ; or r18, rather?
.def    b0  = r18   ; factor low byte
.def    b1  = r19
.def    p0  = r20   ; product low byte
.def    p1  = r21

; "squares table" SqTab shall be two 512 Byte tables of
;  squares of 9-bit natural numbers, divided by 4

; Idea: exploit p = a * b = Squares[a+b] - Squares[a-b]

init:
    ldi     r16, 0x73
    ldi     r17, 0xab
    ldi     r18, 23
    ldi     r19, 1
    ldi     r20, HIGH(SRAM_SIZE)
    cpi     r20, 2
    brsh    fillSqTable ; ATtiny 1634?
    rjmp    mpy16T16
fillSqTable:
    ldi     r20, SqTabH
    subi    r20, -2
    ldi     zh, SqTabH
    clr     zl
; generate sqares by adding up odd numbers starting at 1 += -1
    ldi     r22, 1
    clr     r23
    ser     r26
    ser     r27
fillLoop:
    add     r22, r26
    adc     r23, r27
    adiw    r26, 2
    mov     r21, r23
    lsr     r21         ; get bits 9:2
    mov     r21, r22
    ror     r21
    lsr     r21
    bst     r23, 1
    bld     r21, 7
    st      z+, r21
    cp      zh, r20
    brne    fillLoop
    rjmp    mpy16F16

; assembly lines are marked up with cycle count
;  and (latest) start cycle in block.
; If first line in code block, the (latest) block start cycle
;  follows; else if last line, the (max) block cycle total

;**************************************************************
;*
;* "mpy16F16" - 16x16->16 Bit Unsigned Multiplication
;*                        using table lookup
;* Sergio Formiggini special edition
;* Multiplies  two 16-bit register values a1:a0 and b1:b0.
;* The result is placed in p1:p0.
;*
;* Number of flash words: 318 + return = 
;*                       (40 + 256(flash table) + 22(RAM init))
;* Number of cycles     : 49 + return
;* Low  registers used  : None
;* High registers used  : 7+2 (a1:a0, b1:b0, p1:p0, sq;
;*                             + Z(r31:r30))
;* RAM bytes used       : 512 (squares table)
;*
;**************************************************************
mpy16F16:
    ldi     ZH, SqTabH>>1;1 0   0   squares table>>1
    mov     ZL, a0      ; 1 1
    add     ZL, b0      ; 1 2       a0+b0
    rol     ZH          ; 1 3       9 bit offset
    ld      p0, Z       ; 2 4       a0+b0l          1
    lpm     p1, Z       ; 3 6   9   a0+b0h          2

    ldi     ZH, SqTabH  ; 1 0   9   squares table

    mov     ZL, a1      ; 1 0   10
    sub     ZL, b0      ; 1 1       a1-b0
    brcc    noNegF10    ; 1 2
    neg     ZL          ; 1 3
noNegF10:
    ld      sq, Z       ; 2 4       a1-b0l          3
    sub     p1, sq      ; 1 6   7

    mov     ZL, a0      ; 1 0   17
    sub     ZL, b1      ; 1 1       a0-b1
    brcc    noNegF01    ; 1 2
    neg     ZL          ; 1 3
noNegF01:
    ld      sq, Z       ; 2 4       a0-b1l          4
    sub     p1, sq      ; 1 6   7

    mov     ZL, a0      ; 1 0   24
    sub     ZL, b0      ; 1 1       a0-b0
    brcc    noNegF00    ; 1 2
    neg     ZL          ; 1 3
noNegF00:
    ld      sq, Z       ; 2 4       a0-b0l          5
    sub     p0, sq      ; 1 6
    lpm     sq, Z       ; 3 7       a0-b0h          6*
    sbc     p1, sq      ; 1 10  11

    ldi     ZH, SqTabH>>1;1 0   35
    mov     ZL, a1      ; 1 1
    add     ZL, b0      ; 1 2       a1+b0
    rol     ZH          ; 1 3
    ld      sq, Z       ; 2 4       a1+b0l          7
    add     p1, sq      ; 1 6   7

    ldi     ZH, SqTabH>>1;1 0   42
    mov     ZL, a0      ; 1 1
    add     ZL, b1      ; 1 2       a0+b1
    rol     ZH          ; 1 3
    ld      sq, Z       ; 2 4       a0+b1l          8
    add     p1, sq      ; 1 6   7

    ret                 ;       49

.CSEG
.org 256; words?!
SqTableH:
.db   0,   0,   0,   0,   0,   0,   0,   0,   0,   0
.db   0,   0,   0,   0,   0,   0,   0,   0,   0,   0
.db   0,   0,   0,   0,   0,   0,   0,   0,   0,   0
.db   0,   0,   1,   1,   1,   1,   1,   1,   1,   1
.db   1,   1,   1,   1,   1,   1,   2,   2,   2,   2
.db   2,   2,   2,   2,   2,   2,   3,   3,   3,   3
.db   3,   3,   3,   3,   4,   4,   4,   4,   4,   4
.db   4,   4,   5,   5,   5,   5,   5,   5,   5,   6
.db   6,   6,   6,   6,   6,   7,   7,   7,   7,   7
.db   7,   8,   8,   8,   8,   8,   9,   9,   9,   9
.db   9,   9,  10,  10,  10,  10,  10,  11,  11,  11
.db  11,  12,  12,  12,  12,  12,  13,  13,  13,  13
.db  14,  14,  14,  14,  15,  15,  15,  15,  16,  16
.db  16,  16,  17,  17,  17,  17,  18,  18,  18,  18
.db  19,  19,  19,  19,  20,  20,  20,  21,  21,  21
.db  21,  22,  22,  22,  23,  23,  23,  24,  24,  24
.db  25,  25,  25,  25,  26,  26,  26,  27,  27,  27
.db  28,  28,  28,  29,  29,  29,  30,  30,  30,  31
.db  31,  31,  32,  32,  33,  33,  33,  34,  34,  34
.db  35,  35,  36,  36,  36,  37,  37,  37,  38,  38
.db  39,  39,  39,  40,  40,  41,  41,  41,  42,  42
.db  43,  43,  43,  44,  44,  45,  45,  45,  46,  46
.db  47,  47,  48,  48,  49,  49,  49,  50,  50,  51
.db  51,  52,  52,  53,  53,  53,  54,  54,  55,  55
.db  56,  56,  57,  57,  58,  58,  59,  59,  60,  60
.db  61,  61,  62,  62,  63,  63,  64,  64,  65,  65
.db  66,  66,  67,  67,  68,  68,  69,  69,  70,  70
.db  71,  71,  72,  72,  73,  73,  74,  74,  75,  76
.db  76,  77,  77,  78,  78,  79,  79,  80,  81,  81
.db  82,  82,  83,  83,  84,  84,  85,  86,  86,  87
.db  87,  88,  89,  89,  90,  90,  91,  92,  92,  93
.db  93,  94,  95,  95,  96,  96,  97,  98,  98,  99
.db 100, 100, 101, 101, 102, 103, 103, 104, 105, 105
.db 106, 106, 107, 108, 108, 109, 110, 110, 111, 112
.db 112, 113, 114, 114, 115, 116, 116, 117, 118, 118
.db 119, 120, 121, 121, 122, 123, 123, 124, 125, 125
.db 126, 127, 127, 128, 129, 130, 130, 131, 132, 132
.db 133, 134, 135, 135, 136, 137, 138, 138, 139, 140
.db 141, 141, 142, 143, 144, 144, 145, 146, 147, 147
.db 148, 149, 150, 150, 151, 152, 153, 153, 154, 155
.db 156, 157, 157, 158, 159, 160, 160, 161, 162, 163
.db 164, 164, 165, 166, 167, 168, 169, 169, 170, 171
.db 172, 173, 173, 174, 175, 176, 177, 178, 178, 179
.db 180, 181, 182, 183, 183, 184, 185, 186, 187, 188
.db 189, 189, 190, 191, 192, 193, 194, 195, 196, 196
.db 197, 198, 199, 200, 201, 202, 203, 203, 204, 205
.db 206, 207, 208, 209, 210, 211, 212, 212, 213, 214
.db 215, 216, 217, 218, 219, 220, 221, 222, 223, 224
.db 225, 225, 226, 227, 228, 229, 230, 231, 232, 233
.db 234, 235, 236, 237, 238, 239, 240, 241, 242, 243
.db 244, 245, 246, 247, 248, 249, 250, 251, 252, 253
.db 254, 255
; word addresses, again?!
.equ SqTabH = (high(SqTableH) << 1)

.DSEG
RAMTab .BYTE 512
1
qo'shib qo'ydi

A non-answer, tinyARM assembler (web doc) instead of C++ or C. I modified a pretty generic multiply-by-squares-lookup for speed (< 50 cycles excluding call&return overhead) at the cost of only fitting into AVRs with no less than 1KByte of RAM, using 512 aligned bytes for a table of the lower half of squares. At 20 MHz, that would nicely meet the 2 max 3 usec time limit still not showing up in the question proper - but Sergio Formiggini wanted 16 MHz. As of 2015/04, there is just one ATtiny from Atmel with that much RAM, and that is specified up to 8 MHz … (Rolling your "own" (e.g., from OpenCores) your FPGA probably has a bunch of fast multipliers (18×18 bits seems popular), if not processor cores.)
For a stab at fast shift-and-add, have a look at shift and add, factor shifting left, unrolled 16×16→16 and/or improve on it (wiki post). (You might well create that community wiki answer begged for in the question.)

.def    a0  = r16   ; factor low byte
.def    a1  = r17
#warning two warnings about preceding definitions of
#warning  r16 and r17 are due and may as well be ignored
.def    a   = r16   ; 8-bit factor
.def    b   = r17   ; 8-bit factor ; or r18, rather?
.def    b0  = r18   ; factor low byte
.def    b1  = r19
.def    p0  = r20   ; product low byte
.def    p1  = r21

; "squares table" SqTab shall be two 512 Byte tables of
;  squares of 9-bit natural numbers, divided by 4

; Idea: exploit p = a * b = Squares[a+b] - Squares[a-b]

init:
    ldi     r16, 0x73
    ldi     r17, 0xab
    ldi     r18, 23
    ldi     r19, 1
    ldi     r20, HIGH(SRAM_SIZE)
    cpi     r20, 2
    brsh    fillSqTable ; ATtiny 1634?
    rjmp    mpy16T16
fillSqTable:
    ldi     r20, SqTabH
    subi    r20, -2
    ldi     zh, SqTabH
    clr     zl
; generate sqares by adding up odd numbers starting at 1 += -1
    ldi     r22, 1
    clr     r23
    ser     r26
    ser     r27
fillLoop:
    add     r22, r26
    adc     r23, r27
    adiw    r26, 2
    mov     r21, r23
    lsr     r21         ; get bits 9:2
    mov     r21, r22
    ror     r21
    lsr     r21
    bst     r23, 1
    bld     r21, 7
    st      z+, r21
    cp      zh, r20
    brne    fillLoop
    rjmp    mpy16F16

; assembly lines are marked up with cycle count
;  and (latest) start cycle in block.
; If first line in code block, the (latest) block start cycle
;  follows; else if last line, the (max) block cycle total

;**************************************************************
;*
;* "mpy16F16" - 16x16->16 Bit Unsigned Multiplication
;*                        using table lookup
;* Sergio Formiggini special edition
;* Multiplies  two 16-bit register values a1:a0 and b1:b0.
;* The result is placed in p1:p0.
;*
;* Number of flash words: 318 + return = 
;*                       (40 + 256(flash table) + 22(RAM init))
;* Number of cycles     : 49 + return
;* Low  registers used  : None
;* High registers used  : 7+2 (a1:a0, b1:b0, p1:p0, sq;
;*                             + Z(r31:r30))
;* RAM bytes used       : 512 (squares table)
;*
;**************************************************************
mpy16F16:
    ldi     ZH, SqTabH>>1;1 0   0   squares table>>1
    mov     ZL, a0      ; 1 1
    add     ZL, b0      ; 1 2       a0+b0
    rol     ZH          ; 1 3       9 bit offset
    ld      p0, Z       ; 2 4       a0+b0l          1
    lpm     p1, Z       ; 3 6   9   a0+b0h          2

    ldi     ZH, SqTabH  ; 1 0   9   squares table

    mov     ZL, a1      ; 1 0   10
    sub     ZL, b0      ; 1 1       a1-b0
    brcc    noNegF10    ; 1 2
    neg     ZL          ; 1 3
noNegF10:
    ld      sq, Z       ; 2 4       a1-b0l          3
    sub     p1, sq      ; 1 6   7

    mov     ZL, a0      ; 1 0   17
    sub     ZL, b1      ; 1 1       a0-b1
    brcc    noNegF01    ; 1 2
    neg     ZL          ; 1 3
noNegF01:
    ld      sq, Z       ; 2 4       a0-b1l          4
    sub     p1, sq      ; 1 6   7

    mov     ZL, a0      ; 1 0   24
    sub     ZL, b0      ; 1 1       a0-b0
    brcc    noNegF00    ; 1 2
    neg     ZL          ; 1 3
noNegF00:
    ld      sq, Z       ; 2 4       a0-b0l          5
    sub     p0, sq      ; 1 6
    lpm     sq, Z       ; 3 7       a0-b0h          6*
    sbc     p1, sq      ; 1 10  11

    ldi     ZH, SqTabH>>1;1 0   35
    mov     ZL, a1      ; 1 1
    add     ZL, b0      ; 1 2       a1+b0
    rol     ZH          ; 1 3
    ld      sq, Z       ; 2 4       a1+b0l          7
    add     p1, sq      ; 1 6   7

    ldi     ZH, SqTabH>>1;1 0   42
    mov     ZL, a0      ; 1 1
    add     ZL, b1      ; 1 2       a0+b1
    rol     ZH          ; 1 3
    ld      sq, Z       ; 2 4       a0+b1l          8
    add     p1, sq      ; 1 6   7

    ret                 ;       49

.CSEG
.org 256; words?!
SqTableH:
.db   0,   0,   0,   0,   0,   0,   0,   0,   0,   0
.db   0,   0,   0,   0,   0,   0,   0,   0,   0,   0
.db   0,   0,   0,   0,   0,   0,   0,   0,   0,   0
.db   0,   0,   1,   1,   1,   1,   1,   1,   1,   1
.db   1,   1,   1,   1,   1,   1,   2,   2,   2,   2
.db   2,   2,   2,   2,   2,   2,   3,   3,   3,   3
.db   3,   3,   3,   3,   4,   4,   4,   4,   4,   4
.db   4,   4,   5,   5,   5,   5,   5,   5,   5,   6
.db   6,   6,   6,   6,   6,   7,   7,   7,   7,   7
.db   7,   8,   8,   8,   8,   8,   9,   9,   9,   9
.db   9,   9,  10,  10,  10,  10,  10,  11,  11,  11
.db  11,  12,  12,  12,  12,  12,  13,  13,  13,  13
.db  14,  14,  14,  14,  15,  15,  15,  15,  16,  16
.db  16,  16,  17,  17,  17,  17,  18,  18,  18,  18
.db  19,  19,  19,  19,  20,  20,  20,  21,  21,  21
.db  21,  22,  22,  22,  23,  23,  23,  24,  24,  24
.db  25,  25,  25,  25,  26,  26,  26,  27,  27,  27
.db  28,  28,  28,  29,  29,  29,  30,  30,  30,  31
.db  31,  31,  32,  32,  33,  33,  33,  34,  34,  34
.db  35,  35,  36,  36,  36,  37,  37,  37,  38,  38
.db  39,  39,  39,  40,  40,  41,  41,  41,  42,  42
.db  43,  43,  43,  44,  44,  45,  45,  45,  46,  46
.db  47,  47,  48,  48,  49,  49,  49,  50,  50,  51
.db  51,  52,  52,  53,  53,  53,  54,  54,  55,  55
.db  56,  56,  57,  57,  58,  58,  59,  59,  60,  60
.db  61,  61,  62,  62,  63,  63,  64,  64,  65,  65
.db  66,  66,  67,  67,  68,  68,  69,  69,  70,  70
.db  71,  71,  72,  72,  73,  73,  74,  74,  75,  76
.db  76,  77,  77,  78,  78,  79,  79,  80,  81,  81
.db  82,  82,  83,  83,  84,  84,  85,  86,  86,  87
.db  87,  88,  89,  89,  90,  90,  91,  92,  92,  93
.db  93,  94,  95,  95,  96,  96,  97,  98,  98,  99
.db 100, 100, 101, 101, 102, 103, 103, 104, 105, 105
.db 106, 106, 107, 108, 108, 109, 110, 110, 111, 112
.db 112, 113, 114, 114, 115, 116, 116, 117, 118, 118
.db 119, 120, 121, 121, 122, 123, 123, 124, 125, 125
.db 126, 127, 127, 128, 129, 130, 130, 131, 132, 132
.db 133, 134, 135, 135, 136, 137, 138, 138, 139, 140
.db 141, 141, 142, 143, 144, 144, 145, 146, 147, 147
.db 148, 149, 150, 150, 151, 152, 153, 153, 154, 155
.db 156, 157, 157, 158, 159, 160, 160, 161, 162, 163
.db 164, 164, 165, 166, 167, 168, 169, 169, 170, 171
.db 172, 173, 173, 174, 175, 176, 177, 178, 178, 179
.db 180, 181, 182, 183, 183, 184, 185, 186, 187, 188
.db 189, 189, 190, 191, 192, 193, 194, 195, 196, 196
.db 197, 198, 199, 200, 201, 202, 203, 203, 204, 205
.db 206, 207, 208, 209, 210, 211, 212, 212, 213, 214
.db 215, 216, 217, 218, 219, 220, 221, 222, 223, 224
.db 225, 225, 226, 227, 228, 229, 230, 231, 232, 233
.db 234, 235, 236, 237, 238, 239, 240, 241, 242, 243
.db 244, 245, 246, 247, 248, 249, 250, 251, 252, 253
.db 254, 255
; word addresses, again?!
.equ SqTabH = (high(SqTableH) << 1)

.DSEG
RAMTab .BYTE 512
1
qo'shib qo'ydi

Uzoq vaqt o'tgach, agar javobsiz bo'lsa, javob bering: GCC-dan AVR-C-kompilyatorini 8K kodiga moslasha olmadim (hali). (Assotsiatsiyaga oid ma'lumotlar uchun AVR ko'paytmasi: yo'qligini bekor qilmaydi ). Yondashuv Duff qurilmasini ishlatgan har bir kishi ikkinchi urinish uchun harakat qildi:
kalitni ishlating. Makrolardan foydalansa, massaj qilinganida manba kodi ko'rinadi butunlay xavfsiz emas:

#define low(mp)     case mp: p = a0 * (uint8_t)(mp) << 8; break
#define low4(mp)    low(mp); low(mp + 1); low(mp + 2); low(mp + 3)
#define low16(mp)   low4(mp); low4(mp + 4); low4(mp + 8); low4(mp + 12)
#define low64(mp)   low16(mp); low16(mp + 16); low16(mp + 32); low16(mp + 48)
#if preShift
# define CASE(mp)   case mp: return p + a * (mp)
#else
# define CASE(mp)   case mp: return (p0<<8) + a * (mp)
#endif
#define case4(mp)   CASE(mp); CASE(mp + 1); CASE(mp + 2); CASE(mp + 3)
#define case16(mp)  case4(mp); case4(mp + 4); case4(mp + 8); case4(mp + 12)
#define case64(mp)  case16(mp); case16(mp + 16); case16(mp + 32); case16(mp + 48)

extern "C" __attribute__ ((noinline))
 uint16_t mpy16NHB16(uint16_t a, uint16_t b)
{
    uint16_t p = 0;
    uint8_t b0 = (uint8_t)b, b1 = (uint8_t)(b>>8);
    uint8_t a0 = (uint8_t)a, p0;

    switch (b1) {
        case64(0);
        case64(64);
        case64(128);
        case64(192);
    }
#if preShift
    p = p0 << 8;
#endif
#if preliminaries
    if (0 == b0) {
        p = -a;
        if (b & 0x8000)
            p += a << 9;
        if (b & 0x4000)
            p += a << 8;
        return p;
    }
    while (b0 & 1) {
        a <<= 1;
        b0 >>= 1;
    }
#endif
    switch (b0) {
        low64(0);
        low64(64);
        low64(128);
        low64(192);
    }
    return ~0;
}
int main(int ac, char const *const av[])
{
    char buf[22];
    for (uint16_t a = 0 ; a < a+1 ; a++)
      for (uint16_t m = 0 ; m <= a ; m++)
        puts(itoa(//shift4(ac)+shift3MaskAdd((uint16_t)av[0], ac)
   //     +shift4Add(ac, (uint16_t)av[0])
   //          + mpy16NHB16(ac, (ac + 105))
                 mpy16NHB16(a, m), buf, 10));
}
0
qo'shib qo'ydi

Uzoq vaqt o'tgach, agar javobsiz bo'lsa, javob bering: GCC-dan AVR-C-kompilyatorini 8K kodiga moslasha olmadim (hali). (Assotsiatsiyaga oid ma'lumotlar uchun AVR ko'paytmasi: yo'qligini bekor qilmaydi ). Yondashuv Duff qurilmasini ishlatgan har bir kishi ikkinchi urinish uchun harakat qildi:
kalitni ishlating. Makrolardan foydalansa, massaj qilinganida manba kodi ko'rinadi butunlay xavfsiz emas:

#define low(mp)     case mp: p = a0 * (uint8_t)(mp) << 8; break
#define low4(mp)    low(mp); low(mp + 1); low(mp + 2); low(mp + 3)
#define low16(mp)   low4(mp); low4(mp + 4); low4(mp + 8); low4(mp + 12)
#define low64(mp)   low16(mp); low16(mp + 16); low16(mp + 32); low16(mp + 48)
#if preShift
# define CASE(mp)   case mp: return p + a * (mp)
#else
# define CASE(mp)   case mp: return (p0<<8) + a * (mp)
#endif
#define case4(mp)   CASE(mp); CASE(mp + 1); CASE(mp + 2); CASE(mp + 3)
#define case16(mp)  case4(mp); case4(mp + 4); case4(mp + 8); case4(mp + 12)
#define case64(mp)  case16(mp); case16(mp + 16); case16(mp + 32); case16(mp + 48)

extern "C" __attribute__ ((noinline))
 uint16_t mpy16NHB16(uint16_t a, uint16_t b)
{
    uint16_t p = 0;
    uint8_t b0 = (uint8_t)b, b1 = (uint8_t)(b>>8);
    uint8_t a0 = (uint8_t)a, p0;

    switch (b1) {
        case64(0);
        case64(64);
        case64(128);
        case64(192);
    }
#if preShift
    p = p0 << 8;
#endif
#if preliminaries
    if (0 == b0) {
        p = -a;
        if (b & 0x8000)
            p += a << 9;
        if (b & 0x4000)
            p += a << 8;
        return p;
    }
    while (b0 & 1) {
        a <<= 1;
        b0 >>= 1;
    }
#endif
    switch (b0) {
        low64(0);
        low64(64);
        low64(128);
        low64(192);
    }
    return ~0;
}
int main(int ac, char const *const av[])
{
    char buf[22];
    for (uint16_t a = 0 ; a < a+1 ; a++)
      for (uint16_t m = 0 ; m <= a ; m++)
        puts(itoa(//shift4(ac)+shift3MaskAdd((uint16_t)av[0], ac)
   //     +shift4Add(ac, (uint16_t)av[0])
   //          + mpy16NHB16(ac, (ac + 105))
                 mpy16NHB16(a, m), buf, 10));
}
0
qo'shib qo'ydi