// -*- coding: utf-8 -*- // // Copyright 2022 Michael Büsch // // Licensed under the Apache License version 2.0 // or the MIT license, at your option. // SPDX-License-Identifier: Apache-2.0 OR MIT // #include #include #define INITIAL_STATE 424242 static volatile uint8_t result = 0; static volatile uint32_t state_c = INITIAL_STATE; static volatile uint32_t state_asm_simple = INITIAL_STATE; static volatile uint32_t state_asm_rshift = INITIAL_STATE; static volatile uint32_t state_asm_rshift_unroll = INITIAL_STATE; static volatile uint32_t state_asm_mul = INITIAL_STATE; uint32_t shr3_c(uint32_t y) __attribute__((__noinline__)); uint32_t shr3_c(uint32_t y) { y ^= y << 13; y ^= y >> 17; y ^= y << 5; return y; } // Cycles: 35 + 5 + 41 = 81 uint32_t shr3_asm_simple(uint32_t y) __attribute__((__noinline__)); uint32_t shr3_asm_simple(uint32_t y) { uint32_t tmp; uint8_t i; __asm__ __volatile__( // y ^= y << 13 // Cycles: 3 + (6 * 4) + (5 * 1) + 3 = 35 " movw %A1, %A0 \n" // mov ab to temp " mov %C1, %C0 \n" // mov c to temp " ldi %2, 5 \n" // shift loop counter "1: lsl %A1 \n" // temp_a <<= 1 " rol %B1 \n" // temp_b <<= 1 " rol %C1 \n" // temp_c <<= 1 " dec %2 \n" // loop counter -= 1 " brne 1b \n" // if (loop counter > 0) goto 1 " eor %B0, %A1 \n" // b ^= temp_a << 8 " eor %C0, %B1 \n" // c ^= temp_b << 8 " eor %D0, %C1 \n" // d ^= temp_c << 8 // y ^= y >> 17 // Cycles: 5 " movw %C1, %C0 \n" // mov cd to temp " lsr %D1 \n" // temp_d >>= 1 " ror %C1 \n" // temp_c >>= 1 " eor %A0, %C1 \n" // a ^= temp_c >> 16 " eor %B0, %D1 \n" // b ^= temp_d >> 16 // y ^= y << 5 // Cycles: 3 + (7 * 4) + (6 * 1) + 4 = 41 " movw %A1, %A0 \n" // mov ab to temp " movw %C1, %C0 \n" // mov cd to temp " ldi %2, 5 \n" // shift loop counter "1: lsl %A1 \n" // temp_a <<= 1 " rol %B1 \n" // temp_b <<= 1 " rol %C1 \n" // temp_c <<= 1 " rol %D1 \n" // temp_d <<= 1 " dec %2 \n" // loop counter -= 1 " brne 1b \n" // if (loop counter > 0) goto 1 " eor %A0, %A1 \n" // a ^= temp_a " eor %B0, %B1 \n" // b ^= temp_b " eor %C0, %C1 \n" // c ^= temp_c " eor %D0, %D1 \n" // d ^= temp_d : "=d" (y), // 0 "=d" (tmp), // 1 "=d" (i) // 2 : "0" (y) ); return y; } // Cycles: 23 + 17 + 27 = 67 uint32_t shr3_asm_rshift(uint32_t y) __attribute__((__noinline__)); uint32_t shr3_asm_rshift(uint32_t y) { uint32_t tmp32; uint8_t tmp8; __asm__ __volatile__( // y ^= y << 13 // Cycles: 3 + (6 * 2) + (5 * 1) + 3 = 23 " ldi %2, 3 \n" // shift loop end condition and carry " movw %A1, %A0 \n" // mov ab to temp " mov %C1, %C0 \n" // mov c to temp "1: lsr %C1 \n" // c >>= 1 " ror %B1 \n" // b >>= 1 " ror %A1 \n" // a >>= 1 " ror %2 \n" // condition >>= 1 " brcs 1b \n" // if (condition was nonzero) goto 1 " eor %B0, %2 \n" // b ^= carry " eor %C0, %A1 \n" // c ^= a_temp " eor %D0, %B1 \n" // d ^= b_temp // y ^= y >> 17 // Cycles: 5 " movw %C1, %C0 \n" // mov cd to temp " lsr %D1 \n" // temp_d >>= 1 " ror %C1 \n" // temp_c >>= 1 " eor %A0, %C1 \n" // a ^= temp_c >> 16 " eor %B0, %D1 \n" // b ^= temp_d >> 16 // y ^= y << 5 // Cycles: 3 + (7 * 2) + (6 * 1) + 4 = 27 " ldi %2, 3 \n" // shift loop end condition and carry " movw %A1, %A0 \n" // mov ab to temp " movw %C1, %C0 \n" // mov cd to temp "1: lsr %D1 \n" // d >>= 1 " ror %C1 \n" // c >>= 1 " ror %B1 \n" // b >>= 1 " ror %A1 \n" // a >>= 1 " ror %2 \n" // condition >>= 1 " brcs 1b \n" // if (condition was nonzero) goto 1 " eor %A0, %2 \n" // a ^= carry " eor %B0, %A1 \n" // b ^= a_temp " eor %C0, %B1 \n" // c ^= b_temp " eor %D0, %C1 \n" // d ^= c_temp : "=d" (y), // 0 "=d" (tmp32), // 1 "=d" (tmp8) // 2 : "0" (y) ); return y; } // Cycles: 18 + 17 + 22 = 57 uint32_t shr3_asm_rshift_unroll(uint32_t y) __attribute__((__noinline__)); uint32_t shr3_asm_rshift_unroll(uint32_t y) { uint32_t tmp32; uint8_t tmp8; __asm__ __volatile__( // y ^= y << 13 // Cycles: 18 * 1 = 18 " clr %2 \n" // clear carry " movw %A1, %A0 \n" // mov ab to temp " mov %C1, %C0 \n" // mov c to temp " lsr %C1 \n" // c >>= 1 " ror %B1 \n" // b >>= 1 " ror %A1 \n" // a >>= 1 " ror %2 \n" // carry >>= 1 " lsr %C1 \n" // c >>= 1 " ror %B1 \n" // b >>= 1 " ror %A1 \n" // a >>= 1 " ror %2 \n" // carry >>= 1 " lsr %C1 \n" // c >>= 1 " ror %B1 \n" // b >>= 1 " ror %A1 \n" // a >>= 1 " ror %2 \n" // carry >>= 1 " eor %B0, %2 \n" // b ^= carry " eor %C0, %A1 \n" // c ^= a_temp " eor %D0, %B1 \n" // d ^= b_temp // y ^= y >> 17 // Cycles: 5 " movw %C1, %C0 \n" // mov cd to temp " lsr %D1 \n" // temp_d >>= 1 " ror %C1 \n" // temp_c >>= 1 " eor %A0, %C1 \n" // a ^= temp_c >> 16 " eor %B0, %D1 \n" // b ^= temp_d >> 16 // y ^= y << 5 // Cycles: 22 * 1 = 22 " clr %2 \n" // clear carry " movw %A1, %A0 \n" // mov ab to temp " movw %C1, %C0 \n" // mov cd to temp " lsr %D1 \n" // d >>= 1 " ror %C1 \n" // c >>= 1 " ror %B1 \n" // b >>= 1 " ror %A1 \n" // a >>= 1 " ror %2 \n" // carry >>= 1 " lsr %D1 \n" // d >>= 1 " ror %C1 \n" // c >>= 1 " ror %B1 \n" // b >>= 1 " ror %A1 \n" // a >>= 1 " ror %2 \n" // carry >>= 1 " lsr %D1 \n" // d >>= 1 " ror %C1 \n" // c >>= 1 " ror %B1 \n" // b >>= 1 " ror %A1 \n" // a >>= 1 " ror %2 \n" // carry >>= 1 " eor %A0, %2 \n" // a ^= carry " eor %B0, %A1 \n" // b ^= a_temp " eor %C0, %B1 \n" // c ^= b_temp " eor %D0, %C1 \n" // d ^= c_temp : "=d" (y), // 0 "=d" (tmp32), // 1 "=d" (tmp8) // 2 : "0" (y) ); return y; } // Cycles: 1 + 15 + 5 + 20 + 1 = 42 uint32_t shr3_asm_mul(uint32_t y) __attribute__((__noinline__)); uint32_t shr3_asm_mul(uint32_t y) { uint32_t tmp; uint8_t fac; __asm__ __volatile__( // Setup // Cycles: 1 " ldi %2, 32 \n" // mul factor // y ^= y << 13 // Cycles: 1 + 1 + 2 + 1 + 2 + (2 * 1) + 2 + (4 * 1) = 15 " movw %A1, %A0 \n" // mov ab to temp " mov %C1, %C0 \n" // mov c to temp " mul %C1, %2 \n" // r0:r1 = c << 5 " mov %C1, r0 \n" // temp_c[7:5] = low 3 bits result " mul %B1, %2 \n" // r0:r1 = b << 5 " mov %B1, r0 \n" // temp_b[7:5] = low 3 bits result " or %C1, r1 \n" // temp_c[4:0] = high 5 bits result " mul %A1, %2 \n" // r0:r1 = a << 5 " or %B1, r1 \n" // temp_b[4:0] = high 5 bits result " eor %B0, r0 \n" // b[7:5] ^= low 3 bits result " eor %C0, %B1 \n" // c ^= temp_b << 8 " eor %D0, %C1 \n" // d ^= temp_c << 8 // y ^= y >> 17 // Cycles: 5 " movw %C1, %C0 \n" // mov cd to temp " lsr %D1 \n" // temp_d >>= 1 " ror %C1 \n" // temp_c >>= 1 " eor %A0, %C1 \n" // a ^= temp_c >> 16 " eor %B0, %D1 \n" // b ^= temp_d >> 16 // y ^= y << 5 // Cycles: (2 * 1) + 2 + 1 + 2 + (2 * 1) + 2 + (2 * 1) + 2 + (5 * 1) = 20 " movw %A1, %A0 \n" // mov ab to temp " movw %C1, %C0 \n" // mov cd to temp " mul %D1, %2 \n" // r0:r1 = d << 5 " mov %D1, r0 \n" // temp_d[7:5] = low 3 bits result " mul %C1, %2 \n" // r0:r1 = c << 5 " mov %C1, r0 \n" // temp_c[7:5] = low 3 bits result " or %D1, r1 \n" // temp_d[4:0] = high 5 bits result " mul %B1, %2 \n" // r0:r1 = b << 5 " mov %B1, r0 \n" // temp_b[7:5] = low 3 bits result " or %C1, r1 \n" // temp_c[4:0] = high 5 bits result " mul %A1, %2 \n" // r0:r1 = a << 5 " or %B1, r1 \n" // temp_b[4:0] = high 5 bits result " eor %A0, r0 \n" // a[7:5] ^= low 3 bits result " eor %B0, %B1 \n" // b ^= temp_b " eor %C0, %C1 \n" // c ^= temp_c " eor %D0, %D1 \n" // d ^= temp_d // Cleanup // Cycles: 1 " clr r1 \n" // restore r0 : "=d" (y), // 0 "=d" (tmp), // 1 "=d" (fac) // 2 : "0" (y) : "r0", "r1" ); return y; } int main(void) { result = 1; // ok while (1) { state_c = shr3_c(state_c); state_asm_simple = shr3_asm_simple(state_asm_simple); state_asm_rshift = shr3_asm_rshift(state_asm_rshift); state_asm_rshift_unroll = shr3_asm_rshift_unroll(state_asm_rshift_unroll); state_asm_mul = shr3_asm_mul(state_asm_mul); if (state_c != state_asm_simple || state_c != state_asm_rshift || state_c != state_asm_rshift_unroll || state_c != state_asm_mul) result = 2; // error DDRB |= 3; if (result == 1) { PORTB |= 1; // ok PORTB &= (uint8_t)~2; } else { PORTB &= (uint8_t)~1; PORTB |= 2; // error } } }