// -*- coding: utf-8 -*- // // Copyright 2022 Michael Büsch // // Licensed under the Apache License version 2.0 // or the MIT license, at your option. // SPDX-License-Identifier: Apache-2.0 OR MIT // use core::arch::asm; #[cfg_attr(feature="__devmode__", inline(never))] #[cfg_attr(not(feature="__devmode__"), inline(always))] pub fn shr3(state: u32) -> u32 { // Split state to 16 bit pairs. The compiler optimizes this away. let mut ab = state as u16; let mut cd = (state >> 16) as u16; unsafe { // Cycles: 2 + 15 + 5 + 20 + 2 = 44 asm!( // Setup // Cycles: 2 "mov {r0_save}, r0", // save r0 "ldi {fac}, 32", // mul factor // y ^= y << 13 // Cycles: 1 + 1 + 2 + 1 + 2 + (2 * 1) + 2 + (4 * 1) = 15 "movw {tab:l}:{tab:h}, {ab:l}:{ab:h}", // mov ab to temp "mov {tcd:l}, {cd:l}", // mov c to temp "mul {tcd:l}, {fac}", // r0:r1 = c << 5 "mov {tcd:l}, r0", // temp_c[7:5] = low 3 bits result "mul {tab:h}, {fac}", // r0:r1 = b << 5 "mov {tab:h}, r0", // temp_b[7:5] = low 3 bits result "or {tcd:l}, r1", // temp_c[4:0] = high 5 bits result "mul {tab:l}, {fac}", // r0:r1 = a << 5 "or {tab:h}, r1", // temp_b[4:0] = high 5 bits result "eor {ab:h}, r0", // b[7:5] ^= low 3 bits result "eor {cd:l}, {tab:h}", // c ^= temp_b << 8 "eor {cd:h}, {tcd:l}", // d ^= temp_c << 8 // y ^= y >> 17 // Cycles: 5 "movw {tcd:l}:{tcd:h}, {cd:l}:{cd:h}", // mov cd to temp "lsr {tcd:h}", // temp_d >>= 1 "ror {tcd:l}", // temp_c >>= 1 "eor {ab:l}, {tcd:l}", // a ^= temp_c >> 16 "eor {ab:h}, {tcd:h}", // b ^= temp_d >> 16 // y ^= y << 5 // Cycles: (2 * 1) + 2 + 1 + 2 + (2 * 1) + 2 + (2 * 1) + 2 + (5 * 1) = 20 "movw {tab:l}:{tab:h}, {ab:l}:{ab:h}", // mov ab to temp "movw {tcd:l}:{tcd:h}, {cd:l}:{cd:h}", // mov cd to temp "mul {tcd:h}, {fac}", // r0:r1 = d << 5 "mov {tcd:h}, r0", // temp_d[7:5] = low 3 bits result "mul {tcd:l}, {fac}", // r0:r1 = c << 5 "mov {tcd:l}, r0", // temp_c[7:5] = low 3 bits result "or {tcd:h}, r1", // temp_d[4:0] = high 5 bits result "mul {tab:h}, {fac}", // r0:r1 = b << 5 "mov {tab:h}, r0", // temp_b[7:5] = low 3 bits result "or {tcd:l}, r1", // temp_c[4:0] = high 5 bits result "mul {tab:l}, {fac}", // r0:r1 = a << 5 "or {tab:h}, r1", // temp_b[4:0] = high 5 bits result "eor {ab:l}, r0", // a[7:5] ^= low 3 bits result "eor {ab:h}, {tab:h}", // b ^= temp_b "eor {cd:l}, {tcd:l}", // c ^= temp_c "eor {cd:h}, {tcd:h}", // d ^= temp_d // Cleanup // Cycles: 2 "clr r1", // restore r1 "mov r0, {r0_save}", // restore r0 ab = inout(reg_pair) ab, // input/output byte a + b cd = inout(reg_pair) cd, // input/output byte c + d tab = out(reg_pair) _, // temporary byte a + b tcd = out(reg_pair) _, // temporary byte c + d fac = out(reg_upper) _, // mul factor r0_save = out(reg) _, // r0 restore options(pure, nomem, nostack), // We only access registers ); } // Combine the 16 bit state pairs. The compiler optimizes this away. (ab as u32) | ((cd as u32) << 16) } // vim: ts=4 sw=4 expandtab