// -*- coding: utf-8 -*-
//
// Copyright 2022 Michael Büsch <m@bues.ch>
//
// Licensed under the Apache License version 2.0
// or the MIT license, at your option.
// SPDX-License-Identifier: Apache-2.0 OR MIT
//

#include <stdint.h>
#include <avr/io.h>

// Common seed: every implementation starts from the same state, so that
// main() can compare the generated sequences step by step.
#define INITIAL_STATE	424242

// Self-test verdict as written by main(): 1 = ok, 2 = error.
static volatile uint8_t result				= 0;
// One generator state per implementation. In each loop iteration main()
// passes a state to the function of the matching name (state_c -> shr3_c(),
// state_asm_simple -> shr3_asm_simple(), ...) and stores the return value back.
static volatile uint32_t state_c			= INITIAL_STATE;
static volatile uint32_t state_asm_simple		= INITIAL_STATE;
static volatile uint32_t state_asm_rshift		= INITIAL_STATE;
static volatile uint32_t state_asm_rshift_unroll	= INITIAL_STATE;
static volatile uint32_t state_asm_mul			= INITIAL_STATE;

// Reference implementation of one SHR3 step in plain C.
//
// Applies the three xorshift stages (left 13, right 17, left 5) to y and
// returns the new state. The value 0 maps to 0.
// Kept out of line (noinline) so that it stays a separate function.
uint32_t shr3_c(uint32_t y) __attribute__((__noinline__));
uint32_t shr3_c(uint32_t y)
{
	const uint32_t stage1 = y ^ (y << 13);
	const uint32_t stage2 = stage1 ^ (stage1 >> 17);

	return stage2 ^ (stage2 << 5);
}

// One SHR3 step in AVR assembly, using straightforward shift-left loops.
// Intended to return the same value as shr3_c(); main() checks this.
//
// Operand 0 is y (input and result), operand 1 is a 32 bit scratch copy and
// operand 2 is the loop counter. The operand modifiers A, B, C, D select the
// individual bytes of a 32 bit operand, A being the least significant byte.
//
// Cycles (asm body only): 35 + 5 + 41 = 81
uint32_t shr3_asm_simple(uint32_t y) __attribute__((__noinline__));
uint32_t shr3_asm_simple(uint32_t y)
{
	uint32_t tmp;
	uint8_t i;

	__asm__ __volatile__(
		// y ^= y << 13
		// Shift a copy of the low three bytes left by 5 and XOR it in one
		// byte further up (5 + 8 = 13). The top byte of y is not needed,
		// because it would be shifted out completely.
		// Cycles: 3 (setup) + 4 looping passes of 6 + 1 final pass of 5
		//         + 3 (eor)
		// Cycles: 3 + (6 * 4) + (5 * 1) + 3 = 35
"		movw	%A1, %A0	\n"	// mov ab to temp
"		mov	%C1, %C0	\n"	// mov c to temp
"		ldi	%2, 5		\n"	// shift loop counter
"1:		lsl	%A1		\n"	// temp_a <<= 1
"		rol	%B1		\n"	// temp_b <<= 1
"		rol	%C1		\n"	// temp_c <<= 1
"		dec	%2		\n"	// loop counter -= 1
"		brne	1b		\n"	// if (loop counter > 0) goto 1
"		eor	%B0, %A1	\n"	// b ^= temp_a << 8
"		eor	%C0, %B1	\n"	// c ^= temp_b << 8
"		eor	%D0, %C1	\n"	// d ^= temp_c << 8

		// y ^= y >> 17
		// Shift a copy of the high two bytes right by 1 and XOR it in two
		// bytes further down (1 + 16 = 17).
		// Cycles: 5
"		movw	%C1, %C0	\n"	// mov cd to temp
"		lsr	%D1		\n"	// temp_d >>= 1
"		ror	%C1		\n"	// temp_c >>= 1
"		eor	%A0, %C1	\n"	// a ^= temp_c >> 16
"		eor	%B0, %D1	\n"	// b ^= temp_d >> 16

		// y ^= y << 5
		// Shift a full 32 bit copy left by 5 and XOR it in place.
		// Cycles: 3 (setup) + 4 looping passes of 7 + 1 final pass of 6
		//         + 4 (eor)
		// Cycles: 3 + (7 * 4) + (6 * 1) + 4 = 41
"		movw	%A1, %A0	\n"	// mov ab to temp
"		movw	%C1, %C0	\n"	// mov cd to temp
"		ldi	%2, 5		\n"	// shift loop counter
"1:		lsl	%A1		\n"	// temp_a <<= 1
"		rol	%B1		\n"	// temp_b <<= 1
"		rol	%C1		\n"	// temp_c <<= 1
"		rol	%D1		\n"	// temp_d <<= 1
"		dec	%2		\n"	// loop counter -= 1
"		brne	1b		\n"	// if (loop counter > 0) goto 1
"		eor	%A0, %A1	\n"	// a ^= temp_a
"		eor	%B0, %B1	\n"	// b ^= temp_b
"		eor	%C0, %C1	\n"	// c ^= temp_c
"		eor	%D0, %D1	\n"	// d ^= temp_d

		: "=d" (y),	// 0
		  "=d" (tmp),	// 1
		  "=d" (i)	// 2
		: "0" (y)
	);

	return y;
}

// One SHR3 step in AVR assembly. The two left shifts by 5 bit positions are
// replaced by right shifts by 3 whose result is used one byte further up
// (8 - 3 = 5), which needs fewer loop passes than shr3_asm_simple().
// Intended to return the same value as shr3_c(); main() checks this.
//
// Operand 0 is y (input and result), operand 1 is a 32 bit scratch copy and
// operand 2 is an extra byte below the scratch copy that doubles as the loop
// end condition. The operand modifiers A, B, C, D select the individual
// bytes of a 32 bit operand, A being the least significant byte.
//
// Cycles (asm body only): 23 + 5 + 27 = 55
uint32_t shr3_asm_rshift(uint32_t y) __attribute__((__noinline__));
uint32_t shr3_asm_rshift(uint32_t y)
{
	uint32_t tmp32;
	uint8_t tmp8;

	__asm__ __volatile__(
		// y ^= y << 13
		// Shift temp_c:temp_b:temp_a:operand2 right by 3 and XOR the lower
		// three bytes of that into d:c:b (16 - 3 = 13).
		// Operand 2 starts as 3 (two 1 bits): the first two "ror" move a 1
		// into carry, so "brcs" loops; the third moves a 0 into carry and
		// ends the loop. Afterwards operand 2 holds the three bits shifted
		// out of temp_a in its bits 7:5 and zeros below.
		// Cycles: 3 (setup) + 2 looping passes of 6 + 1 final pass of 5
		//         + 3 (eor)
		// Cycles: 3 + (6 * 2) + (5 * 1) + 3 = 23
"		ldi	%2, 3		\n"	// shift loop end condition and carry
"		movw	%A1, %A0	\n"	// mov ab to temp
"		mov	%C1, %C0	\n"	// mov c to temp
"1:		lsr	%C1		\n"	// temp_c >>= 1
"		ror	%B1		\n"	// temp_b >>= 1
"		ror	%A1		\n"	// temp_a >>= 1
"		ror	%2		\n"	// condition/carry byte >>= 1
"		brcs	1b		\n"	// if (bit shifted out of condition == 1) goto 1
"		eor	%B0, %2		\n"	// b ^= carry byte
"		eor	%C0, %A1	\n"	// c ^= temp_a
"		eor	%D0, %B1	\n"	// d ^= temp_b

		// y ^= y >> 17
		// Shift a copy of the high two bytes right by 1 and XOR it in two
		// bytes further down (1 + 16 = 17).
		// Cycles: 5
"		movw	%C1, %C0	\n"	// mov cd to temp
"		lsr	%D1		\n"	// temp_d >>= 1
"		ror	%C1		\n"	// temp_c >>= 1
"		eor	%A0, %C1	\n"	// a ^= temp_c >> 16
"		eor	%B0, %D1	\n"	// b ^= temp_d >> 16

		// y ^= y << 5
		// Same scheme as above with all four bytes: shift
		// temp_d:temp_c:temp_b:temp_a:operand2 right by 3 and XOR the
		// lower four bytes of that into d:c:b:a (8 - 3 = 5).
		// Cycles: 3 (setup) + 2 looping passes of 7 + 1 final pass of 6
		//         + 4 (eor)
		// Cycles: 3 + (7 * 2) + (6 * 1) + 4 = 27
"		ldi	%2, 3		\n"	// shift loop end condition and carry
"		movw	%A1, %A0	\n"	// mov ab to temp
"		movw	%C1, %C0	\n"	// mov cd to temp
"1:		lsr	%D1		\n"	// temp_d >>= 1
"		ror	%C1		\n"	// temp_c >>= 1
"		ror	%B1		\n"	// temp_b >>= 1
"		ror	%A1		\n"	// temp_a >>= 1
"		ror	%2		\n"	// condition/carry byte >>= 1
"		brcs	1b		\n"	// if (bit shifted out of condition == 1) goto 1
"		eor	%A0, %2		\n"	// a ^= carry byte
"		eor	%B0, %A1	\n"	// b ^= temp_a
"		eor	%C0, %B1	\n"	// c ^= temp_b
"		eor	%D0, %C1	\n"	// d ^= temp_c

		: "=d" (y),	// 0
		  "=d" (tmp32),	// 1
		  "=d" (tmp8)	// 2
		: "0" (y)
	);

	return y;
}

// One SHR3 step in AVR assembly. Same right-shift-by-3 scheme as
// shr3_asm_rshift(), but with the three shift passes written out instead of
// looping, which removes the branch cost.
// Intended to return the same value as shr3_c(); main() checks this.
//
// Operand 0 is y (input and result), operand 1 is a 32 bit scratch copy and
// operand 2 is an extra byte below the scratch copy that collects the bits
// shifted out of it. The operand modifiers A, B, C, D select the individual
// bytes of a 32 bit operand, A being the least significant byte.
//
// Cycles (asm body only): 18 + 5 + 22 = 45
uint32_t shr3_asm_rshift_unroll(uint32_t y) __attribute__((__noinline__));
uint32_t shr3_asm_rshift_unroll(uint32_t y)
{
	uint32_t tmp32;
	uint8_t tmp8;

	__asm__ __volatile__(
		// y ^= y << 13
		// Shift temp_c:temp_b:temp_a:operand2 right by 3 and XOR the lower
		// three bytes of that into d:c:b (16 - 3 = 13).
		// Cycles: 18 * 1 = 18
"		clr	%2		\n"	// clear carry byte
"		movw	%A1, %A0	\n"	// mov ab to temp
"		mov	%C1, %C0	\n"	// mov c to temp
"		lsr	%C1		\n"	// temp_c >>= 1
"		ror	%B1		\n"	// temp_b >>= 1
"		ror	%A1		\n"	// temp_a >>= 1
"		ror	%2		\n"	// carry byte >>= 1
"		lsr	%C1		\n"	// temp_c >>= 1
"		ror	%B1		\n"	// temp_b >>= 1
"		ror	%A1		\n"	// temp_a >>= 1
"		ror	%2		\n"	// carry byte >>= 1
"		lsr	%C1		\n"	// temp_c >>= 1
"		ror	%B1		\n"	// temp_b >>= 1
"		ror	%A1		\n"	// temp_a >>= 1
"		ror	%2		\n"	// carry byte >>= 1
"		eor	%B0, %2		\n"	// b ^= carry byte
"		eor	%C0, %A1	\n"	// c ^= temp_a
"		eor	%D0, %B1	\n"	// d ^= temp_b

		// y ^= y >> 17
		// Shift a copy of the high two bytes right by 1 and XOR it in two
		// bytes further down (1 + 16 = 17).
		// Cycles: 5
"		movw	%C1, %C0	\n"	// mov cd to temp
"		lsr	%D1		\n"	// temp_d >>= 1
"		ror	%C1		\n"	// temp_c >>= 1
"		eor	%A0, %C1	\n"	// a ^= temp_c >> 16
"		eor	%B0, %D1	\n"	// b ^= temp_d >> 16

		// y ^= y << 5
		// Shift temp_d:temp_c:temp_b:temp_a:operand2 right by 3 and XOR
		// the lower four bytes of that into d:c:b:a (8 - 3 = 5).
		// Cycles: 22 * 1 = 22
"		clr	%2		\n"	// clear carry byte
"		movw	%A1, %A0	\n"	// mov ab to temp
"		movw	%C1, %C0	\n"	// mov cd to temp
"		lsr	%D1		\n"	// temp_d >>= 1
"		ror	%C1		\n"	// temp_c >>= 1
"		ror	%B1		\n"	// temp_b >>= 1
"		ror	%A1		\n"	// temp_a >>= 1
"		ror	%2		\n"	// carry byte >>= 1
"		lsr	%D1		\n"	// temp_d >>= 1
"		ror	%C1		\n"	// temp_c >>= 1
"		ror	%B1		\n"	// temp_b >>= 1
"		ror	%A1		\n"	// temp_a >>= 1
"		ror	%2		\n"	// carry byte >>= 1
"		lsr	%D1		\n"	// temp_d >>= 1
"		ror	%C1		\n"	// temp_c >>= 1
"		ror	%B1		\n"	// temp_b >>= 1
"		ror	%A1		\n"	// temp_a >>= 1
"		ror	%2		\n"	// carry byte >>= 1
"		eor	%A0, %2		\n"	// a ^= carry byte
"		eor	%B0, %A1	\n"	// b ^= temp_a
"		eor	%C0, %B1	\n"	// c ^= temp_b
"		eor	%D0, %C1	\n"	// d ^= temp_c

		: "=d" (y),	// 0
		  "=d" (tmp32),	// 1
		  "=d" (tmp8)	// 2
		: "0" (y)
	);

	return y;
}

// One SHR3 step in AVR assembly. The two left shifts by 5 bit positions are
// done per byte with the "mul" instruction: multiplying a byte by 32 yields
// the 16 bit value (byte << 5) in r1:r0, whose halves are then merged into
// the neighbouring result bytes.
// Intended to return the same value as shr3_c(); main() checks this.
//
// Operand 0 is y (input and result), operand 1 is a 32 bit scratch copy and
// operand 2 holds the constant factor 32. The operand modifiers A, B, C, D
// select the individual bytes of a 32 bit operand, A being the least
// significant byte. r0 and r1 are overwritten by "mul" and are therefore
// listed as clobbers; r1 is cleared again at the end.
//
// Cycles (asm body only): 1 + 15 + 5 + 20 + 1 = 42
uint32_t shr3_asm_mul(uint32_t y) __attribute__((__noinline__));
uint32_t shr3_asm_mul(uint32_t y)
{
	uint32_t tmp;
	uint8_t fac;

	__asm__ __volatile__(
		// Setup
		// Cycles: 1
"		ldi	%2, 32		\n"	// mul factor: x * 32 == x << 5

		// y ^= y << 13
		// Build the low three bytes of (y << 5) and XOR them in one byte
		// further up (5 + 8 = 13).
		// Cycles: 1 + 1 + 2 + 1 + 2 + (2 * 1) + 2 + (4 * 1) = 15
"		movw	%A1, %A0	\n"	// mov ab to temp
"		mov	%C1, %C0	\n"	// mov c to temp
"		mul	%C1, %2		\n"	// r1:r0 = c << 5
"		mov	%C1, r0		\n"	// temp_c[7:5] = low 3 bits of c
"		mul	%B1, %2		\n"	// r1:r0 = b << 5
"		mov	%B1, r0		\n"	// temp_b[7:5] = low 3 bits of b
"		or	%C1, r1		\n"	// temp_c[4:0] = high 5 bits of b
"		mul	%A1, %2		\n"	// r1:r0 = a << 5
"		or	%B1, r1		\n"	// temp_b[4:0] = high 5 bits of a
"		eor	%B0, r0		\n"	// b[7:5] ^= low 3 bits of a
"		eor	%C0, %B1	\n"	// c ^= temp_b << 8
"		eor	%D0, %C1	\n"	// d ^= temp_c << 8

		// y ^= y >> 17
		// Shift a copy of the high two bytes right by 1 and XOR it in two
		// bytes further down (1 + 16 = 17).
		// Cycles: 5
"		movw	%C1, %C0	\n"	// mov cd to temp
"		lsr	%D1		\n"	// temp_d >>= 1
"		ror	%C1		\n"	// temp_c >>= 1
"		eor	%A0, %C1	\n"	// a ^= temp_c >> 16
"		eor	%B0, %D1	\n"	// b ^= temp_d >> 16

		// y ^= y << 5
		// Build all four bytes of (y << 5) and XOR them in place. The
		// lowest byte is taken directly from r0 of the last multiplication.
		// Cycles: (2 * 1) + 2 + 1 + 2 + (2 * 1) + 2 + (2 * 1) + 2 + (5 * 1) = 20
"		movw	%A1, %A0	\n"	// mov ab to temp
"		movw	%C1, %C0	\n"	// mov cd to temp
"		mul	%D1, %2		\n"	// r1:r0 = d << 5
"		mov	%D1, r0		\n"	// temp_d[7:5] = low 3 bits of d
"		mul	%C1, %2		\n"	// r1:r0 = c << 5
"		mov	%C1, r0		\n"	// temp_c[7:5] = low 3 bits of c
"		or	%D1, r1		\n"	// temp_d[4:0] = high 5 bits of c
"		mul	%B1, %2		\n"	// r1:r0 = b << 5
"		mov	%B1, r0		\n"	// temp_b[7:5] = low 3 bits of b
"		or	%C1, r1		\n"	// temp_c[4:0] = high 5 bits of b
"		mul	%A1, %2		\n"	// r1:r0 = a << 5
"		or	%B1, r1		\n"	// temp_b[4:0] = high 5 bits of a
"		eor	%A0, r0		\n"	// a[7:5] ^= low 3 bits of a
"		eor	%B0, %B1	\n"	// b ^= temp_b
"		eor	%C0, %C1	\n"	// c ^= temp_c
"		eor	%D0, %D1	\n"	// d ^= temp_d

		// Cleanup
		// Cycles: 1
"		clr	r1		\n"	// restore r1 to zero (avr-gcc's zero register, overwritten by mul)

		: "=d" (y),	// 0
		  "=d" (tmp),	// 1
		  "=d" (fac)	// 2
		: "0" (y)
		: "r0", "r1"
	);

	return y;
}

// Endless self-test.
//
// Each pass advances every SHR3 implementation by one step from its own
// state and compares all assembly variants against the C reference. The
// verdict is kept in "result" (1 = ok, 2 = error); once set to error it is
// never reset. The verdict is mirrored on PORTB: bit 0 is driven high while
// the verdict is ok, bit 1 is driven high once it is error.
int main(void)
{
	enum {
		PIN_OK		= 1,	// PORTB bit 0
		PIN_ERROR	= 2,	// PORTB bit 1
	};

	result = 1; // ok
	for (;;) {
		// Advance all generators by one step.
		state_c = shr3_c(state_c);
		state_asm_simple = shr3_asm_simple(state_asm_simple);
		state_asm_rshift = shr3_asm_rshift(state_asm_rshift);
		state_asm_rshift_unroll = shr3_asm_rshift_unroll(state_asm_rshift_unroll);
		state_asm_mul = shr3_asm_mul(state_asm_mul);

		// Any deviation from the C reference is an error.
		if (state_c != state_asm_simple ||
		    state_c != state_asm_rshift ||
		    state_c != state_asm_rshift_unroll ||
		    state_c != state_asm_mul) {
			result = 2; // error
		}

		// Both indicator pins are outputs.
		DDRB |= PIN_OK | PIN_ERROR;
		if (result != 1) {
			PORTB &= (uint8_t)~PIN_OK;
			PORTB |= PIN_ERROR; // error
		} else {
			PORTB |= PIN_OK; // ok
			PORTB &= (uint8_t)~PIN_ERROR;
		}
	}
}