-
Notifications
You must be signed in to change notification settings - Fork 80
/
constant_fast_div.h
107 lines (99 loc) · 3.42 KB
/
constant_fast_div.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
/*
* When dividing by a known compile time constant, the division can be replaced
* by a multiply+shift operation. GCC will do this automatically,
* *BUT ONLY FOR DIVISION OF REGISTER-WIDTH OR NARROWER*.
*
* So on an 8-bit system, 16-bit divides will *NOT* be optimised.
*
* The macros here manually apply the multiply+shift operation for 16-bit numbers.
*
* Testing on an AtMega2560, -O3 optimizations:
* Performance improvement of 85% to 90%+ speed up (division by non-powers of 2)
* Zero increase in RAM usage
* Average of 25 bytes Flash used per call site
* Be careful calling this in a loop with aggressive loop unrolling!
*
* Note: testing of the multiply+shift technique on 8-bit division showed a
* slight slow down over native code on AtMega2560. So the 8 bit equivalent
* macros have not been included
*/
#pragma once
#include "libdivide.h"
#include "u16_ldparams.h"
#include "s16_ldparams.h"
#define CAT_HELPER(a, b) a ## b
#define CONCAT(A, B) CAT_HELPER(A, B)
// GCC will optimise division by a power of 2
// So allow that.
#define S16_ISPOW2_NEG(denom) \
(denom==-2 || \
denom==-4 || \
denom==-8 || \
denom==-16 || \
denom==-32 || \
denom==-64 || \
denom==-128 || \
denom==-256 || \
denom==-512 || \
denom==-1024 || \
denom==-2048 || \
denom==-4096 || \
denom==-8192 || \
denom==-16384)
#define S16_ISPOW2_POS(denom) \
(denom==2 || \
denom==4 || \
denom==8 || \
denom==16 || \
denom==32 || \
denom==64 || \
denom==128 || \
denom==256 || \
denom==512 || \
denom==1024 || \
denom==2048 || \
denom==4096 || \
denom==8192 || \
denom==16384)
#define U16_ISPOW2(denom) (S16_ISPOW2_POS(denom) || denom==32768)
#define S16_ISPOW2(denom) (S16_ISPOW2_POS(denom) || S16_ISPOW2_NEG(denom))
// Apply the libdivide namespace if necessary
#ifdef __cplusplus
#define LIB_DIV_NAMESPACE libdivide::
#else
#define LIB_DIV_NAMESPACE
#endif
/*
* Wrapper for *unsigned* 16-bit DIVISION. The divisor must be a compile time
* constant.
* E.g. FAST_DIV16U(value, 100)
*/
#define U16_MAGIC(d) CONCAT(CONCAT(U16LD_DENOM_, d), _MAGIC)
#define U16_MORE(d) CONCAT(CONCAT(U16LD_DENOM_, d), _MORE)
#define FAST_DIV16U(a, d) (U16_ISPOW2(d) ? a/d : LIB_DIV_NAMESPACE libdivide_u16_do_raw(a, U16_MAGIC(d), U16_MORE(d)))
/*
* Wrapper for *signed* 16-bit DIVISION by a *POSITIVE* compile time constant.
* E.g. FAST_DIV16(-value, 777)
*
* This only works for positive parmeters :-(
* A negative number results in a hypen in the macro name, which is not allowed
*/
#define S16_MAGIC(d) CONCAT(CONCAT(S16LD_DENOM_, d), _MAGIC)
#define S16_MORE(d) CONCAT(CONCAT(S16LD_DENOM_, d), _MORE)
#define FAST_DIV16(a, d) (S16_ISPOW2(d) ? a/d : LIB_DIV_NAMESPACE libdivide_s16_do_raw(a, S16_MAGIC(d), S16_MORE(d)))
/*
* Wrapper for *signed* 16-bit DIVISION by a *NEGATIVE* compile time constant.
* E.g. FAST_DIV16_NEG(-value, 777) // <-- It's converted to negative. Really.
*
* This only works for positive parmeters :-(
* A negative number results in a hypen in the macro name, which is not allowed
*/
#define S16_MAGIC_NEG(d) CONCAT(CONCAT(S16LD_DENOM_MINUS_, d), _MAGIC)
#define S16_MORE_NEG(d) CONCAT(CONCAT(S16LD_DENOM_MINUS_, d), _MORE)
#define FAST_DIV16_NEG(a, d) (S16_ISPOW2(d) ? a/-d : LIB_DIV_NAMESPACE libdivide_s16_do_raw(a, S16_MAGIC_NEG(d), S16_MORE_NEG(d)))
/*
* Wrapper for *unsigned* 16-bit MODULUS. The divisor must be a compile time
* constant.
* E.g. FAST_MOD16U(value, 6)
*/
#define FAST_MOD16U(a, d) (a - (FAST_DIV16U(a, d) * d))