I tried to remove most of the C++ isms, so that this will get more use. The bit_cast really is the most portable way to do this, you need to call memcpy to be safe, unions, reinterpret_cast, etc.. don't cut it. The memcpy will compile away (I've checked on every major version of compiler that people usually care about). This is hacked together from my personal codebase, so if there are any problems let me know and I'll do a more thorough verification.
----------------------------------------------------------------------------------------------------
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 | // Bit casting is the safe portable way to reinterpret bits, this generates exactly what you want it to. (It's free) template <int BYTES> inline void Copy(void* out, const void* in) { __builtin_memcpy(out, in, BYTES); } template <typename TO, typename FROM> inline TO bit_cast(FROM x) { static_assert(sizeof(FROM) == sizeof(TO), "sizeof(FROM) != sizeof(TO)"); TO y; Copy<sizeof(y)>(&y, &x); return y; } inline int32 Round(float x) { return int32(x + ((x < 0.0f) ? -0.5f : +0.5f)); } inline float MultiplyAdd(float x, float y, float z) { return ((x * y) + z); } const float kPi = 3.141592653589793238462643383279502884; void SineCosine(float x, float* sc) { const int32 kNotSignBit(0x7FFFFFFF); const int32 kRangeTiny(0x39800000); // bit_cast<int32>(~0.000244141) // Small angle approximation, handles signed 0's and denormals, skip this if you don't care. int32 i(bit_cast<int32>(x) & kNotSignBit); // |x| as int32 if (i < kRangeTiny) { // |x| < ~0.000244141 sc[0] = x; sc[1] = 1.0f; return; } // Extended precision modular arithmetic. int32 q(Round(x * (2.0f / kPi))); float qf(q); x = MultiplyAdd(qf, -(3.140625f * 0.5f), x); x = MultiplyAdd(qf, -(0.0009670257568359375f * 0.5f), x); x = MultiplyAdd(qf, -(6.2771141529083251953e-7f * 0.5f), x); x = MultiplyAdd(qf, -(1.2154201256553420762e-10 * 0.5f), x); // Note: This can be solved faster in the non-vectorized case by // computing both polynomials at the same time, when vectorized // it is faster to just evaluate them like this. float y(x * x); float s = +2.6083159809786593541503e-6f; s = MultiplyAdd(s, y, -1.981069071916863322258e-4f); s = MultiplyAdd(s, y, +8.33307858556509017944336e-3f); s = MultiplyAdd(s, y, -1.66666597127914428710938e-1f); s = MultiplyAdd(s, (y * x), x); float c = -2.718118424e-7f; c = MultiplyAdd(c, y, +2.479904470e-5f); c = MultiplyAdd(c, y, -1.388887875e-3f); c = MultiplyAdd(c, y, +4.166666418e-2f); c = MultiplyAdd(c, y, -0.5f); c = MultiplyAdd(c, y, +1.0f); if (q & 1) { // maybe swap sine and cosine float swap = c; c = s; s = swap; } if (q & 2) { // maybe negate sine s = -s; } if ((q + 1) & 2) { // maybe negate cosine c = -c; } sc[0] = s; sc[1] = c; } |