PlatformIO package of the Teensy core framework compatible with GCC 10 & C++20
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

clockless_trinket.h 22KB

3 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467
  1. #ifndef __INC_CLOCKLESS_TRINKET_H
  2. #define __INC_CLOCKLESS_TRINKET_H
  3. #include "../../controller.h"
  4. #include "../../lib8tion.h"
  5. #include <avr/interrupt.h> // for cli/sei definitions
  6. FASTLED_NAMESPACE_BEGIN
  7. #if defined(FASTLED_AVR)
  // Build-time configuration defaults for the AVR clockless driver.
  8. // Scaling macro choice
  // NOTE(review): DITHER only receives a default when TRINKET_SCALE was not
  // already defined, because both defaults sit inside the same #ifndef. A build
  // that predefines TRINKET_SCALE must define DITHER itself, otherwise the
  // "#if (DITHER==1)" test further down evaluates false.
  9. #ifndef TRINKET_SCALE
  10. #define TRINKET_SCALE 1
  11. // whether or not to use dithering
  12. #define DITHER 1
  13. #endif
  // FASTLED_SLOW_CLOCK_ADJUST is prepended to the HI1 macro. Both branches
  // currently expand to nothing: the "mov r0,r0" pad for 8MHz parts is commented out.
  14. #if (F_CPU==8000000)
  15. #define FASTLED_SLOW_CLOCK_ADJUST // asm __volatile__ ("mov r0,r0\n\t");
  16. #else
  17. #define FASTLED_SLOW_CLOCK_ADJUST
  18. #endif
  // 64 divided by the CPU clock in whole MHz (integer division).
  19. #define US_PER_TICK (64 / (F_CPU/1000000))
  20. // Variations on the functions in delay.h - w/a loop var passed in to preserve registers across calls by the optimizer/compiler
  // Forward declaration of the cycle-delay template; _dc_AVR below needs it for its padding call.
  // The parameter carries no `register` specifier: that storage class was removed in C++17
  // and never had any effect on a reference parameter.
  21. template<int CYCLES> inline void _dc(uint8_t & loopvar);
  // Loop-based delay helper.
  //   _LOOP   - value loaded into loopvar (as an LDI immediate) and counted down to zero
  //   PAD     - extra delay emitted first through _dc<PAD>
  //   loopvar - caller-supplied scratch byte used as the loop counter; it is zero on exit
  22. template<int _LOOP, int PAD> __attribute__((always_inline)) inline void _dc_AVR(uint8_t & loopvar) {
  23. _dc<PAD>(loopvar);
  24. // The convolution in here is to ensure that the state of the carry flag coming into the delay loop is preserved
  // BRCS selects one of two identical LDI/DEC/BRNE countdown loops; the copy taken when
  // carry was set finishes with BSET 0 (set SREG bit 0, the carry flag) before L_DONE.
  25. asm __volatile__ ( "BRCS L_PC%=\n\t"
  26. " LDI %[loopvar], %[_LOOP]\n\tL_%=: DEC %[loopvar]\n\t BRNE L_%=\n\tBREQ L_DONE%=\n\t"
  27. "L_PC%=: LDI %[loopvar], %[_LOOP]\n\tLL_%=: DEC %[loopvar]\n\t BRNE LL_%=\n\tBSET 0\n\t"
  28. "L_DONE%=:\n\t"
  29. :
  30. [loopvar] "+a" (loopvar) : [_LOOP] "M" (_LOOP) : );
  31. }
  // Generic delay: any CYCLES value that has no explicit specialization is split into
  // CYCLES/6 countdown-loop iterations (_dc_AVR) plus CYCLES%6 of padding.
  // loopvar is the scratch byte handed to the countdown loop.
  // No `register` specifier on the parameter: it was removed in C++17 and is not part
  // of the function's type, so existing callers and specializations are unaffected.
  32. template<int CYCLES> __attribute__((always_inline)) inline void _dc(uint8_t & loopvar) {
  33. _dc_AVR<CYCLES/6,CYCLES%6>(loopvar);
  34. }
  // Explicit specializations of _dc for -6..20.
  // The parameters carry no `register` specifier (removed in C++17; it is not part of the
  // function type, so these still match the primary template).
  // Zero and negative counts emit nothing.
  35. template<> __attribute__((always_inline)) inline void _dc<-6>(uint8_t & ) {}
  36. template<> __attribute__((always_inline)) inline void _dc<-5>(uint8_t & ) {}
  37. template<> __attribute__((always_inline)) inline void _dc<-4>(uint8_t & ) {}
  38. template<> __attribute__((always_inline)) inline void _dc<-3>(uint8_t & ) {}
  39. template<> __attribute__((always_inline)) inline void _dc<-2>(uint8_t & ) {}
  40. template<> __attribute__((always_inline)) inline void _dc<-1>(uint8_t & ) {}
  41. template<> __attribute__((always_inline)) inline void _dc< 0>(uint8_t & ) {}
  // The two primitives: a register-to-itself mov for 1, a jump to the next instruction for 2.
  42. template<> __attribute__((always_inline)) inline void _dc< 1>(uint8_t & ) {asm __volatile__("mov r0,r0":::);}
  43. template<> __attribute__((always_inline)) inline void _dc< 2>(uint8_t & ) {asm __volatile__("rjmp .+0":::);}
  // 3..10 are sums of smaller specializations.
  44. template<> __attribute__((always_inline)) inline void _dc< 3>(uint8_t & loopvar) { _dc<2>(loopvar); _dc<1>(loopvar); }
  45. template<> __attribute__((always_inline)) inline void _dc< 4>(uint8_t & loopvar) { _dc<2>(loopvar); _dc<2>(loopvar); }
  46. template<> __attribute__((always_inline)) inline void _dc< 5>(uint8_t & loopvar) { _dc<2>(loopvar); _dc<3>(loopvar); }
  47. template<> __attribute__((always_inline)) inline void _dc< 6>(uint8_t & loopvar) { _dc<2>(loopvar); _dc<2>(loopvar); _dc<2>(loopvar);}
  48. template<> __attribute__((always_inline)) inline void _dc< 7>(uint8_t & loopvar) { _dc<4>(loopvar); _dc<3>(loopvar); }
  49. template<> __attribute__((always_inline)) inline void _dc< 8>(uint8_t & loopvar) { _dc<4>(loopvar); _dc<4>(loopvar); }
  50. template<> __attribute__((always_inline)) inline void _dc< 9>(uint8_t & loopvar) { _dc<5>(loopvar); _dc<4>(loopvar); }
  51. template<> __attribute__((always_inline)) inline void _dc<10>(uint8_t & loopvar) { _dc<6>(loopvar); _dc<4>(loopvar); }
  // 11..20 are _dc<10> followed by the remainder.
  52. template<> __attribute__((always_inline)) inline void _dc<11>(uint8_t & loopvar) { _dc<10>(loopvar); _dc<1>(loopvar); }
  53. template<> __attribute__((always_inline)) inline void _dc<12>(uint8_t & loopvar) { _dc<10>(loopvar); _dc<2>(loopvar); }
  54. template<> __attribute__((always_inline)) inline void _dc<13>(uint8_t & loopvar) { _dc<10>(loopvar); _dc<3>(loopvar); }
  55. template<> __attribute__((always_inline)) inline void _dc<14>(uint8_t & loopvar) { _dc<10>(loopvar); _dc<4>(loopvar); }
  56. template<> __attribute__((always_inline)) inline void _dc<15>(uint8_t & loopvar) { _dc<10>(loopvar); _dc<5>(loopvar); }
  57. template<> __attribute__((always_inline)) inline void _dc<16>(uint8_t & loopvar) { _dc<10>(loopvar); _dc<6>(loopvar); }
  58. template<> __attribute__((always_inline)) inline void _dc<17>(uint8_t & loopvar) { _dc<10>(loopvar); _dc<7>(loopvar); }
  59. template<> __attribute__((always_inline)) inline void _dc<18>(uint8_t & loopvar) { _dc<10>(loopvar); _dc<8>(loopvar); }
  60. template<> __attribute__((always_inline)) inline void _dc<19>(uint8_t & loopvar) { _dc<10>(loopvar); _dc<9>(loopvar); }
  61. template<> __attribute__((always_inline)) inline void _dc<20>(uint8_t & loopvar) { _dc<10>(loopvar); _dc<10>(loopvar); }
  // Delay helpers used between the port writes of each bit. They expand to statements
  // and expect a variable named loopvar to be in scope at the point of use.
  // DINTPIN(T,ADJ,PINADJ): delay T-(PINADJ+ADJ) cycles through _dc when that is positive,
  // otherwise _dc<0> (which emits nothing). The expansion already ends in ';'.
  62. #define DINTPIN(T,ADJ,PINADJ) (T-(PINADJ+ADJ)>0) ? _dc<T-(PINADJ+ADJ)>(loopvar) : _dc<0>(loopvar);
  // DINT(T,ADJ): choose PINADJ 1 when AVR_PIN_CYCLES(DATA_PIN) is 1, otherwise 2.
  63. #define DINT(T,ADJ) if(AVR_PIN_CYCLES(DATA_PIN)==1) { DINTPIN(T,ADJ,1) } else { DINTPIN(T,ADJ,2); }
  // _D1/_D2/_D3(ADJ): DINT applied to the T1/T2/T3 template parameters, where ADJ is the
  // number of cycles already spent by the instructions in that slot.
  64. #define _D1(ADJ) DINT(T1,ADJ)
  65. #define _D2(ADJ) DINT(T2,ADJ)
  66. #define _D3(ADJ) DINT(T3,ADJ)
  67. //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  68. //
  69. // Base template for clockless controllers. These controllers have 3 control points in their cycle for each bit. The first point
  70. // is where the line is raised hi. The second point is where the line is dropped low for a zero. The third point is where the
  71. // line is dropped low for a one. T1, T2, and T3 correspond to the timings for those three in clock cycles.
  72. //
  73. //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  // Low 8 bits of the 256ths accumulator that showPixels() carries from one call to the
  // next while correcting MS_COUNTER. Declared only when correction is enabled and
  // FASTLED_ALLOW_INTERRUPTS is 0. Being static in a header, every translation unit
  // that includes this file gets its own copy.
  74. #if (!defined(NO_CORRECTION) || (NO_CORRECTION == 0)) && (FASTLED_ALLOW_INTERRUPTS == 0)
  75. static uint8_t gTimeErrorAccum256ths;
  76. #endif
  // Presumably tested elsewhere to detect that a ClocklessController is available - confirm against users of this flag.
  77. #define FASTLED_HAS_CLOCKLESS 1
  // Clockless LED controller for AVR.
  //   DATA_PIN   - pin number, resolved through FastPin<DATA_PIN>
  //   T1,T2,T3   - per-bit timing, in clock cycles, of the three control points
  //   RGB_ORDER  - colour channel order, forwarded to CPixelLEDController / PixelController
  //   XTRA0      - 0..4; selects how many additional bit slots (repeating bit 0 of the
  //                current byte) are written after the eighth bit of every byte
  //   FLIP       - not referenced anywhere in this class body
  //   WAIT_TIME  - template argument of the CMinWait member used around each frame
  78. template <uint8_t DATA_PIN, int T1, int T2, int T3, EOrder RGB_ORDER = RGB, int XTRA0 = 0, bool FLIP = false, int WAIT_TIME = 10>
  79. class ClocklessController : public CPixelLEDController<RGB_ORDER> {
  // Reject timings too short for the instruction sequence at this clock speed.
  80. static_assert(T1 >= 2 && T2 >= 2 && T3 >= 3, "Not enough cycles - use a higher clock speed");
  81. typedef typename FastPin<DATA_PIN>::port_ptr_t data_ptr_t;
  82. typedef typename FastPin<DATA_PIN>::port_t data_t;
  83. CMinWait<WAIT_TIME> mWait;
  84. public:
  // Configure the data pin as an output.
  85. virtual void init() {
  86. FastPin<DATA_PIN>::setOutput();
  87. }
  // Fixed refresh-rate cap reported to the caller.
  88. virtual uint16_t getMaxRefreshRate() const { return 400; }
  89. protected:
  // Write one frame: wait on mWait, disable interrupts, bit-bang the pixel data with
  // showRGBInternal(), optionally credit the elapsed time to MS_COUNTER, re-enable
  // interrupts and mark mWait.
  // NOTE(review): sei() is called unconditionally, so interrupts end up enabled even if
  // they were disabled on entry.
  90. virtual void showPixels(PixelController<RGB_ORDER> & pixels) {
  91. mWait.wait();
  92. cli();
  93. showRGBInternal(pixels);
  94. // Adjust the timer
  95. #if (!defined(NO_CORRECTION) || (NO_CORRECTION == 0)) && (FASTLED_ALLOW_INTERRUPTS == 0)
  // Nominal time: 24 bits per pixel at T1+T2+T3 cycles per bit.
  96. uint32_t microsTaken = (uint32_t)pixels.size() * (uint32_t)CLKS_TO_MICROS(24 * (T1 + T2 + T3));
  97. // adjust for approximate observed actual runtime (as of January 2015)
  98. // roughly 9.6 cycles per pixel, which is 0.6us/pixel at 16MHz
  99. // microsTaken += nLeds * 0.6 * CLKS_TO_MICROS(16);
  100. microsTaken += scale16by8(pixels.size(),(0.6 * 256) + 1) * CLKS_TO_MICROS(16);
  101. // if less than 1000us, there is NO timer impact,
  102. // this is because the ONE interrupt that might come in while interrupts
  103. // are disabled is queued up, and it will be serviced as soon as
  104. // interrupts are re-enabled.
  105. // This actually should technically also account for the runtime of the
  106. // interrupt handler itself, but we're just not going to worry about that.
  107. if( microsTaken > 1000) {
  108. // Since up to one timer tick will be queued, we don't need
  109. // to adjust the MS_COUNTER for that one.
  110. microsTaken -= 1000;
  111. // Now convert microseconds to 256ths of a millisecond, approximately like this:
  112. // 250ths = (us/4)
  113. // 256ths = 250ths * (263/256);
  114. uint16_t x256ths = microsTaken >> 2;
  115. x256ths += scale16by8(x256ths,7);
  // Add the fraction left over from the previous call, credit the whole part to
  // MS_COUNTER and keep the new fraction for the next call.
  116. x256ths += gTimeErrorAccum256ths;
  117. MS_COUNTER += (x256ths >> 8);
  118. gTimeErrorAccum256ths = x256ths & 0xFF;
  119. }
  // Disabled alternative; it refers to nLeds, which is not declared in this function.
  120. #if 0
  121. // For pixel counts of 30 and under at 16Mhz, no correction is necessary.
  122. // For pixel counts of 15 and under at 8Mhz, no correction is necessary.
  123. //
  124. // This code, below, is smaller, and quicker clock correction, which drifts much
  125. // more significantly, but is a few bytes smaller. Presented here for consideration
  126. // as an alternate on the ATtiny, which can't have more than about 150 pixels MAX
  127. // anyway, meaning that microsTaken will never be more than about 4,500, which fits in
  128. // a 16-bit variable. The difference between /1000 and /1024 only starts showing
  129. // up in the range of about 100 pixels, so many ATtiny projects won't even
  130. // see a clock difference due to the approximation there.
  131. uint16_t microsTaken = (uint32_t)nLeds * (uint32_t)CLKS_TO_MICROS((24) * (T1 + T2 + T3));
  132. MS_COUNTER += (microsTaken >> 10);
  133. #endif
  134. #endif
  135. sei();
  136. mWait.mark();
  137. }
  138. #define USE_ASM_MACROS
  139. // The variables that our various asm statements use. The same block of variables needs to be declared for
  140. // all the asm blocks because GCC is pretty stupid and it would clobber variables happily or optimize code away too aggressively
  // ASM_VARS is appended after the instruction string of every asm statement below, so it
  // supplies the complete operand list: outputs, then inputs, then clobbers.
  //   read/write: count ("x" pair), data ("z" pair - the load macros address it as Z),
  //               b1, loopvar, scale_base ("a" registers), d0..d2 (any register)
  //   read-only:  ADV (advanceBy), b0, hi, lo, s0..s2, e0..e2
  //   constants:  PORT (port address minus 0x20, used as the operand of `out`) and the
  //               O0..O2 byte offsets taken from RGB_ORDER
  // NOTE(review): b0 is declared input-only here, yet several macros name it as their
  // destination (e.g. MOV1(b0,b1)); confirm this is intentional.
  141. #define ASM_VARS : /* write variables */ \
  142. [count] "+x" (count), \
  143. [data] "+z" (data), \
  144. [b1] "+a" (b1), \
  145. [d0] "+r" (d0), \
  146. [d1] "+r" (d1), \
  147. [d2] "+r" (d2), \
  148. [loopvar] "+a" (loopvar), \
  149. [scale_base] "+a" (scale_base) \
  150. : /* use variables */ \
  151. [ADV] "r" (advanceBy), \
  152. [b0] "a" (b0), \
  153. [hi] "r" (hi), \
  154. [lo] "r" (lo), \
  155. [s0] "r" (s0), \
  156. [s1] "r" (s1), \
  157. [s2] "r" (s2), \
  158. [e0] "r" (e0), \
  159. [e1] "r" (e1), \
  160. [e2] "r" (e2), \
  161. [PORT] "M" (FastPin<DATA_PIN>::port()-0x20), \
  162. [O0] "M" (RGB_BYTE0(RGB_ORDER)), \
  163. [O1] "M" (RGB_BYTE1(RGB_ORDER)), \
  164. [O2] "M" (RGB_BYTE2(RGB_ORDER)) \
  165. : "cc" /* clobber registers */
  // Port-write and load primitives. Each name ends in the cycle count it is budgeted for.
  166. // Note: the code in the else in HI1/LO1 will be turned into an sts (2 cycle, 2 word) opcode
  // HI1/LO1 use `out` when the port address minus 0x20 is below 64, and otherwise a
  // plain store through the port pointer.
  167. // 1 cycle, write hi to the port
  168. #define HI1 FASTLED_SLOW_CLOCK_ADJUST if((int)(FastPin<DATA_PIN>::port())-0x20 < 64) { asm __volatile__("out %[PORT], %[hi]" ASM_VARS ); } else { *FastPin<DATA_PIN>::port()=hi; }
  169. // 1 cycle, write lo to the port
  170. #define LO1 if((int)(FastPin<DATA_PIN>::port())-0x20 < 64) { asm __volatile__("out %[PORT], %[lo]" ASM_VARS ); } else { *FastPin<DATA_PIN>::port()=lo; }
  171. // 2 cycles, sbrs on flipping the line to lo if we're pushing out a 0
  // sbrs skips the next instruction when bit N of B is set, so the LO1 that follows only
  // executes for a 0 bit. NOTE(review): the sbrs and the LO1 are separate statements, so
  // this relies on the compiler emitting nothing between them.
  172. #define QLO2(B, N) asm __volatile__("sbrs %[" #B "], " #N ASM_VARS ); LO1;
  173. // load a byte from ram into the given var with the given offset
  174. #define LD2(B,O) asm __volatile__("ldd %[" #B "], Z + %[" #O "]\n\t" ASM_VARS );
  175. // 4 cycles - load a byte from ram into the scaling scratch space with the given offset, clear the target var, clear carry
  176. #define LDSCL4(B,O) asm __volatile__("ldd %[scale_base], Z + %[" #O "]\n\tclr %[" #B "]\n\tclc\n\t" ASM_VARS );
  // Dither prescale steps. With DITHER==1 the dither value D is added to scale_base
  // (skipped by cpse when scale_base is zero) and the sum saturates to 0xFF on carry.
  // Without dithering the same names become pure delays of the same length, so the
  // bit timing is identical either way.
  177. #if (DITHER==1)
  178. // apply dithering value before we do anything with scale_base
  179. #define PRESCALE4(D) asm __volatile__("cpse %[scale_base], __zero_reg__\n\t add %[scale_base],%[" #D "]\n\tbrcc L_%=\n\tldi %[scale_base], 0xFF\n\tL_%=:\n\t" ASM_VARS);
  180. // Do the add for the prescale
  181. #define PRESCALEA2(D) asm __volatile__("cpse %[scale_base], __zero_reg__\n\t add %[scale_base],%[" #D "]\n\t" ASM_VARS);
  182. // Do the clamp for the prescale, clear carry when we're done - NOTE: Must ensure carry flag state is preserved!
  // Also negates D, the first half of the per-pixel dither update.
  183. #define PRESCALEB4(D) asm __volatile__("brcc L_%=\n\tldi %[scale_base], 0xFF\n\tL_%=:\n\tneg %[" #D "]\n\tCLC" ASM_VARS);
  184. // Clamp for prescale, increment data, since we won't ever wrap 65k, this also effectively clears carry for us
  // The D argument is unused in this variant; data advances by ADV with a 16-bit add.
  185. #define PSBIDATA4(D) asm __volatile__("brcc L_%=\n\tldi %[scale_base], 0xFF\n\tL_%=:\n\tadd %A[data], %[ADV]\n\tadc %B[data], __zero_reg__\n\t" ASM_VARS);
  186. #else
  187. #define PRESCALE4(D) _dc<4>(loopvar);
  188. #define PRESCALEA2(D) _dc<2>(loopvar);
  189. #define PRESCALEB4(D) _dc<4>(loopvar);
  // Only the pointer increment remains, padded with a 2-cycle rjmp.
  190. #define PSBIDATA4(D) asm __volatile__( "add %A[data], %[ADV]\n\tadc %B[data], __zero_reg__\n\trjmp .+0\n\t" ASM_VARS );
  191. #endif
  // Shift-and-add scaling steps. The digit in each name (0/1/2) selects which scale
  // register (s0/s1/s2) is tested; B is the byte being built and N the scale bit.
  // Macros with a leading underscore are bare instruction strings for composing;
  // the others are complete asm statements.
  192. // 2 cycles - perform one step of the scaling (if a given bit is set in scale, add scale-base to the scratch space)
  193. #define _SCALE02(B, N) "sbrc %[s0], " #N "\n\tadd %[" #B "], %[scale_base]\n\t"
  194. #define _SCALE12(B, N) "sbrc %[s1], " #N "\n\tadd %[" #B "], %[scale_base]\n\t"
  195. #define _SCALE22(B, N) "sbrc %[s2], " #N "\n\tadd %[" #B "], %[scale_base]\n\t"
  196. #define SCALE02(B,N) asm __volatile__( _SCALE02(B,N) ASM_VARS );
  197. #define SCALE12(B,N) asm __volatile__( _SCALE12(B,N) ASM_VARS );
  198. #define SCALE22(B,N) asm __volatile__( _SCALE22(B,N) ASM_VARS );
  199. // 1 cycle - rotate right, pulling in from carry
  200. #define _ROR1(B) "ror %[" #B "]\n\t"
  201. #define ROR1(B) asm __volatile__( _ROR1(B) ASM_VARS);
  202. // 1 cycle, clear the carry bit
  203. #define _CLC1 "clc\n\t"
  204. #define CLC1 asm __volatile__( _CLC1 ASM_VARS );
  205. // 2 cycles, rotate right, pulling in from carry then clear the carry bit
  206. #define RORCLC2(B) asm __volatile__( _ROR1(B) _CLC1 ASM_VARS );
  207. // 4 cycles, rotate, clear carry, scale next bit
  208. #define RORSC04(B, N) asm __volatile__( _ROR1(B) _CLC1 _SCALE02(B, N) ASM_VARS );
  209. #define RORSC14(B, N) asm __volatile__( _ROR1(B) _CLC1 _SCALE12(B, N) ASM_VARS );
  210. #define RORSC24(B, N) asm __volatile__( _ROR1(B) _CLC1 _SCALE22(B, N) ASM_VARS );
  211. // 4 cycles, scale bit, rotate, clear carry
  212. #define SCROR04(B, N) asm __volatile__( _SCALE02(B,N) _ROR1(B) _CLC1 ASM_VARS );
  213. #define SCROR14(B, N) asm __volatile__( _SCALE12(B,N) _ROR1(B) _CLC1 ASM_VARS );
  214. #define SCROR24(B, N) asm __volatile__( _SCALE22(B,N) _ROR1(B) _CLC1 ASM_VARS );
  215. /////////////////////////////////////////////////////////////////////////////////////
  216. // Loop life cycle
  217. // dither adjustment macro - should be kept in sync w/what's in stepDithering
  218. // #define ADJDITHER2(D, E) D = E - D;
  // D = E - D is done as a negate followed by an add: D = (-D) + E.
  219. #define _NEGD1(D) "neg %[" #D "]\n\t"
  220. #define _ADJD1(D,E) "add %[" #D "], %[" #E "]\n\t"
  221. #define ADJDITHER2(D, E) asm __volatile__ ( _NEGD1(D) _ADJD1(D, E) ASM_VARS);
  222. #define ADDDE1(D, E) asm __volatile__ ( _ADJD1(D, E) ASM_VARS );
  223. // #define xstr(a) str(a)
  224. // #define str(a) #a
  225. // #define ADJDITHER2(D,E) asm __volatile__("subi %[" #D "], " xstr(DUSE) "\n\tand %[" #D "], %[" #E "]\n\t" ASM_VARS);
  226. // define the beginning of the loop
  // Numeric local label 1; the loop-end macros jump back to it with "1b".
  227. #define LOOP asm __volatile__("1:" ASM_VARS );
  228. // define the end of the loop
  229. #define DONE asm __volatile__("2:" ASM_VARS );
  230. // 2 cycles - increment the data pointer
  231. #define IDATA2 asm __volatile__("add %A[data], %[ADV]\n\tadc %B[data], __zero_reg__\n\t" ASM_VARS );
  // 3 cycles - increment the data pointer, then clear carry
  232. #define IDATACLC3 asm __volatile__("add %A[data], %[ADV]\n\tadc %B[data], __zero_reg__\n\t" _CLC1 ASM_VARS );
  233. // 1 cycle mov
  234. #define _MOV1(B1, B2) "mov %[" #B1 "], %[" #B2 "]\n\t"
  235. #define MOV1(B1, B2) asm __volatile__( _MOV1(B1,B2) ASM_VARS );
  236. // 3 cycle mov - skip if scale fix is happening
  // With FASTLED_SCALE8_FIXED, B1 first receives scale_base (the unscaled byte); cpse then
  // skips the overwrite with the scaled value B2 when the scale register is zero.
  // showRGBInternal increments s0..s2 under the same flag, so a zero there corresponds
  // to an original scale of 255.
  237. #if (FASTLED_SCALE8_FIXED == 1)
  238. #define _MOV_FIX03(B1, B2) "mov %[" #B1 "], %[scale_base]\n\tcpse %[s0], __zero_reg__\n\t" _MOV1(B1, B2)
  239. #define _MOV_FIX13(B1, B2) "mov %[" #B1 "], %[scale_base]\n\tcpse %[s1], __zero_reg__\n\t" _MOV1(B1, B2)
  240. #define _MOV_FIX23(B1, B2) "mov %[" #B1 "], %[scale_base]\n\tcpse %[s2], __zero_reg__\n\t" _MOV1(B1, B2)
  241. #else
  242. // if we haven't fixed scale8, just do the move and nop the 2 cycles that would be used to
  243. // do the fixed adjustment
  244. #define _MOV_FIX03(B1, B2) _MOV1(B1, B2) "rjmp .+0\n\t"
  245. #define _MOV_FIX13(B1, B2) _MOV1(B1, B2) "rjmp .+0\n\t"
  246. #define _MOV_FIX23(B1, B2) _MOV1(B1, B2) "rjmp .+0\n\t"
  247. #endif
  248. // 3 cycle mov + negate D for dither adjustment
  // (4 cycles in total: the 3-cycle mov above plus a 1-cycle neg or add on D.)
  249. #define MOV_NEGD04(B1, B2, D) asm __volatile( _MOV_FIX03(B1, B2) _NEGD1(D) ASM_VARS );
  250. #define MOV_ADDDE04(B1, B2, D, E) asm __volatile( _MOV_FIX03(B1, B2) _ADJD1(D, E) ASM_VARS );
  251. #define MOV_NEGD14(B1, B2, D) asm __volatile( _MOV_FIX13(B1, B2) _NEGD1(D) ASM_VARS );
  252. #define MOV_ADDDE14(B1, B2, D, E) asm __volatile( _MOV_FIX13(B1, B2) _ADJD1(D, E) ASM_VARS );
  253. #define MOV_NEGD24(B1, B2, D) asm __volatile( _MOV_FIX23(B1, B2) _NEGD1(D) ASM_VARS );
  // Loop-control primitives. Labels 1 and 2 are the numeric local labels placed by the
  // LOOP and DONE macros.
  254. // 2 cycles - decrement the counter
  255. #define DCOUNT2 asm __volatile__("sbiw %[count], 1" ASM_VARS );
  256. // 2 cycles - jump to the beginning of the loop
  257. #define JMPLOOP2 asm __volatile__("rjmp 1b" ASM_VARS );
  258. // 2 cycles - jump out of the loop
  // Continue at local label 3 while the zero flag is clear, otherwise jump forward to
  // label 2. A numeric local label must be referenced with an f/b suffix: "3f" names the
  // next "3:" definition, whereas a bare "3" is just the constant three.
  259. #define BRLOOP1 asm __volatile__("brne 3f\n\trjmp 2f\n\t3:" ASM_VARS );
  260. // 5 cycles 2 sbiw, 3 for the breq/rjmp
  // Decrement count; fall out through the local L_ label when it reaches zero, otherwise
  // jump back to label 1.
  261. #define ENDLOOP5 asm __volatile__("sbiw %[count], 1\n\tbreq L_%=\n\trjmp 1b\n\tL_%=:\n\t" ASM_VARS);
  262. // NOP using the variables, forcing a move
  263. #define DNOP asm __volatile__("mov r0,r0" ASM_VARS);
  // DADVANCE/DUSE are only referenced by the commented-out ADJDITHER2 variant in this file.
  264. #define DADVANCE 3
  265. #define DUSE (0xFF - (DADVANCE-1))
  266. // Silence compiler warnings about switch/case that is explicitly intended to fall through.
  267. //#define FL_FALLTHROUGH __attribute__ ((fallthrough));
  // showRGBInternal: bit-bang every pixel in `pixels` out of DATA_PIN.
  // It copies the values the asm macros need out of the PixelController (data pointer,
  // length, advance, per-channel scale, dither d/e values) into locals, scales the first
  // byte up front, then runs one loop iteration per pixel. While the 8 bits of the
  // current byte (b0) are written, the next byte (b1) is loaded, dithered and scaled in
  // the gaps between port writes.
  // showPixels() calls this with interrupts disabled.
  // NOTE(review): the loop body runs before count is decremented and tested, and no
  // guard for pixels.mLen == 0 is visible here - confirm callers never pass an empty set.
  268. // This method is made static to force making register Y available to use for data on AVR - if the method is non-static, then
  269. // gcc will use register Y for the this pointer.
  270. static void /*__attribute__((optimize("O0")))*/ /*__attribute__ ((always_inline))*/ showRGBInternal(PixelController<RGB_ORDER> & pixels) {
  271. uint8_t *data = (uint8_t*)pixels.mData;
  272. data_ptr_t port = FastPin<DATA_PIN>::port();
  273. data_t mask = FastPin<DATA_PIN>::mask();
  274. uint8_t scale_base = 0;
  275. // register uint8_t *end = data + nLeds;
  // Port values with the data pin set / cleared, sampled once; the line starts low.
  276. data_t hi = *port | mask;
  277. data_t lo = *port & ~mask;
  278. *port = lo;
  279. // the byte currently being written out
  280. uint8_t b0 = 0;
  281. // the byte currently being worked on to write the next out
  282. uint8_t b1 = 0;
  283. // Setup the pixel controller
  284. pixels.preStepFirstByteDithering();
  285. // pull the dithering/adjustment values out of the pixels object for direct asm access
  286. uint8_t advanceBy = pixels.advanceBy();
  287. uint16_t count = pixels.mLen;
  288. uint8_t s0 = pixels.mScale.raw[RO(0)];
  289. uint8_t s1 = pixels.mScale.raw[RO(1)];
  290. uint8_t s2 = pixels.mScale.raw[RO(2)];
  // With the fixed scale8, the scale values are used as scale+1; 255 wraps to 0 in uint8_t.
  291. #if (FASTLED_SCALE8_FIXED==1)
  292. s0++; s1++; s2++;
  293. #endif
  294. uint8_t d0 = pixels.d[RO(0)];
  295. uint8_t d1 = pixels.d[RO(1)];
  296. uint8_t d2 = pixels.d[RO(2)];
  297. uint8_t e0 = pixels.e[RO(0)];
  298. uint8_t e1 = pixels.e[RO(1)];
  299. uint8_t e2 = pixels.e[RO(2)];
  300. uint8_t loopvar=0;
  301. // This has to be done in asm to keep gcc from messing up the asm code further down
  302. b0 = data[RO(0)];
  // Dither and scale the first byte before the loop, using the same macro sequence the
  // loop uses for later bytes but without any port writes in between.
  303. {
  304. LDSCL4(b0,O0) PRESCALEA2(d0)
  305. PRESCALEB4(d0) SCALE02(b0,0)
  306. RORSC04(b0,1) ROR1(b0) CLC1
  307. SCROR04(b0,2) SCALE02(b0,3)
  308. RORSC04(b0,4) ROR1(b0) CLC1
  309. SCROR04(b0,5) SCALE02(b0,6)
  310. RORSC04(b0,7) ROR1(b0) CLC1
  311. MOV_ADDDE04(b1,b0,d0,e0)
  312. MOV1(b0,b1)
  313. }
  314. {
  315. // while(--count)
  316. {
  317. // Loop beginning
  318. DNOP;
  319. LOOP;
  320. // Sum of the clock counts across each row should be 10 for 8Mhz, WS2811
  321. // The values in the D1/D2/D3 indicate how many cycles the previous column takes
  322. // to allow things to line back up.
  323. //
  324. // While writing out byte 0, we're loading up byte 1, applying the dithering adjustment,
  325. // then scaling it using 8 cycles of shift/add interleaved in between writing the bits
  326. // out. When doing byte 1, we're doing the above for byte 2. When we're doing byte 2,
  327. // we're cycling back around and doing the above for byte 0.
  328. // Inline scaling - RGB ordering
  329. // DNOP
  // Each row below is one output bit: raise the line, conditionally drop it early
  // (QLO2) for a 0, do a slice of the next byte's work, drop the line, pad to T3.
  // First byte out (bits 7..0 of b0) while the second byte is prepared in b1.
  330. HI1 _D1(1) QLO2(b0, 7) LDSCL4(b1,O1) _D2(4) LO1 PRESCALEA2(d1) _D3(2)
  331. HI1 _D1(1) QLO2(b0, 6) PRESCALEB4(d1) _D2(4) LO1 SCALE12(b1,0) _D3(2)
  332. HI1 _D1(1) QLO2(b0, 5) RORSC14(b1,1) _D2(4) LO1 RORCLC2(b1) _D3(2)
  333. HI1 _D1(1) QLO2(b0, 4) SCROR14(b1,2) _D2(4) LO1 SCALE12(b1,3) _D3(2)
  334. HI1 _D1(1) QLO2(b0, 3) RORSC14(b1,4) _D2(4) LO1 RORCLC2(b1) _D3(2)
  335. HI1 _D1(1) QLO2(b0, 2) SCROR14(b1,5) _D2(4) LO1 SCALE12(b1,6) _D3(2)
  336. HI1 _D1(1) QLO2(b0, 1) RORSC14(b1,7) _D2(4) LO1 RORCLC2(b1) _D3(2)
  337. HI1 _D1(1) QLO2(b0, 0)
  // XTRA0 extra bit slots, each repeating bit 0; the cases fall through on purpose.
  338. switch(XTRA0) {
  339. case 4: _D2(0) LO1 _D3(0) HI1 _D1(1) QLO2(b0,0) /* fall through */
  340. case 3: _D2(0) LO1 _D3(0) HI1 _D1(1) QLO2(b0,0) /* fall through */
  341. case 2: _D2(0) LO1 _D3(0) HI1 _D1(1) QLO2(b0,0) /* fall through */
  342. case 1: _D2(0) LO1 _D3(0) HI1 _D1(1) QLO2(b0,0)
  343. }
  // Swap in the prepared byte and finish the dither update for d1.
  344. MOV_ADDDE14(b0,b1,d1,e1) _D2(4) LO1 _D3(0)
  // Second byte out while the third is prepared; PSBIDATA4 also advances the data pointer.
  345. HI1 _D1(1) QLO2(b0, 7) LDSCL4(b1,O2) _D2(4) LO1 PRESCALEA2(d2) _D3(2)
  346. HI1 _D1(1) QLO2(b0, 6) PSBIDATA4(d2) _D2(4) LO1 SCALE22(b1,0) _D3(2)
  347. HI1 _D1(1) QLO2(b0, 5) RORSC24(b1,1) _D2(4) LO1 RORCLC2(b1) _D3(2)
  348. HI1 _D1(1) QLO2(b0, 4) SCROR24(b1,2) _D2(4) LO1 SCALE22(b1,3) _D3(2)
  349. HI1 _D1(1) QLO2(b0, 3) RORSC24(b1,4) _D2(4) LO1 RORCLC2(b1) _D3(2)
  350. HI1 _D1(1) QLO2(b0, 2) SCROR24(b1,5) _D2(4) LO1 SCALE22(b1,6) _D3(2)
  351. HI1 _D1(1) QLO2(b0, 1) RORSC24(b1,7) _D2(4) LO1 RORCLC2(b1) _D3(2)
  352. HI1 _D1(1) QLO2(b0, 0)
  353. switch(XTRA0) {
  354. case 4: _D2(0) LO1 _D3(0) HI1 _D1(1) QLO2(b0,0) /* fall through */
  355. case 3: _D2(0) LO1 _D3(0) HI1 _D1(1) QLO2(b0,0) /* fall through */
  356. case 2: _D2(0) LO1 _D3(0) HI1 _D1(1) QLO2(b0,0) /* fall through */
  357. case 1: _D2(0) LO1 _D3(0) HI1 _D1(1) QLO2(b0,0)
  358. }
  359. // Because Prescale on the middle byte also increments the data counter,
  360. // we have to do both halves of updating d2 here - negating it (in the
  361. // MOV_NEGD24 macro) and then adding E back into it
  362. MOV_NEGD24(b0,b1,d2) _D2(4) LO1 ADDDE1(d2,e2) _D3(1)
  // Third byte out while the first byte of the next pixel is prepared (data has
  // already been advanced, so offset O0 now addresses the next pixel).
  363. HI1 _D1(1) QLO2(b0, 7) LDSCL4(b1,O0) _D2(4) LO1 PRESCALEA2(d0) _D3(2)
  364. HI1 _D1(1) QLO2(b0, 6) PRESCALEB4(d0) _D2(4) LO1 SCALE02(b1,0) _D3(2)
  365. HI1 _D1(1) QLO2(b0, 5) RORSC04(b1,1) _D2(4) LO1 RORCLC2(b1) _D3(2)
  366. HI1 _D1(1) QLO2(b0, 4) SCROR04(b1,2) _D2(4) LO1 SCALE02(b1,3) _D3(2)
  367. HI1 _D1(1) QLO2(b0, 3) RORSC04(b1,4) _D2(4) LO1 RORCLC2(b1) _D3(2)
  368. HI1 _D1(1) QLO2(b0, 2) SCROR04(b1,5) _D2(4) LO1 SCALE02(b1,6) _D3(2)
  369. HI1 _D1(1) QLO2(b0, 1) RORSC04(b1,7) _D2(4) LO1 RORCLC2(b1) _D3(2)
  370. HI1 _D1(1) QLO2(b0, 0)
  371. switch(XTRA0) {
  372. case 4: _D2(0) LO1 _D3(0) HI1 _D1(1) QLO2(b0,0) /* fall through */
  373. case 3: _D2(0) LO1 _D3(0) HI1 _D1(1) QLO2(b0,0) /* fall through */
  374. case 2: _D2(0) LO1 _D3(0) HI1 _D1(1) QLO2(b0,0) /* fall through */
  375. case 1: _D2(0) LO1 _D3(0) HI1 _D1(1) QLO2(b0,0)
  376. }
  // _D3(5) leaves room for the 5 cycles ENDLOOP5 spends decrementing count and
  // jumping back to label 1.
  377. MOV_ADDDE04(b0,b1,d0,e0) _D2(4) LO1 _D3(5)
  378. ENDLOOP5
  379. }
  380. DONE;
  381. }
  382. #if (FASTLED_ALLOW_INTERRUPTS == 1)
  383. // stop using the clock juggler
  384. TCCR0A &= ~0x30;
  385. #endif
  386. }
  387. };
  388. #endif
  389. FASTLED_NAMESPACE_END
  390. #endif