PlatformIO package of the Teensy core framework compatible with GCC 10 & C++20
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

390 lines
14KB

  1. #ifndef __INC_M0_CLOCKLESS_H
  2. #define __INC_M0_CLOCKLESS_H
  // Dither/scale state handed to showLedData. The inline assembly there reaches
  // the members through fixed byte offsets from the struct base, so the member
  // order and sizes must stay exactly as they are.
  3. struct M0ClocklessData {
  4. uint8_t d[3]; // offsets 0-2: dither bytes; read by loaddither7, rewritten as e - d by adjdither7
  5. uint8_t e[3]; // offsets 3-5: the "e" bytes adjdither7 loads to compute the next d
  6. uint8_t adj; // offset 6: byte count incleds3 adds to the leds pointer after each pixel
  7. uint8_t pad; // offset 7: not referenced by showLedData; presumably keeps s[] 4-byte aligned
  8. uint32_t s[3]; // offsets 8-19: 32-bit multipliers loaded by scale4; incremented on entry when FASTLED_SCALE8_FIXED == 1
  9. };
  // Bit-bangs num_leds pixels of three bytes each through a single port using
  // cycle-counted inline assembly. Every byte is dithered, scaled and then
  // shifted out most significant bit first.
  //
  // Template parameters:
  //   HI_OFFSET, LO_OFFSET - immediate offsets from _port; _bitmask is stored at
  //                          HI_OFFSET to start a bit (HI2) and at LO_OFFSET to
  //                          end it (QLO4 / LO2).
  //   T1, T2, T3           - delay lengths passed to mod_delay for the three
  //                          phases of each bit.
  //   RGB_ORDER            - not referenced directly in this body; presumably
  //                          consumed by the RO() macro, which is defined
  //                          elsewhere - TODO confirm.
  //   WAIT_TIME            - not used in this body.
  //
  // Parameters:
  //   _port    - base address that the HI/LO stores are made relative to.
  //   _bitmask - value written by those stores.
  //   _leds    - first pixel byte; advanced by pData->adj after each pixel.
  //   num_leds - number of pixels to send; also the loop counter.
  //   pData    - dither/scale state. It is modified: s[] is incremented when
  //              FASTLED_SCALE8_FIXED == 1, and adjdither7 rewrites d[].
  //
  // Returns num_leds. The FASTLED_ALLOW_INTERRUPTS branch without SEI_CHK
  // returns 0 instead when more than 45us worth of SysTick ticks passed while
  // interrupts were enabled between two pixels.
  //
  // NOTE(review): num_leds == 0 is not guarded; every branch decrements the
  // counter before testing it, so a zero count would wrap around.
  // NOTE(review): each iteration ends by pre-loading byte 0 of the following
  // pixel, so the last iteration reads past the final pixel - confirm the
  // caller's buffer tolerates that.
  10. template<int HI_OFFSET, int LO_OFFSET, int T1, int T2, int T3, EOrder RGB_ORDER, int WAIT_TIME>int
  11. showLedData(volatile uint32_t *_port, uint32_t _bitmask, const uint8_t *_leds, uint32_t num_leds, struct M0ClocklessData *pData) {
  12. // Lo register variables
  13. register uint32_t scratch=0; // temporary for the delay loops and for loads inside the helper macros
  14. register struct M0ClocklessData *base = pData;
  15. register volatile uint32_t *port = _port;
  16. register uint32_t d=0; // dither value for the byte being prepared
  17. register uint32_t counter=num_leds; // pixels left to send
  18. register uint32_t bn=0; // next byte, being prepared while b is shifted out
  19. register uint32_t b=0; // byte currently being shifted out, held in the top bits
  20. register uint32_t bitmask = _bitmask;
  21. // high register variable
  22. register const uint8_t *leds = _leds;
  // Bump every scale factor by one in the caller's struct before sending.
  // NOTE(review): presumably this implements the "fixed" scale8 form that
  // multiplies by (scale + 1) - confirm against the scale8 definition.
  23. #if (FASTLED_SCALE8_FIXED == 1)
  24. pData->s[0]++;
  25. pData->s[1]++;
  26. pData->s[2]++;
  27. #endif
  // This first asm statement emits no instructions of its own; it only defines
  // assembler macros. The .ifnotdef guard on fl_delay_def makes the definitions
  // happen once per assembly file even if this function is instantiated
  // several times.
  28. asm __volatile__ (
  29. ///////////////////////////////////////////////////////////////////////////
  30. //
  31. // asm macro definitions - used to assemble the clockless output
  32. //
  33. ".ifnotdef fl_delay_def;"
  // m0pad expands to one nop on the M0+ and to nothing on the M0.
  34. #ifdef FASTLED_ARM_M0_PLUS
  35. " .set fl_is_m0p, 1;"
  36. " .macro m0pad;"
  37. " nop;"
  38. " .endm;"
  39. #else
  40. " .set fl_is_m0p, 0;"
  41. " .macro m0pad;"
  42. " .endm;"
  43. #endif
  44. " .set fl_delay_def, 1;"
  45. " .set fl_delay_mod, 4;"
  46. " .if fl_is_m0p == 1;"
  47. " .set fl_delay_mod, 3;"
  48. " .endif;"
  // fl_delay: burn dtime cycles. The remainder (dtime mod fl_delay_mod) is
  // emitted as nops; the rest is a count-down loop of dtime / fl_delay_mod
  // iterations in reg. fl_delay_mod is 4 on the M0 and 3 on the M0+,
  // presumably the cycle cost of one sub/bne iteration on each core - TODO
  // confirm against the core's instruction timings.
  49. " .macro fl_delay dtime, reg=r0;"
  50. " .if (\\dtime > 0);"
  51. " .set dcycle, (\\dtime / fl_delay_mod);"
  52. " .set dwork, (dcycle * fl_delay_mod);"
  53. " .set drem, (\\dtime - dwork);"
  54. " .rept (drem);"
  55. " nop;"
  56. " .endr;"
  57. " .if dcycle > 0;"
  58. " mov \\reg, #dcycle;"
  59. " delayloop_\\@:;"
  60. " sub \\reg, #1;"
  61. " bne delayloop_\\@;"
  62. " .if fl_is_m0p == 0;"
  63. " nop;"
  64. " .endif;"
  65. " .endif;"
  66. " .endif;"
  67. " .endm;"
  // mod_delay: delay for dtime minus (b1 + b2) cycles; emits nothing when
  // b1 + b2 is already dtime or more. b1 and b2 are presumably the cycles
  // spent by the instructions surrounding the delay - TODO confirm.
  68. " .macro mod_delay dtime,b1,b2,reg;"
  69. " .set adj, (\\b1 + \\b2);"
  70. " .if adj < \\dtime;"
  71. " .set dtime2, (\\dtime - adj);"
  72. " fl_delay dtime2, \\reg;"
  73. " .endif;"
  74. " .endm;"
  75. // check the bit and drop the line low if it isn't set
  76. " .macro qlo4 b,bitmask,port,loff ;"
  77. " lsl \\b, #1 ;"
  78. " bcs skip_\\@ ;"
  79. " str \\bitmask, [\\port, \\loff] ;"
  80. " skip_\\@: ;"
  81. " m0pad;"
  82. " .endm ;"
  83. // set the pin hi or low (determined by the offset passed in )
  84. " .macro qset2 bitmask,port,loff;"
  85. " str \\bitmask, [\\port, \\loff];"
  86. " m0pad;"
  87. " .endm;"
  88. // Load up the next led byte to work with, put it in bn
  // (leds is copied into scratch first and the load is done through scratch)
  89. " .macro loadleds3 leds, bn, rled, scratch;"
  90. " mov \\scratch, \\leds;"
  91. " ldrb \\bn, [\\scratch, \\rled];"
  92. " .endm;"
  93. // check whether or not we should dither
  94. " .macro loaddither7 bn,d,base,rdither;"
  95. " ldrb \\d, [\\base, \\rdither];"
  96. " lsl \\d, #24;" //; shift high for the qadd w/bn
  97. " lsl \\bn, #24;" //; shift high for the qadd w/d
  98. " bne chkskip_\\@;" //; if bn==0, clear d;"
  99. " eor \\d, \\d;" //; clear d;"
  100. " m0pad;"
  101. " chkskip_\\@:;"
  102. " .endm;"
  103. // Do the qadd8 for dithering -- there's two versions of this. The m0 version
  104. // takes advantage of the 3 cycle branch to do two things after the branch,
  105. // while keeping timing constant. The m0+, however, branches in 2 cycles, so
  106. // we have to work around that a bit more. This is one of the few times
  107. // where the m0 will actually be _more_ efficient than the m0+
  108. " .macro dither5 bn,d;"
  109. " .syntax unified;"
  110. " .if fl_is_m0p == 0;"
  111. " adds \\bn, \\d;" // do the add
  112. " bcc dither5_1_\\@;"
  113. " mvns \\bn, \\bn;" // set the low 24bits to 1's
  114. " lsls \\bn, \\bn, #24;" // move low 8 bits to the high bits
  115. " dither5_1_\\@:;"
  116. " nop;" // nop to keep timing in line
  117. " .else;"
  118. " adds \\bn, \\d;" // do the add"
  119. " bcc dither5_2_\\@;"
  120. " mvns \\bn, \\bn;" // set the low 24bits to 1's
  121. " dither5_2_\\@:;"
  122. " bcc dither5_3_\\@;"
  123. " lsls \\bn, \\bn, #24;" // move low 8 bits to the high bits
  124. " dither5_3_\\@:;"
  125. " .endif;"
  126. " .syntax divided;"
  127. " .endm;"
  128. // Do our scaling
  129. " .macro scale4 bn, base, scale, scratch;"
  130. " ldr \\scratch, [\\base, \\scale];"
  131. " lsr \\bn, \\bn, #24;" // bring bn back down to its low 8 bits
  132. " mul \\bn, \\scratch;" // do the multiply
  133. " .endm;"
  134. // swap bn into b
  135. " .macro swapbbn1 b,bn;"
  136. " lsl \\b, \\bn, #16;" // put the 8 bits we want for output high
  137. " .endm;"
  138. // adjust the dithering value for the next time around (load e from memory
  139. // to do the math)
  140. " .macro adjdither7 base,d,rled,eoffset,scratch;"
  141. " ldrb \\d, [\\base, \\rled];"
  142. " ldrb \\scratch,[\\base,\\eoffset];" // load e
  143. " .syntax unified;"
  144. " subs \\d, \\scratch, \\d;" // d=e-d
  145. " .syntax divided;"
  146. " strb \\d, [\\base, \\rled];" // save d
  147. " .endm;"
  148. // increment the led pointer (base+6 has what we're incrementing by)
  149. " .macro incleds3 leds, base, scratch;"
  150. " ldrb \\scratch, [\\base, #6];" // load increment
  151. " add \\leds, \\leds, \\scratch;" // update leds pointer
  152. " .endm;"
  153. // compare and loop
  // (decrement the counter and branch back to label until it reaches zero)
  154. " .macro cmploop5 counter,label;"
  155. " .syntax unified;"
  156. " subs \\counter, #1;"
  157. " .syntax divided;"
  158. " beq done_\\@;"
  159. " m0pad;"
  160. " b \\label;"
  161. " done_\\@:;"
  162. " .endm;"
  163. " .endif;"
  164. );
  // Operand lists shared by every asm statement below. The outputs are
  // read-write ("+"): leds uses constraint "h" and the others "l" (the high
  // and low register groups named in the variable comments above). port, base
  // and bitmask are "l" inputs, and all offsets and timings are "I"
  // immediates. The final ':' opens an empty clobber list.
  // NOTE(review): the asm stores through base and port and changes the
  // condition flags, yet neither "memory" nor "cc" is listed as a clobber -
  // confirm this is intentional.
  165. #define M0_ASM_ARGS : \
  166. [leds] "+h" (leds), \
  167. [counter] "+l" (counter), \
  168. [scratch] "+l" (scratch), \
  169. [d] "+l" (d), \
  170. [bn] "+l" (bn), \
  171. [b] "+l" (b) \
  172. : \
  173. [port] "l" (port), \
  174. [base] "l" (base), \
  175. [bitmask] "l" (bitmask), \
  176. [hi_off] "I" (HI_OFFSET), \
  177. [lo_off] "I" (LO_OFFSET), \
  178. [led0] "I" (RO(0)), \
  179. [led1] "I" (RO(1)), \
  180. [led2] "I" (RO(2)), \
  181. [e0] "I" (3+RO(0)), \
  182. [e1] "I" (3+RO(1)), \
  183. [e2] "I" (3+RO(2)), \
  184. [scale0] "I" (4*(2+RO(0))), \
  185. [scale1] "I" (4*(2+RO(1))), \
  186. [scale2] "I" (4*(2+RO(2))), \
  187. [T1] "I" (T1), \
  188. [T2] "I" (T2), \
  189. [T3] "I" (T3) \
  190. :
  191. /////////////////////////////////////////////////////////////////////////
  192. // now for some convenience macros to make building our lines a bit cleaner
  // Each one expands to a string that invokes the matching assembler macro
  // with the named asm operands filled in.
  193. #define LOOP " loop_%=:"
  194. #define HI2 " qset2 %[bitmask], %[port], %[hi_off];"
  195. #define _D1 " mod_delay %c[T1],2,0,%[scratch];"
  196. #define QLO4 " qlo4 %[b],%[bitmask],%[port], %[lo_off];"
  197. #define LOADLEDS3(X) " loadleds3 %[leds], %[bn], %[led" #X "] ,%[scratch];"
  198. #define _D2(ADJ) " mod_delay %c[T2],4," #ADJ ",%[scratch];"
  199. #define LO2 " qset2 %[bitmask], %[port], %[lo_off];"
  200. #define _D3(ADJ) " mod_delay %c[T3],2," #ADJ ",%[scratch];"
  201. #define LOADDITHER7(X) " loaddither7 %[bn], %[d], %[base], %[led" #X "];"
  202. #define DITHER5 " dither5 %[bn], %[d];"
  203. #define SCALE4(X) " scale4 %[bn], %[base], %[scale" #X "], %[scratch];"
  204. #define SWAPBBN1 " swapbbn1 %[b], %[bn];"
  205. #define ADJDITHER7(X) " adjdither7 %[base],%[d],%[led" #X "],%[e" #X "],%[scratch];"
  206. #define INCLEDS3 " incleds3 %[leds],%[base],%[scratch];"
  207. #define CMPLOOP5 " cmploop5 %[counter], loop_%=;"
  208. #define NOTHING ""
  // How to read the tables below: every row emits one output bit. HI2 stores
  // the bitmask at hi_off, QLO4 stores it at lo_off early when the top bit of
  // b is clear, and LO2 stores it at lo_off unconditionally. The macro between
  // QLO4 and _D2 performs one slice of the work that prepares the next byte,
  // and the _D2 argument equals that macro's numeric suffix, so the T2 delay
  // is shortened by the same amount. The last row of each table uses _D3(5),
  // which matches CMPLOOP5 in the all-asm branch; in the two interrupt-enabled
  // branches it presumably stands in for the C loop overhead - TODO confirm.
  209. #if (defined(SEI_CHK) && (FASTLED_ALLOW_INTERRUPTS == 1))
  210. // We're allowing interrupts and have hardware timer support defined -
  211. // track the loop outside the asm code, to allow inserting the interrupt
  212. // overrun checks.
  213. asm __volatile__ (
  214. // pre-load byte 0
  215. LOADLEDS3(0) LOADDITHER7(0) DITHER5 SCALE4(0) ADJDITHER7(0) SWAPBBN1
  216. M0_ASM_ARGS);
  217. do {
  218. asm __volatile__ (
  219. // Write out byte 0, prepping byte 1
  220. HI2 _D1 QLO4 NOTHING _D2(0) LO2 _D3(0)
  221. HI2 _D1 QLO4 LOADLEDS3(1) _D2(3) LO2 _D3(0)
  222. HI2 _D1 QLO4 LOADDITHER7(1) _D2(7) LO2 _D3(0)
  223. HI2 _D1 QLO4 DITHER5 _D2(5) LO2 _D3(0)
  224. HI2 _D1 QLO4 SCALE4(1) _D2(4) LO2 _D3(0)
  225. HI2 _D1 QLO4 ADJDITHER7(1) _D2(7) LO2 _D3(0)
  226. HI2 _D1 QLO4 NOTHING _D2(0) LO2 _D3(0)
  227. HI2 _D1 QLO4 SWAPBBN1 _D2(1) LO2 _D3(0)
  228. // Write out byte 1, prepping byte 2
  229. HI2 _D1 QLO4 NOTHING _D2(0) LO2 _D3(0)
  230. HI2 _D1 QLO4 LOADLEDS3(2) _D2(3) LO2 _D3(0)
  231. HI2 _D1 QLO4 LOADDITHER7(2) _D2(7) LO2 _D3(0)
  232. HI2 _D1 QLO4 DITHER5 _D2(5) LO2 _D3(0)
  233. HI2 _D1 QLO4 SCALE4(2) _D2(4) LO2 _D3(0)
  234. HI2 _D1 QLO4 ADJDITHER7(2) _D2(7) LO2 _D3(0)
  235. HI2 _D1 QLO4 NOTHING _D2(0) LO2 _D3(0)
  236. HI2 _D1 QLO4 SWAPBBN1 _D2(1) LO2 _D3(0)
  237. // Write out byte 2, prepping byte 0
  // NOTE(review): this branch advances the leds pointer in the first row of
  // byte 2, whereas the other two branches do it in the seventh row of byte 1.
  // Either way it happens before LOADLEDS3(0) - confirm the difference is
  // intentional.
  238. HI2 _D1 QLO4 INCLEDS3 _D2(3) LO2 _D3(0)
  239. HI2 _D1 QLO4 LOADLEDS3(0) _D2(3) LO2 _D3(0)
  240. HI2 _D1 QLO4 LOADDITHER7(0) _D2(7) LO2 _D3(0)
  241. HI2 _D1 QLO4 DITHER5 _D2(5) LO2 _D3(0)
  242. HI2 _D1 QLO4 SCALE4(0) _D2(4) LO2 _D3(0)
  243. HI2 _D1 QLO4 ADJDITHER7(0) _D2(7) LO2 _D3(0)
  244. HI2 _D1 QLO4 NOTHING _D2(0) LO2 _D3(0)
  245. HI2 _D1 QLO4 SWAPBBN1 _D2(1) LO2 _D3(5)
  246. M0_ASM_ARGS
  247. );
  // SEI_CHK, INNER_SEI and CLI_CHK are defined elsewhere; presumably they open
  // an interrupt window and run the overrun check mentioned above - TODO
  // confirm.
  248. SEI_CHK; INNER_SEI; --counter; CLI_CHK;
  249. } while(counter);
  250. #elif (FASTLED_ALLOW_INTERRUPTS == 1)
  251. // We're allowing interrupts - track the loop outside the asm code, and
  252. // re-enable interrupts in between each iteration.
  253. asm __volatile__ (
  254. // pre-load byte 0
  255. LOADLEDS3(0) LOADDITHER7(0) DITHER5 SCALE4(0) ADJDITHER7(0) SWAPBBN1
  256. M0_ASM_ARGS);
  257. do {
  258. asm __volatile__ (
  259. // Write out byte 0, prepping byte 1
  260. HI2 _D1 QLO4 NOTHING _D2(0) LO2 _D3(0)
  261. HI2 _D1 QLO4 LOADLEDS3(1) _D2(3) LO2 _D3(0)
  262. HI2 _D1 QLO4 LOADDITHER7(1) _D2(7) LO2 _D3(0)
  263. HI2 _D1 QLO4 DITHER5 _D2(5) LO2 _D3(0)
  264. HI2 _D1 QLO4 SCALE4(1) _D2(4) LO2 _D3(0)
  265. HI2 _D1 QLO4 ADJDITHER7(1) _D2(7) LO2 _D3(0)
  266. HI2 _D1 QLO4 NOTHING _D2(0) LO2 _D3(0)
  267. HI2 _D1 QLO4 SWAPBBN1 _D2(1) LO2 _D3(0)
  268. // Write out byte 1, prepping byte 2
  269. HI2 _D1 QLO4 NOTHING _D2(0) LO2 _D3(0)
  270. HI2 _D1 QLO4 LOADLEDS3(2) _D2(3) LO2 _D3(0)
  271. HI2 _D1 QLO4 LOADDITHER7(2) _D2(7) LO2 _D3(0)
  272. HI2 _D1 QLO4 DITHER5 _D2(5) LO2 _D3(0)
  273. HI2 _D1 QLO4 SCALE4(2) _D2(4) LO2 _D3(0)
  274. HI2 _D1 QLO4 ADJDITHER7(2) _D2(7) LO2 _D3(0)
  275. HI2 _D1 QLO4 INCLEDS3 _D2(3) LO2 _D3(0)
  276. HI2 _D1 QLO4 SWAPBBN1 _D2(1) LO2 _D3(0)
  277. // Write out byte 2, prepping byte 0
  278. HI2 _D1 QLO4 NOTHING _D2(0) LO2 _D3(0)
  279. HI2 _D1 QLO4 LOADLEDS3(0) _D2(3) LO2 _D3(0)
  280. HI2 _D1 QLO4 LOADDITHER7(0) _D2(7) LO2 _D3(0)
  281. HI2 _D1 QLO4 DITHER5 _D2(5) LO2 _D3(0)
  282. HI2 _D1 QLO4 SCALE4(0) _D2(4) LO2 _D3(0)
  283. HI2 _D1 QLO4 ADJDITHER7(0) _D2(7) LO2 _D3(0)
  284. HI2 _D1 QLO4 NOTHING _D2(0) LO2 _D3(0)
  285. HI2 _D1 QLO4 SWAPBBN1 _D2(1) LO2 _D3(5)
  286. M0_ASM_ARGS
  287. );
  // Sample SysTick, open a short interrupt window, then measure how long the
  // window actually lasted.
  288. uint32_t ticksBeforeInterrupts = SysTick->VAL;
  289. sei();
  290. --counter;
  291. cli();
  292. // If more than 45 uSecs have elapsed, give up on this frame and start over.
  293. // Note: this isn't completely correct. It's possible that more than one
  294. // millisecond will elapse, and so SysTick->VAL will lap
  295. // ticksBeforeInterrupts.
  296. // Note: ticksBeforeInterrupts DECREASES
  // NOTE(review): the wrap-around arithmetic below assumes SysTick reloads
  // once per millisecond (period of kTicksPerMs ticks) - confirm against the
  // board's SysTick configuration.
  297. const uint32_t kTicksPerMs = VARIANT_MCK / 1000;
  298. const uint32_t kTicksPerUs = kTicksPerMs / 1000;
  299. const uint32_t kTicksIn45us = kTicksPerUs * 45;
  300. const uint32_t currentTicks = SysTick->VAL;
  301. if (ticksBeforeInterrupts < currentTicks) {
  302. // Timer started over
  303. if ((ticksBeforeInterrupts + (kTicksPerMs - currentTicks)) > kTicksIn45us) {
  304. return 0;
  305. }
  306. } else {
  307. if ((ticksBeforeInterrupts - currentTicks) > kTicksIn45us) {
  308. return 0;
  309. }
  310. }
  311. } while(counter);
  312. #else
  313. // We're not allowing interrupts - run the entire loop in asm to keep things
  314. // as tight as possible. In an ideal world, we should be pushing out ws281x
  315. // leds (or other 3-wire leds) with zero gaps between pixels.
  316. asm __volatile__ (
  317. // pre-load byte 0
  318. LOADLEDS3(0) LOADDITHER7(0) DITHER5 SCALE4(0) ADJDITHER7(0) SWAPBBN1
  319. // loop over writing out the data
  320. LOOP
  321. // Write out byte 0, prepping byte 1
  322. HI2 _D1 QLO4 NOTHING _D2(0) LO2 _D3(0)
  323. HI2 _D1 QLO4 LOADLEDS3(1) _D2(3) LO2 _D3(0)
  324. HI2 _D1 QLO4 LOADDITHER7(1) _D2(7) LO2 _D3(0)
  325. HI2 _D1 QLO4 DITHER5 _D2(5) LO2 _D3(0)
  326. HI2 _D1 QLO4 SCALE4(1) _D2(4) LO2 _D3(0)
  327. HI2 _D1 QLO4 ADJDITHER7(1) _D2(7) LO2 _D3(0)
  328. HI2 _D1 QLO4 NOTHING _D2(0) LO2 _D3(0)
  329. HI2 _D1 QLO4 SWAPBBN1 _D2(1) LO2 _D3(0)
  330. // Write out byte 1, prepping byte 2
  331. HI2 _D1 QLO4 NOTHING _D2(0) LO2 _D3(0)
  332. HI2 _D1 QLO4 LOADLEDS3(2) _D2(3) LO2 _D3(0)
  333. HI2 _D1 QLO4 LOADDITHER7(2) _D2(7) LO2 _D3(0)
  334. HI2 _D1 QLO4 DITHER5 _D2(5) LO2 _D3(0)
  335. HI2 _D1 QLO4 SCALE4(2) _D2(4) LO2 _D3(0)
  336. HI2 _D1 QLO4 ADJDITHER7(2) _D2(7) LO2 _D3(0)
  337. HI2 _D1 QLO4 INCLEDS3 _D2(3) LO2 _D3(0)
  338. HI2 _D1 QLO4 SWAPBBN1 _D2(1) LO2 _D3(0)
  339. // Write out byte 2, prepping byte 0
  340. HI2 _D1 QLO4 NOTHING _D2(0) LO2 _D3(0)
  341. HI2 _D1 QLO4 LOADLEDS3(0) _D2(3) LO2 _D3(0)
  342. HI2 _D1 QLO4 LOADDITHER7(0) _D2(7) LO2 _D3(0)
  343. HI2 _D1 QLO4 DITHER5 _D2(5) LO2 _D3(0)
  344. HI2 _D1 QLO4 SCALE4(0) _D2(4) LO2 _D3(0)
  345. HI2 _D1 QLO4 ADJDITHER7(0) _D2(7) LO2 _D3(0)
  346. HI2 _D1 QLO4 NOTHING _D2(0) LO2 _D3(0)
  347. HI2 _D1 QLO4 SWAPBBN1 _D2(1) LO2 _D3(5) CMPLOOP5
  348. M0_ASM_ARGS
  349. );
  350. #endif
  351. return num_leds; // every pixel was sent
  352. }
  353. #endif