From 979af9a6cbdb8a29ade17d1939ce73feaecbcb51 Mon Sep 17 00:00:00 2001
From: Dave <83719612+DaveJWalker@users.noreply.github.com>
Date: Tue, 10 Sep 2024 18:17:49 -0700
Subject: [PATCH 01/11] Added cordic demo program.

---
 programs/cordic.as    | 410 ++++++++++++++++++++++++++++++++++++++++++
 programs/cordic.mc    | 212 ++++++++++++++++++++++
 programs/cordic.schem | Bin 0 -> 5605 bytes
 3 files changed, 622 insertions(+)
 create mode 100644 programs/cordic.as
 create mode 100644 programs/cordic.mc
 create mode 100644 programs/cordic.schem

diff --git a/programs/cordic.as b/programs/cordic.as
new file mode 100644
index 0000000..cd39d45
--- /dev/null
+++ b/programs/cordic.as
@@ -0,0 +1,410 @@
+// Cordic Demo by Dave Walker
+
+// A basic implementation of a CORDIC function operating in rotation mode.  A CORDIC can be
+// used to iteratively calculate sine and cosine of an angle. Due to the limitations of this
+// 8-bit computer, the CORDIC isn't particularly accurate.  A number of values in this code are
+// represented as fixed point representations.  Therefore, you'll see notations like u2.5 and
+// s1.6.  These notations denote signed/unsigned, the number of integer bits, and the number of
+// fractional bits.  For example, the sine/cosine outputs are all s1.6.
+//
+// In addition to the CORDIC, a draw_line function is included based on Bresenham's Algorithm.
+
+// Memory mapped IO port mapping offsets
+// from memory_mapped_io_addr (248)
+define memory_mapped_io_addr     248
+define pixel_x_offset             -8
+define pixel_y_offset             -7
+define draw_pixel_offset          -6
+define clear_pixel_offset         -5
+define load_pixel_offset          -4
+define buffer_screen_offset       -3
+define clear_screen_buffer_offset -2
+define write_char_offset          -1
+define buffer_chars_offset         0
+define clear_chars_buffer_offset   1
+define show_number_offset          2
+define clear_number_offset         3
+define signed_mode_offset          4
+define unsigned_mode_offset        5
+define rng_offset                  6
+define controller_input_offset     7
+
+// Various RAM addresses
+define x2_coord                    0
+define y2_coord                    1
+define register_stack_pointer      100
+define atan_LUT_strt_addr          232
+
+// Load the arctan LUT into RAM (.cordic reads it, so this must run first).
+CAL .load_atan_lut
+
+// Clear the screen and character display; select unsigned number display mode
+LDI r15 memory_mapped_io_addr
+STR r15 r0 clear_screen_buffer_offset
+STR r15 r0 buffer_screen_offset
+STR r15 r0 unsigned_mode_offset
+STR r15 r0 clear_chars_buffer_offset
+STR r15 r0 buffer_chars_offset
+
+// Write "CORDICDEMO" one character at a time, then push the buffer to the display
+STR r15 r0 clear_chars_buffer_offset
+LDI r14 "C"
+STR r15 r14 write_char_offset
+LDI r14 "O"
+STR r15 r14 write_char_offset
+LDI r14 "R"
+STR r15 r14 write_char_offset
+LDI r14 "D"
+STR r15 r14 write_char_offset
+LDI r14 "I"
+STR r15 r14 write_char_offset
+LDI r14 "C"
+STR r15 r14 write_char_offset
+LDI r14 "D"
+STR r15 r14 write_char_offset
+LDI r14 "E"
+STR r15 r14 write_char_offset
+LDI r14 "M"
+STR r15 r14 write_char_offset
+LDI r14 "O"
+STR r15 r14 write_char_offset
+STR r15 r0 buffer_chars_offset
+
+// Store the initial point (x=31, y=16) in RAM; it is the start of the first line segment
+LDI r15 x2_coord        // base address 0; the defines are reused as offsets below (0 and 1)
+LDI r14 31
+STR r15 r14 x2_coord
+LDI r14 16
+STR r15 r14 y2_coord
+
+// Draw a circle using the CORDIC function
+// as a simple demonstration.  The CORDIC
+// is used to generate points on the circle
+// and lines are drawn between each point.
+
+// Go through angles from 0 to 200 in steps of 10 (0 to 2*pi radians in u2.5)
+LDI r13 0   // Starting angle
+
+.circle_loop
+    MOV r13 r1  // Copy the angle from r13 into r1; r13 keeps the loop's copy since r1 is modified by the CORDIC function
+
+    // Use the CORDIC to calculate sine and cosine of angle (r1); x comes back in r2, y in r3
+    CAL .cordic
+
+    // Scale the s1.6 values down by 4 (sign-preserving shift right by 2) and center on screen (+16);
+    // also move them to r3/r4 (x2/y2) for use in draw_line function
+    LDI r10 128     // Sign bit mask
+    AND r3 r10 r5   // Grab the sign bit for y
+    RSH r3 r4       // y >> 1 into r4
+    ADD r4 r5 r4    // put the sign bit back (RSH shifts in a zero)
+    RSH r4 r4       // y >> 2
+    ADD r4 r5 r4    // put the sign bit back again
+    AND r2 r10 r5   // Grab the sign bit for x
+    RSH r2 r3       // x >> 1 into r3 (the old y in r3 is no longer needed)
+    ADD r3 r5 r3
+    RSH r3 r3       // x >> 2
+    ADD r3 r5 r3
+    ADI r3 16       // x2 = x/4 + 16
+    ADI r4 16       // y2 = y/4 + 16
+    // Grab xy coordinates from previous iteration from RAM; they become x1/y1 of this segment
+    LDI r15 x2_coord
+    LOD r15 r1 x2_coord
+    LOD r15 r2 y2_coord
+
+    // Save the r13 angle value to RAM since r13 gets modified inside the draw_line function
+    LDI r15 register_stack_pointer
+    STR r15 r13
+    CAL .draw_line
+    // And restore it when finished
+    LDI r15 register_stack_pointer
+    LOD r15 r13
+
+    // Store this segment's end point (r3/r4) to RAM so it becomes the start point (x1/y1) next iteration
+    LDI r15 x2_coord
+    STR r15 r3 x2_coord
+    STR r15 r4 y2_coord
+
+    // Display current angle
+    LDI r15 memory_mapped_io_addr
+    STR r15 r13 show_number_offset
+
+    // Increment the angle and loop
+    ADI r13 10
+    LDI r14 201  // Loop while angle < 201, so 200 is the last angle processed
+    CMP r13 r14
+    BRH lt .circle_loop
+HLT
+
+
+.cordic
+// CORDIC function computes sine and cosine of angle.
+// Input:
+//  r1  = angle in radians (fixed point in the form u2.5)
+//        (Values between 0 and 2*pi, i.e. 0 to 200, are supported.)
+// Outputs (both s1.6):
+//  r2  = cosine(r1)  (x)
+//  r3  = sine  (r1)  (y)
+// Register usage (r1-r12 and r14 are all overwritten)
+//  r1  - angle in radians (s1.6)
+//  r2  - x (s1.6)
+//  r3  - y (s1.6)
+//  r4  - iteration counter (i)
+//  r5  - temp iteration counter/scratch
+//  r6  - shifted x
+//  r7  - shifted y
+//  r8  - total iterations
+//  r9  - holds current iteration atan value
+//  r10 - sign bit mask/scratch (r14 holds the sign bit of x while shifting)
+//  r11 - pointer to arctan table
+//  r12 - quadrant flag (determines whether to negate x and/or y result)
+//
+// The input angle comes in the form u2.5 with a
+// range of 0 to 2*pi. CORDICs only work for
+// +pi/2 to -pi/2 angles.  To keep things simple,
+// we'll only operate in one quadrant of the unit
+// circle (0 to +pi/2). For the other quadrants,
+// we'll modify the angle and outputs appropriately.
+
+    LDI r5 50   // Load +pi/2 (1.5708*2^5 = ~50)
+    CMP r5 r1
+    BRH ge .quadrant_0  // pi/2 >= angle
+    LDI r5 100  // Load +pi (3.1416*2^5 = ~100)
+    CMP r5 r1
+    BRH ge .quadrant_1  // pi >= angle
+    LDI r5 150  // Load +3/2*pi (4.712*2^5 = ~150)
+    CMP r5 r1
+    BRH ge .quadrant_2  // 3/2*pi >= angle
+    JMP .quadrant_3
+
+    // For each quadrant, set the quadrant flag, which will be used
+    // at the end to negate the sine/cosine outputs accordingly.
+    // Also, adjust the input angle so all calculations are performed
+    // as if in quadrant 0.
+    .quadrant_0
+        LDI r12 0b00    // Set quadrant flag to leave xy untouched
+        JMP .cordic_setup
+    .quadrant_1
+        LDI r12 0b10    // Set quadrant flag to negate x
+        SUB r5 r1 r1    // angle = pi - angle (r5 still holds 100 here)
+        JMP .cordic_setup
+    .quadrant_2
+        LDI r12 0b11    // Set quadrant flag to negate x&y
+        LDI r5 100      // Load +pi (3.1416*2^5 = ~100)
+        SUB r1 r5 r1    // angle = angle - pi
+        JMP .cordic_setup
+    .quadrant_3
+        LDI r12 0x01    // Set quadrant flag to negate y
+        LDI r5 200      // Load +2*pi (6.2832*2^5 = ~200)
+        SUB r5 r1 r1    // angle = 2*pi - angle
+
+    .cordic_setup
+        LSH r1 r1       // adjust input angle from u2.5 to s1.6; this step is needed because negative angles are needed during CORDIC operation
+        LDI r2 38       // x = 0.6072 (s1.6) = ~38/2^6 (this value has scaling factor K pre-applied)
+        LDI r3 0        // y = 0
+        LDI r4 0        // iteration counter (i)
+        LDI r8 7        // total iterations (one per entry in the atan LUT)
+        LDI r10 128     // angle sign bit mask
+        LDI r11 atan_LUT_strt_addr  // Point to start of atan LUT
+
+        .cordic_loop
+            // Make temporary copies of the i,x,y values for shifting
+            MOV r4 r5
+            MOV r2 r6
+            MOV r3 r7
+
+            // The current computer ALU does not support arithmetic shifting
+            // with RSH instruction, which presents a problem for negative
+            // numbers.  When shifting a negative value right, ones should
+            // get shifted into the sign bit.  That doesn't happen so
+            // negative shifts aren't handled properly.  To get around this problem,
+            // the saved sign bit is ADDed back in after each shift (bit 7 is 0 after RSH).
+            AND r6 r10 r14  // Grab the sign bit for x
+            AND r7 r10 r10  // Grab the sign bit for y (overwrites the mask in r10; it is reloaded after the shift loop)
+            .shift_loop
+                CMP r5 r0       // Shift x and y right by i places in total
+                BRH eq .shift_done
+                RSH r6 r6
+                ADD r6 r14 r6   // Add the sign bit after shifting
+                RSH r7 r7
+                ADD r7 r10 r7   // Add the sign bit after shifting
+                DEC r5
+                JMP .shift_loop
+            .shift_done
+            LOD r11 r9 // Load atan value for current iteration
+
+            // Determine rotation direction from the sign of the remaining angle
+            LDI r10 128   // Sign bit mask
+            AND r1 r10 r0 // Check the sign bit
+            BRH z .positive_rotation
+
+                .negative_rotation // Clockwise
+                ADD r2 r7 r2    // x += y >> i
+                SUB r3 r6 r3    // y -= x >> i (r6 was copied from x before x was updated)
+                ADD r1 r9 r1    // angle += atan(2^-i)
+                JMP .next_iteration
+
+                .positive_rotation // Counter clockwise
+                SUB r2 r7 r2    // x -= y >> i
+                ADD r3 r6 r3    // y += x >> i (r6 was copied from x before x was updated)
+                SUB r1 r9 r1    // angle -= atan(2^-i)
+
+            .next_iteration
+            INC r11     // Point to next atan value
+            INC r4      // Increment iteration (i)
+            CMP r4 r8   // Check against total iterations
+            BRH nz .cordic_loop
+
+        // Adjust xy outputs accordingly based on quadrant
+
+        // Negate x when bit 1 of the quadrant flag is set
+        .check_x_negate
+        LDI r5 0b10
+        AND r12 r5 r0
+        BRH nz .negate_x
+        JMP .check_y_negate
+        .negate_x
+            SUB r0 r2 r2    // x = 0 - x
+
+        .check_y_negate     // Negate y when bit 0 of the quadrant flag is set
+        LDI r5 0b01
+        AND r12 r5 r0
+        BRH nz .negate_y
+        JMP .cordic_done
+        .negate_y
+            SUB r0 r3 r3    // y = 0 - y
+    .cordic_done
+     RET
+
+
+.draw_line
+// Draws a line between two points using Bresenham's
+// Algorithm and then buffers the screen.  Ported to
+// assembly using MattBatWing's python implementation
+// as a guide (a.k.a. I swiped it).
+// Inputs:
+//  r1 = x1 (overwritten: stepped towards x2)
+//  r2 = y1 (overwritten: stepped towards y2)
+//  r3 = x2 (not written by this function)
+//  r4 = y2 (not written by this function)
+// Register Usage (r5-r15 are all overwritten):
+//  r5 = dx = abs(x2 - x1)  (swapped with dy when dx < dy)
+//  r6 = dy = abs(y2 - y1)  (swapped with dx when dx < dy)
+//  r7 = sx = sign(x2 - x1)  (+1 or -1)
+//  r8 = sy = sign(y2 - y1)  (+1 or -1)
+//  r9 = Error = 2*dy - dx
+//  r10 = scratch
+//  r11 = A = 2*dy
+//  r12 = B = 2*dy - 2*dx
+//  r13 = interchange flag; r14 = loop counter i; r15 = IO base address
+
+    // Set sx/sy step values to +1 for positive slope (default)
+    LDI r7 1
+    LDI r8 1
+
+    // Calculate x values dx and sx.
+    .calc_x
+        LDI r10 128       // Sign bit mask
+        SUB r3 r1 r5      // x2 - x1
+        AND r5 r10 r0     // Test the sign of (x2 - x1)
+        BRH nz .negate_x_dl
+        JMP .calc_y
+        .negate_x_dl
+            SUB r0 r5 r5      // dx = abs(x2 - x1)
+            LDI r7 -1         // sx = -1 (negative slope)
+
+    // Calculate y values dy and sy.
+    .calc_y
+        SUB r4 r2 r6      // y2 - y1
+        AND r6 r10 r0     // Test the sign of (y2 - y1)
+        BRH nz .negate_y_dl
+        JMP .calc_interchange
+        .negate_y_dl
+            SUB r0 r6 r6      // dy = abs(y2 - y1)
+            LDI r8 -1         // sy = -1 (negative slope)
+
+    .calc_interchange
+        LDI r13 0         // Set interchange flag to 0 (false)
+        SUB r5 r6 r0      // Is dx or dy greater?
+        BRH ge .calc_err  // If dx >= dy, proceed to calc error, A and B
+        MOV r5 r10        // If dx < dy, swap dx and dy (r10 is the temporary)
+        MOV r6 r5
+        MOV r10 r6
+        LDI r13 1         // and set interchange flag to 1 (true)
+
+    .calc_err
+        LSH r6 r11        // A = 2*dy
+        LSH r5 r12        // 2*dx
+        SUB r0 r12 r12    // -2*dx
+        ADD r11 r12 r12   // B = 2*dy - 2*dx
+        SUB r0 r5 r9      // -dx
+        ADD r11 r9 r9     // Error = 2*dy - dx
+
+    // Draw first pixel (NOTE(review): unlike .draw_pixel below, no range check is applied here)
+    LDI r15 memory_mapped_io_addr
+    STR r15 r1 pixel_x_offset
+    STR r15 r2 pixel_y_offset  // y coordinate
+    STR r15 r0 draw_pixel_offset
+
+    LDI r14 0   // Set i to 0 for loop
+    .draw_line_loop
+        LDI r10 128     // Sign bit mask
+        AND r9 r10 r0   // Is Error < 0?
+        BRH z .error_ge_zero
+        .error_lt_zero
+            ADD r9 r11 r9   // Error += A
+            CMP r13 r0      // Check interchange flag
+            BRH eq .inc_x   // ... and step only along the major axis: x if not interchanged, else y
+            .inc_y
+                ADD r2 r8 r2    // y += sy
+                JMP .draw_pixel
+            .inc_x
+                ADD r1 r7 r1    // x += sx
+                JMP .draw_pixel
+        .error_ge_zero
+            ADD r2 r8 r2    // y += sy
+            ADD r1 r7 r1    // x += sx
+            ADD r9 r12 r9   // Error += B
+
+        .draw_pixel
+            // Only plot when both x and y are below 32; points outside
+            // that range are skipped but the line keeps stepping.
+            LDI r10 32
+            CMP r1 r10
+            BRH ge .next_pixel
+            CMP r2 r10
+            BRH ge .next_pixel
+            STR r15 r1 pixel_x_offset
+            STR r15 r2 pixel_y_offset  // y coordinate
+            STR r15 r0 draw_pixel_offset
+
+        .next_pixel
+            INC r14       // Increment loop counter
+            CMP r14 r5    // Exit loop when i >= dx (dx is the longer axis after the swap)
+            BRH ge .buffer_screen  // NOTE(review): test follows the body, so one step is drawn even when dx == 0
+            JMP .draw_line_loop
+
+    .buffer_screen
+        STR r15 r0 buffer_screen_offset  // push the drawn pixels to the screen
+        RET
+
+
+
+// Load the 7-entry arctangent look-up table into RAM at atan_LUT_strt_addr..+6
+// (one entry per CORDIC iteration); values are in the form s1.6.  Overwrites r14 and r15.
+.load_atan_lut
+    LDI r15 atan_LUT_strt_addr
+    LDI r14 50     // arctan(2^0)  = ~50/2^6
+    STR r15 r14 0
+    LDI r14 30     // arctan(2^-1) = ~30/2^6
+    STR r15 r14 1
+    LDI r14 16     // arctan(2^-2) = ~16/2^6
+    STR r15 r14 2
+    LDI r14 8      // arctan(2^-3) = ~8/2^6
+    STR r15 r14 3
+    LDI r14 4      // arctan(2^-4) = ~4/2^6
+    STR r15 r14 4
+    LDI r14 2      // arctan(2^-5) = ~2/2^6
+    STR r15 r14 5
+    LDI r14 1      // arctan(2^-6) = ~1/2^6
+    STR r15 r14 6
+    RET
diff --git a/programs/cordic.mc b/programs/cordic.mc
new file mode 100644
index 0000000..f983ad6
--- /dev/null
+++ b/programs/cordic.mc
@@ -0,0 +1,212 @@
+1100000011000100
+1000111111111000
+1111111100001110
+1111111100001101
+1111111100000101
+1111111100000001
+1111111100000000
+1111111100000001
+1000111000000011
+1111111111101111
+1000111000001111
+1111111111101111
+1000111000010010
+1111111111101111
+1000111000000100
+1111111111101111
+1000111000001001
+1111111111101111
+1000111000000011
+1111111111101111
+1000111000000100
+1111111111101111
+1000111000000101
+1111111111101111
+1000111000001101
+1111111111101111
+1000111000001111
+1111111111101111
+1111111100000000
+1000111100000000
+1000111000011111
+1111111111100000
+1000111000010000
+1111111111100001
+1000110100000000
+0010110100000001
+1100000001000100
+1000101010000000
+0101001110100101
+0111001100000100
+0010010001010100
+0111010000000100
+0010010001010100
+0101001010100101
+0111001000000011
+0010001101010011
+0111001100000011
+0010001101010011
+1001001100010000
+1001010000010000
+1000111100000000
+1110111100010000
+1110111100100001
+1000111101100100
+1111111111010000
+1100000010001000
+1000111101100100
+1110111111010000
+1000111100000000
+1111111100110000
+1111111101000001
+1000111111111000
+1111111111010010
+1001110100001010
+1000111011001001
+0011110111100000
+1011110000100011
+0001000000000000
+1000010100110010
+0011010100010000
+1011100001001110
+1000010101100100
+0011010100010000
+1011100001010000
+1000010110010110
+0011010100010000
+1011100001010011
+1010000001010111
+1000110000000000
+1010000001011010
+1000110000000010
+0011010100010001
+1010000001011010
+1000110000000011
+1000010101100100
+0011000101010001
+1010000001011010
+1000110000000001
+1000010111001000
+0011010100010001
+0010000100010001
+1000001000100110
+1000001100000000
+1000010000000000
+1000100000000111
+1000101010000000
+1000101111101000
+0010010000000101
+0010001000000110
+0010001100000111
+0101011010101110
+0101011110101010
+0011010100000000
+1011000001101110
+0111011000000110
+0010011011100110
+0111011100000111
+0010011110100111
+1001010111111111
+1010000001100110
+1110101110010000
+1000101010000000
+0101000110100000
+1011000001110110
+0010001001110010
+0011001101100011
+0010000110010001
+1010000001111001
+0011001001110010
+0010001101100011
+0011000110010001
+1001101100000001
+1001010000000001
+0011010010000000
+1011010001100001
+1000010100000010
+0101110001010000
+1011010010000001
+1010000010000010
+0011000000100010
+1000010100000001
+0101110001010000
+1011010010000110
+1010000010000111
+0011000000110011
+1101000000000000
+1000011100000001
+1000100000000001
+1000101010000000
+0011001100010101
+0101010110100000
+1011010010001111
+1010000010010001
+0011000001010101
+1000011111111111
+0011010000100110
+0101011010100000
+1011010010010101
+1010000010010111
+0011000001100110
+1000100011111111
+1000110100000000
+0011010101100000
+1011100010011110
+0010010100001010
+0010011000000101
+0010101000000110
+1000110100000001
+0010011001101011
+0010010101011100
+0011000011001100
+0010101111001100
+0011000001011001
+0010101110011001
+1000111111111000
+1111111100011000
+1111111100101001
+1111111100001010
+1000111000000000
+1000101010000000
+0101100110100000
+1011000010110011
+0010100110111001
+0011110100000000
+1011000010110001
+0010001010000010
+1010000010110110
+0010000101110001
+1010000010110110
+0010001010000010
+0010000101110001
+0010100111001001
+1000101000100000
+0011000110100000
+1011100010111110
+0011001010100000
+1011100010111110
+1111111100011000
+1111111100101001
+1111111100001010
+1001111000000001
+0011111001010000
+1011100011000010
+1010000010101001
+1111111100001101
+1101000000000000
+1000111111101000
+1000111000110010
+1111111111100000
+1000111000011110
+1111111111100001
+1000111000010000
+1111111111100010
+1000111000001000
+1111111111100011
+1000111000000100
+1111111111100100
+1000111000000010
+1111111111100101
+1000111000000001
+1111111111100110
+1101000000000000
diff --git a/programs/cordic.schem b/programs/cordic.schem
new file mode 100644
index 0000000000000000000000000000000000000000..28caac0383e07c8269e02876ee0cf9d017eb5153
GIT binary patch
literal 5605
zcmb_gXH-*Zx5hFe16WW|KyVy{C`|!L6cA8Qh7wSb8brl_gcza}A(SA)5Oiot2qmGY
z7>YE7&`TTvkr0YhO$b3sq$NNCLTLGddWV_y&HZt|@17rL?Q_;y`#k4)_kN!Jo<mXF
zx1*;A-8TzvBE6xon;JehJ=_o?Y({JBLDPGUv(dLNt?Y6ijbP*C1_BfVrZ>4O1`Rq!
z4{q9)JS!@Zd0x8lEvYKP9R{rp^G9zrd<a7Csz?3|I%{#{4AkC@Rr`3iBiZoDCai7R
z<IfGXyHn~5b*(W?Wb~5jLQmrwcl_hb45*r(lmzOlKM{Sa9uVsDio7_BG~mYp>NDH@
zm!od^hXw)l*Vyd>4xg7)y~+#~tS$)FdS?y$<pm3nCP9GzN`^JYaA~C|^x#-ziN36Y
zfXxZ?4|>HsBZn3YA8IGp_LEPMtN5#&6+%){-?enX(gcf}7d5>$G|GnnP78Ycm-mDE
zK9g%!hQ17askUxEzHr>%zP&wzT{m#mu)1O(gIT*sh{|KkuW^@FN*L^(QO+06-o9&7
zMH<4)fvYz|QQ5wWi~Flvj$fOKG$bz%ANTLO#<?aT0Pfg{4&qO7tov@&W3MsN+{w%k
z^lXuTTQC<IuoxP87q~K#*D}fIqDOsR&Tw%a7^u}vl?+{CP1milzD(vZ$ojRF=)h|F
zG{;5bR((b-YV`}pb@~W%t&10x?Qb4hRvl0&_}s@~@(PjuMNGdhZ4OL!VWC%DNKk%U
z8zuw2sMZH6U*4bL!q8u3a6YzwDQ8X@tS+(W3^wBrUQ}LEwfz?TQ*vt~tzvaG40a*Y
zK+v?1E*NbEMYaiMZZ8gl02Ng$6Y_ZqX_BFSWU+;e)!AA@0cB!2bj>S0vMHlBuqE%p
z0fn_bfrQAP$NjpariZ}6LDctusJb-%ZiG-8ag9{sNo!LB8>MU@N`>zz@hP$s)Yg7<
z-gCrunXLS$%6B%YGGv@`cdN8sA;tj6TPg82IGyeUUG1LNA!d%uH@06mU6pa${(Q&x
z;JQ3TkNnAC)I)zM$1*6W;r(uV=V!5Yoj?o~;;4nxtUuVD=zCHS*^=+CF_mjpnyLv}
zsP*ChxSLq;n?y;Da?46$N|ADba=k=r(HliljM$gN3Y6ZLqrU2=9k4P%M#3C76F8|m
z@_erq{{o>YJR<uib<oRm(Bjo`1CPpzKcV2pa(2O6?XaZx7vvBdr^z!8n>N6zyfQ!d
zxojK4iayH=nMvd4W~}bBpqgBn7?X4{g#~p~POJH0#&7&=4zHi<*zOkN11ujPUJbPm
zTHwM{gT&;SG#&d|0{Q$##_!Qd^HTj6+q8@bEzOo)($ewC8B3L{8n=eK+uY4GezAlP
zEmKuT#q25Z)Ca@*6^RLDaBo*TZ_9vt54(UDaBb%NtebC6?BPk<H+#i6R#?QCbm6{m
zq>`9tUNLp+GW64eyI#ctWwbuhcFk<hSB7QA0BIh7o<<0pHvk#j5AJF-&GR7WWy=`u
zKVvyOs@Wi^+bwfaAcd)Lb=->~-0+~iI1MfJb4tHMEXeBg0(BpYi;jw^8%kbP7jV5>
zJaGlyxJCN5Z8#)tL~Hv->Ti)S|EQ@dtmPdWl2E$u@!mdDaAF1CvZXotAu%=Cf{+l6
zBA4AZ-Va_HscD?*ZjV@%7T!HcO+bmOK2Pm#f9Mzw3woXG9Pt$1{0tpsrgv2-sSJn4
zB2}2atTkoh3#l!#aj!k&|N4a<nc%u(8$brPyaQbg?sl$&T~NQGQmjC^`52~I86L*7
zJI5kjPDvb8P1i%V!o+X4vw%dzxQ1hr{lr*df)oCg;&T|pEi`JtPy5Boyr4oBhirZ8
z57CXx-y%}PI){=h0x+qxr@zLQ0+OVb$(67W63JZ=+4z$l)P0feX22)vC$KNWrav4S
zCb{jz<{(nD5Y+c+SU}CuB%r!=fMwlp&N{q>@&~Ttf^>^Im_^Wi37NYZehOhdNB&Zw
z!dfK++iXBEI)S)grx4l*vy<>ku=`zATrF()_|a>S6d+Ot_bSI1Q*q(wxOP9~XPpKa
zn)4GE;!h2ivzg|J9CLt%m~FDOxp1-F->(6?HA&qm1oO%BA1P1r@}Elfn=TYeZoKAO
z)S=3MjO*coxA2(CXn)^(DzuSk66toe)CmyQM@yR&!{ccf`!u+|+1yltf+Seb`~6Gf
zHWewA`3){~9H+2b+93x^;x(Hp(5e<JM6@R(6%BcbVa?GSRtrWB;7hc!=IPIt<EPH<
z>5Dg)d9r63P)z|{Jx`EGxs7m`-R%JG{9Ct@3Kp*auz@k>&C1Ca!vb(X4$rz>v=q0C
zQGXa=T-<3yP=*uod>?cHe{()JmFw;4UHCL?^zIZqDmo><fb+x{inZ=e)QnQfQwDH0
zeGJ!dURC}mS1NM+MRT-rQSpTErf6HrdmHg(^(m>UHTn8*p7gzCyG*9+BBSF|N615F
z&^)SZPsqQ^i2?G*4R`^^Z%wa6u@f?9$1=6p?BSIgoIW6COe2zngA&7l;9j6T>|vMP
zIgO{krK@Bsf0lI`q6~lNla|qTgyiauy-9I;u6|RxDIcF&Aqw4Db!VOcMRsql$x+tE
z*K|9~?^(FNCF#+3l%D-QKlF|?pv0>~^4=qdmw0wN7)ieQi2j)x`3}r|)9X5Bf0aGb
z``&Qi22vw?ODl@=OueZ*xMt|2Dn{4*v(u#^{bOSLyQ_iMDng_t86Edfq+%^#pEQwK
ztZN%dJE@My8~^nCmT7T47ahHH>5XLHBMz~K7d2)R`}UaW9;U%$&OxErkFAAgE?XZg
z@_n|?Mq}&@&DDJHL&;nb!WEpDk5{u5J+ldM^A|>pA1UT8?TtpxL;OKVUNfsV{#AG>
zG#dXYm7W2Fds`};WV;tSfq@;!;Ve?+@#bF4zC-`kGxWfnc8<Z$3r|VhJ93F{&8)oR
zv8f|R0H>S(77jJ=9|$XI<P)<xs7Knz_afx&iTY#h^jSlFC```X<xOOqcMmnA{P19@
z5n%_gdF6(5D`X#$ukB|-R_cL^$&%Dk%9CwF-*uMdFfj*>Gp(G5dYfOaKE2}ddWu1M
zZ|5i%h}znV+;E()(N+CjhWX1Em#$a0)Bge0cdSoOh=_dHertN`A)>skEGvP8dz2YY
z)509f)<-r8?lgDWZ2**-ae4w4B|z6sM6n9GFX9btOy?lYxqbXT!}h5LTQU>|=|(Cw
zb>!@cyE_T}uy!%GfGxyVmI$2VZqH|KxW(6~D&VKYh7(xR)TzrfMXMJZl%?qOV?SvE
z^hWA=Z<`Z|j4os<$;t^(fc3~>a{2FS{$i#gBHN9n>Ma#~id*o(D`>zWg~jPsoIKh-
z&&pu&b?5?lX>ifi23tiCt_#%1s+Y#>y~C|YQ@tK)F<4JFE~<ubI6(%>ubVOOVqAOv
z80UjuF^xz6asA1BY*lrspOgg)Tn7^@Ll~dSb;kJozslEO_oIf_W>$4WGt-{)3v_3s
z9!fCn8L<LP!gn*)DVOstZd)sh#YXxUt)U@~UWWjEXT7dvbXw^#Nu4~p$|N*x73>cI
zSa0HIu?h^!76sP545r8v-O=>SPh)*M(p6X<L$<hG#JsU|A9?M8%6@@_c%&Lp($~Q$
zBXPbbVV(_!IGWQN<IA9ujxo-#o!k4_=JMk3{U*&StE3(9wt*Wu;&Kk%7Uyo=E7e&5
zWq)g&?1!oQth_JR<p$d@T7|@W6Rap89P&jqoGJ%^6q@m~LVjjmsmt}_&h?t9rGPRT
zCPPE>mRHKQalxB>Em5EC+XaI60qvJgFO_<aUC}fh-B}u=T?$^R<V&~d;e#99Sb8l|
z1Qx1z<@fAvXbNfwW%`t++&ki6N>45%nb^`s(5hlFCoa)dRujuap;USUUV_ecEnHlT
z0;@h)sL{@z1GGV?Yrw+y$S|7SM`o;!FaNelE2M^FUcTM5-}+vD0WPRe9cIi`e2IVl
zw^76cejXx!MDtEOw&;=2(69LW2)nZX8I}tkCG^T$)xk2}u_tZt!8ZIXF1P?YHe7e)
z5I>84wl3G2uzBo$@~IZmO8I$D%%dBf+_S2qgl*C$M3t$K<O4x;wYZgZPi=|v1>ccv
zy?w<OP$T~C#0xoBB8XOJ^UI|`{*9Uo&M=uiVk|!Vvl@=g;DW!;E+hFuJciN7)>-AS
zgc4Dy0~jSxKs^i{Rt#3X?|E>S!DAuXZe3L1<dH-w!Fp((OzsImL$>g<M0Ax^=!KWp
zAX`vTQCgj{??7pbwI6ITDOyMvO?V~QGR(d`(8>L6i?$fH^YQdG3zbo$q&bx8(^-s-
zP${T`%L9&z@u(q%6Yel^)d#64@%}AC$V2BQy>QW!ul>9)ZSz2rki?o;+hWLQ{%sSv
zH~%hngy60lne_+<SwMK?`k?fG<JBWRc~?=m8C(VytYigo(jQb2i)e*Gh=GwDjl=2}
zi@Gn_^n!eGRd5ZauLD^-Xu9z*JPXr{U%5UGH6a2oVi8`KltQeQeM&Fed2d1UjShdU
z<k#l!96ufT&g6Q>m5)DPU)ZYZw>5W*NAU?r8eo0hodDLzw1#*T&#Y!`p}eXc3IB<?
z*Qm)PB0&BZr<P{WndH)a*!`%%U+pL$^)oPwWtXWR9>QRo+<2uCKTM25&Fm;d$^2$p
zZ<yRWP(i{bTEG8PHcZa}`%#ZJf|sCrNk*9?@|2sD;aciU(&em1hs<R=8k5Tdo1RuB
zf&bARm6>y*|8bedN-g;C`B`qX`6M->W>uq0kH+*{?M`JEmyf^C6T>It_?5*M)HB+$
zNPoc9#&LG;HML<q$3tx-4`y|~WzKrpPms=sPJZ90J$qF0#_YQStb3PNmrwz?U_bvD
z)G)L~=NOY@JtC~tQmga<-G@p%7B*mY8tIq8>@})8c0aaROgz|H!4_RN9^~EN`s+t&
zx90`%$cdm=ah-{2=MKC;q}pL`(s@9T^eY_x;2!t85;4v2mrQ)i9;+^3Z*KElG;uUv
zX?STB2(xJ&FMk;1A8ffel0U*)LO2(TMt_PJNsj{Zv$7ZeY1iR(t3uLZ7&>!jeU&G?
z)*Tq)yDn^9RrSpv_MXa+5c99qDikxRkp83*MW(ObJJ3ALMnwK1&~Lm^ZHIWt0sCgK
z+PGo)(J)ZQQe}A7=tP#zMYQxy&J)OvT?mac-tX|`B%Py~zo|L~%ye1=9v<b~knzEn
zouXb%Z6Tplar<L|NK7z#=QnjNdi>wenY)*2H=Gv>U=h6d)tOsLAWbhY8swp4G%Z(_
ziW@y9($hO%le&U;o^iUKI<`^@?eBJqPf$q7MC6x<Mt2D(VBMZehWkA~okHx~4!NJ*
zJ(8Sx(zNwJxm7GLNHbe`xkV)ALm#9^`PTWmW3K<-eSXS~8$N5UNOSeg9=)UE|H5{&
z8ldfpk6)h=DqrgQN6PI4-T5j?;3cW<=jRPHD0{qo^im)Snv;zxsutJ#eaC~<9Y-;T
z?@YrvnCworv5(&AZ=7cRbSnazny{feSzq;hf+M>=Bfu}OtGdv^Jk#`R*DbphFuTNL
zP8g62!i4)8c^mdJ?j~27PNg~RItLu~__WWO$`;qNZL3kIh{T9RTvbw3SGVwXCNtm8
z!?}YsF{kJ8C4tXj1k%jt-sNZdpUfwrTlkD&SzclyKp%4y;iE35M-i)$OfA=h4|IBJ
zRTmg9gWD)#9>&MxUN8OKj)eQh@rnOfI8R-o#0hwuYw~_4?<UW0RhW13x$CCHxe~_F
zNpeq047V#DNI~9EeT=J@6_OZpF?Y@)uw~9(tzfSSb>fwWI;{EW>(22PTF+V(UpDiQ
z0cG<~^@NV{^#K2Oq#jEaPDcIjO;n}b@4uyjuU~L_sFIeG)4i?u2;d9EsqE@Ks9Wjn
zYEp1lIM2>AVQmEs>5vAG5DJS$8|>G&9;C$lPNW&J<gx~WcyMe51yN%9LS&wB!7xIR
z<W>F6lc2x+VGz!YWiAR2l>XV+|EKkfnZaH4?}WCRJ$@u{X16#nJlG}!_`{uNLYFi!
zul_)?QK$Tp_q^b8B6O{P?h)NycUSVI&7x6iSwGt6b{HXWECfW{0RvE)T(VH)C1$KR
zPvmRE!V~jRaD65z%w_9W;*4~@`uYDdRhz^e6$-1=V`SB0&L6>D$*EK+*~-&y18;Rm
zwv>2@>b2$Ng4zB$%{*pjU0#S~QHN&Xk6pz_nsjL@t@HyRBynQ1bd~5<_xl$PIQb70
zV>9-YG+Xr6w3ZmKG1*P4;JRFi6j@qFqvA{SpZ{OJ&DM~>xq!vyTlOI9&;{P)>gbyH
T0(V|qAU+5C_3V+iq9XqSPtQo;

literal 0
HcmV?d00001


From 5402078b97c97f1a6cc888594c2ddc3a3c158cec Mon Sep 17 00:00:00 2001
From: Dave <83719612+DaveJWalker@users.noreply.github.com>
Date: Thu, 12 Sep 2024 21:35:57 -0700
Subject: [PATCH 02/11] Initial version of wireframe program.

---
 programs/wireframe.as | 754 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 754 insertions(+)
 create mode 100644 programs/wireframe.as

diff --git a/programs/wireframe.as b/programs/wireframe.as
new file mode 100644
index 0000000..920d9de
--- /dev/null
+++ b/programs/wireframe.as
@@ -0,0 +1,754 @@
+// Cordic Demo by Dave Walker
+
+// A basic implementation of a CORDIC function operating in rotation mode.  A CORDIC can be
+// used to iteratively calculate sine and cosine of an angle. Due to the limitations of this
+// 8-bit computer, the CORDIC isn't particularly accurate.  A number of values in this code are
+// represented as fixed point representations.  Therefore, you'll see notations like u2.5 and
+// s1.6.  These notations denote signed/unsigned, the number of integer bits, and the number of
+// fractional bits.  For example, the sine/cosine outputs are all s1.6.
+//
+// In addition to the CORDIC, a draw_line function is included based on Bresenham's Algorithm.
+//
+// (Note: I'm not crazy with how I pass inputs to functions in this code using registers.  I think
+// I'd prefer to pass them via memory (i.e. a stack).  I'd also possibly like to dedicate a
+// register as a stack pointer.  I haven't written assembly in years so I'm not accustomed
+// to dealing with this stuff directly.  Perhaps I'll change it later... perhaps not.)
+
+
+// Memory mapped IO port mapping offsets
+// from memory_mapped_io_addr (248)
+define memory_mapped_io_addr     248
+define pixel_x_offset             -8
+define pixel_y_offset             -7
+define draw_pixel_offset          -6
+define clear_pixel_offset         -5
+define load_pixel_offset          -4
+define buffer_screen_offset       -3
+define clear_screen_buffer_offset -2
+define write_char_offset          -1
+define buffer_chars_offset         0
+define clear_chars_buffer_offset   1
+define show_number_offset          2
+define clear_number_offset         3
+define signed_mode_offset          4
+define unsigned_mode_offset        5
+define rng_offset                  6
+define controller_input_offset     7
+
+// Various RAM addresses
+define x2_coord                    0
+define y2_coord                    1
+define register_stack_pointer      100
+define atan_LUT_strt_addr          232
+
+//LDI r1 -8
+//LDI r2 2
+//CAL .mult
+
+
+LDI r2 18        // Scratch test 1 for .div (NOTE(review): .div is defined outside this view; operand meaning unconfirmed)
+LDI r1 101
+LDI r4 0
+LDI r3 -45
+CAL .div
+STR r15 r1 4     // NOTE(review): r15 is not set before this point in the visible code - confirm the base address
+STR r15 r2 5     // results left in r1/r2 are stored at offsets 4 and 5
+
+LDI r2 127       // Scratch test 2 for .div
+LDI r1 255
+LDI r4 0
+LDI r3 3
+CAL .div
+STR r15 r1 0     // results stored at offsets 0 and 1
+STR r15 r2 1
+
+HLT              // Program ends here in this version; the code below is not reached
+
+LDI r1 0         // Scratch test 3 for .div (NOTE(review): r4 is not loaded here, unlike tests 1 and 2)
+LDI r2 81
+LDI r3 3
+CAL .div
+STR r15 r1 2
+STR r15 r2 3
+
+LDI r1 68        // Scratch test 4 for .div (r4 again not loaded)
+LDI r2 214
+LDI r3 99
+CAL .div
+STR r15 r1 6
+STR r15 r2 7
+
+LDI r1 64        // Scratch call of .pixel_projection with r1-r4 as inputs (defined outside this view)
+LDI r2 16
+LDI r3 32
+LDI r4 64
+CAL .pixel_projection
+
+// Load the arctan LUT into RAM (.cordic reads it, so this must run first).
+CAL .load_atan_lut
+
+// Clear the screen and character display; select unsigned number display mode
+LDI r15 memory_mapped_io_addr
+STR r15 r0 clear_screen_buffer_offset
+STR r15 r0 buffer_screen_offset
+STR r15 r0 unsigned_mode_offset
+STR r15 r0 clear_chars_buffer_offset
+STR r15 r0 buffer_chars_offset
+
+// Write "CORDICDEMO" one character at a time, then push the buffer to the display
+STR r15 r0 clear_chars_buffer_offset
+LDI r14 "C"
+STR r15 r14 write_char_offset
+LDI r14 "O"
+STR r15 r14 write_char_offset
+LDI r14 "R"
+STR r15 r14 write_char_offset
+LDI r14 "D"
+STR r15 r14 write_char_offset
+LDI r14 "I"
+STR r15 r14 write_char_offset
+LDI r14 "C"
+STR r15 r14 write_char_offset
+LDI r14 "D"
+STR r15 r14 write_char_offset
+LDI r14 "E"
+STR r15 r14 write_char_offset
+LDI r14 "M"
+STR r15 r14 write_char_offset
+LDI r14 "O"
+STR r15 r14 write_char_offset
+STR r15 r0 buffer_chars_offset
+
+LDI r15 x2_coord        // Store initial point of circle (x=31, y=16) in RAM
+LDI r14 31
+STR r15 r14 x2_coord
+LDI r14 16
+STR r15 r14 y2_coord    // y must be stored too; the circle loop reads y2_coord as the first segment's y1
+
+// Draw a circle using the CORDIC function
+// as a simple demonstration.  The CORDIC
+// is used to generate points on the circle
+// and lines are drawn between each point.
+
+// Go through angles from 0 to 200 in steps of 10 (0 to 2*pi radians in u2.5), then wrap and repeat
+LDI r13 0   // Starting angle
+
+.circle_loop
+    MOV r13 r1  // Copy the angle from r13 into r1; r13 keeps the loop's copy since r1 is modified by the CORDIC function
+
+    // Use the CORDIC to calculate sine and cosine of angle (r1); x comes back in r2, y in r3
+    CAL .cordic
+
+    // Scale the s1.6 values down by 4 (sign-preserving shift right by 2) and center on screen (+16);
+    // also move them to r3/r4 (x2/y2) for use in draw_line function
+    LDI r10 128     // Sign bit mask
+    AND r3 r10 r5   // Grab the sign bit for y
+    RSH r3 r4       // y >> 1 into r4
+    ADD r4 r5 r4    // put the sign bit back (RSH shifts in a zero)
+    RSH r4 r4       // y >> 2
+    ADD r4 r5 r4    // put the sign bit back again
+    AND r2 r10 r5   // Grab the sign bit for x
+    RSH r2 r3       // x >> 1 into r3 (the old y in r3 is no longer needed)
+    ADD r3 r5 r3
+    RSH r3 r3       // x >> 2
+    ADD r3 r5 r3
+    ADI r3 16       // x2 = x/4 + 16
+    ADI r4 16       // y2 = y/4 + 16
+
+    // Grab xy coordinates from previous iteration from RAM; they become x1/y1 of this segment
+    LDI r15 x2_coord
+    LOD r15 r1 x2_coord
+    LOD r15 r2 y2_coord
+
+    // Save the r13 angle value to RAM since it is not preserved across the draw_line call
+    LDI r15 register_stack_pointer
+    STR r15 r13
+    CAL .draw_line
+    // And restore it when finished
+    LDI r15 register_stack_pointer
+    LOD r15 r13
+
+    // Store this segment's end point (r3/r4) to RAM so it becomes the start point (x1/y1) next iteration
+    LDI r15 x2_coord
+    STR r15 r3 x2_coord
+    STR r15 r4 y2_coord
+
+    // Display current angle
+    LDI r15 memory_mapped_io_addr
+    STR r15 r13 show_number_offset
+
+    // Increment the angle and loop
+    ADI r13 10
+    LDI r14 201  // Loop while angle < 201, so 200 is the last angle of a revolution
+    CMP r13 r14
+    BRH lt .circle_loop
+    SUB r13 r14 r13  // Past the end: wrap the angle (angle - 201) and keep drawing forever
+    JMP .circle_loop
+HLT              // Not reached: the JMP above always loops
+
+
+.cordic
+// CORDIC function computes sine and cosine of angle.
+// Input:
+//  r1  = angle in radians (fixed point in the form u2.5)
+//        (Values between 0 and 2*pi are supported.)
+// Outputs:
+//  r2  = cosine(r1) (s1.6) - the final CORDIC x value
+//  r3  = sine  (r1) (s1.6) - the final CORDIC y value
+// Register usage
+//  r1  - angle in radians (s1.6)
+//  r2  - x (s1.6)
+//  r3  - y (s1.6)
+//  r4  - iteration counter (i)
+//  r5  - temp iteration counter/scratch
+//  r6  - shifted x
+//  r7  - shifted y
+//  r8  - total iterations
+//  r9  - holds current iteration atan value
+//  r10 - sign bit mask/scratch (holds the sign bit of y while shifting)
+//  r11 - pointer to arctan table
+//  r12 - quadrant flag (determines whether to negate x and/or y result)
+//  r14 - scratch (holds the sign bit of x while shifting)
+// The input angle comes in the form u2.5 with a
+// range of 0 to 2*pi. CORDICs only work for
+// +pi/2 to -pi/2 angles.  To keep things simple,
+// we'll only operate in one quadrant of the unit
+// circle (0 to +pi/2). For the other quadrants,
+// we'll modify the angle and outputs appropriately.
+
+    LDI r5 50   // Load +pi/2   (1.5708*2^5 = ~50)
+    CMP r5 r1
+    BRH ge .quadrant_0
+    LDI r5 100  // Load +pi     (3.1416*2^5 = ~100)
+    CMP r5 r1
+    BRH ge .quadrant_1
+    LDI r5 150  // Load +3/2*pi (4.7124*2^5 = ~150)
+    CMP r5 r1
+    BRH ge .quadrant_2
+    JMP .quadrant_3
+
+    // For each quadrant, set the quadrant flag, which will be used
+    // at the end to negate the sine/cosine outputs accordingly.
+    // Also, adjust the input angle so all calculations are performed
+    // as if in quadrant 0.
+    .quadrant_0
+        LDI r12 0b00    // Set quadrant flag to leave xy untouched
+        JMP .cordic_setup
+    .quadrant_1
+        LDI r12 0b10    // Set quadrant flag to negate x
+        SUB r5 r1 r1    // angle = pi - angle (r5 still holds 100)
+        JMP .cordic_setup
+    .quadrant_2
+        LDI r12 0b11    // Set quadrant flag to negate x&y
+        LDI r5 100      // Load +pi (3.1416*2^5 = ~100)
+        SUB r1 r5 r1    // angle = angle - pi
+        JMP .cordic_setup
+    .quadrant_3
+        LDI r12 0x01    // Set quadrant flag to negate y
+        LDI r5 200      // Load +2*pi (6.2832*2^5 = ~200)
+        SUB r5 r1 r1    // angle = 2*pi - angle
+
+    .cordic_setup
+        LSH r1 r1       // adjust input angle from u2.5 to s1.6; this step is needed because negative angles are needed during CORDIC operation
+        LDI r2 38       // x = 0.6072 (s1.6) = ~38/2^6 (this value has scaling factor K pre-applied)
+        LDI r3 0        // y = 0
+        LDI r4 0        // iteration counter (i)
+        LDI r8 7        // total iterations
+        LDI r10 128     // angle sign bit mask
+        LDI r11 atan_LUT_strt_addr  // Point to start of atan LUT
+
+        .cordic_loop
+            // Make temporary copies of the i,x,y values for shifting
+            MOV r4 r5
+            MOV r2 r6
+            MOV r3 r7
+
+            // The current computer ALU does not support arithmetic shifting
+            // with RSH instruction, which presents a problem for negative
+            // numbers.  When shifting a negative value right, ones should
+            // get shifted into the sign bit.  That doesn't happen so
+            // negative shifts aren't handled properly.  To get around this
+            // problem, I'll ADD in the sign bit after shifting.
+            // Note: The fact that the ALU only supports single bit shifts
+            // necessitates a loop and therefore slows down the CORDIC
+            // significantly.  Multi-bit shifts are certainly possible but
+            // would make the ALU much larger... as always, tradeoffs. :)
+            AND r6 r10 r14  // Grab the sign bit for x
+            AND r7 r10 r10  // Grab the sign bit for y (overwrites the mask; it is reloaded after the shift loop)
+            .shift_loop
+                CMP r5 r0
+                BRH eq .shift_done
+                RSH r6 r6
+                ADD r6 r14 r6   // Add the sign bit after shifting
+                RSH r7 r7
+                ADD r7 r10 r7   // Add the sign bit after shifting
+                DEC r5
+                JMP .shift_loop
+            .shift_done
+            LOD r11 r9 // Load atan value for current iteration
+
+            // Determine rotation direction
+            LDI r10 128   // Sign bit mask
+            AND r1 r10 r0 // Check the sign bit
+            BRH z .positive_rotation
+
+                .negative_rotation // Clockwise
+                ADD r2 r7 r2    // x = x + (y >> i)
+                SUB r3 r6 r3    // y = y - (x >> i)
+                ADD r1 r9 r1    // angle = angle + atan(2^-i)
+                JMP .next_iteration
+
+                .positive_rotation // Counter clockwise
+                SUB r2 r7 r2    // x = x - (y >> i)
+                ADD r3 r6 r3    // y = y + (x >> i)
+                SUB r1 r9 r1    // angle = angle - atan(2^-i)
+
+            .next_iteration
+            INC r11     // Point to next atan value
+            INC r4      // Increment iteration (i)
+            CMP r4 r8   // Check against total iterations
+            BRH nz .cordic_loop
+
+        // Adjust xy outputs accordingly based on quadrant
+
+        // Adjust x?
+        .check_x_negate
+        LDI r5 0b10
+        AND r12 r5 r0
+        BRH nz .negate_x
+        JMP .check_y_negate
+        .negate_x
+            SUB r0 r2 r2
+
+        .check_y_negate
+        LDI r5 0b01
+        AND r12 r5 r0
+        BRH nz .negate_y
+        JMP .cordic_done
+        .negate_y
+            SUB r0 r3 r3
+    .cordic_done
+     RET
+
+
+.draw_line
+// Draws a line between two points using Bresenham's Algorithm and then
+// buffers the screen.  The algorithm was ported to assembly using
+// MattBatWing's python implementation as a guide.
+// Inputs:
+//  r1 = x1
+//  r2 = y1
+//  r3 = x2
+//  r4 = y2
+// Register Usage:
+//  r5 = dx = abs(x2 - x1)
+//  r6 = dy = abs(y2 - y1)
+//  r7 = sx = sign(x2 - x1)
+//  r8 = sy = sign(y2 - y1)
+//  r9 = Error = 2*dy - dx
+//  r10 = scratch
+//  r11 = A = 2*dy
+//  r12 = B = 2*dy - 2*dx
+//  r13 = interchange flag
+//  r14 = loop counter (i), r15 = memory mapped IO pointer
+
+    // Set sx/sy to +1 (default); they become -1 below for negative deltas
+    LDI r7 1
+    LDI r8 1
+
+    // Calculate x values dx and sx.
+    .calc_x
+        LDI r10 128       // Sign bit mask
+        SUB r3 r1 r5      // x2 - x1
+        AND r5 r10 r0     // Is (x2 - x1) negative?
+        BRH z .calc_y     // No: dx and sx are already correct
+        .negate_x_dl
+            SUB r0 r5 r5      // dx = abs(x2 - x1)
+            LDI r7 -1         // sx = -1 (negative slope)
+
+    // Calculate y values dy and sy.
+    .calc_y
+        SUB r4 r2 r6      // y2 - y1
+        AND r6 r10 r0     // Is (y2 - y1) negative?
+        BRH z .calc_interchange // No: dy and sy are already correct
+        .negate_y_dl
+            SUB r0 r6 r6      // dy = abs(y2 - y1)
+            LDI r8 -1         // sy = -1 (negative slope)
+
+    .calc_interchange
+        LDI r13 0         // Set interchange flag to 0 (false)
+        SUB r5 r6 r0      // Is dx or dy greater?
+        BRH ge .calc_err  // If dx >= dy, proceed to calc error, A and B
+        MOV r5 r10        // If dx < dy, swap dx and dy
+        MOV r6 r5
+        MOV r10 r6
+        LDI r13 1         // and set interchange flag to 1 (true)
+
+    .calc_err
+        LSH r6 r11        // A = 2*dy
+        LSH r5 r12        // 2*dx
+        SUB r0 r12 r12    // -2*dx
+        ADD r11 r12 r12   // B = 2*dy - 2*dx
+        SUB r0 r5 r9      // -dx
+        ADD r11 r9 r9     // Error = 2*dy - dx
+
+    // Draw first pixel
+    LDI r15 memory_mapped_io_addr
+    STR r15 r1 pixel_x_offset
+    STR r15 r2 pixel_y_offset
+    STR r15 r0 draw_pixel_offset
+
+    // Zero-length line (dx = dy = 0): done, since the loop always steps once
+    CMP r5 r0
+    BRH eq .buffer_screen
+    LDI r14 0   // Set i to 0 for loop
+    .draw_line_loop
+        LDI r10 128     // Sign bit mask
+        AND r9 r10 r0   // Is Error < 0?
+        BRH z .error_ge_zero
+        .error_lt_zero
+            ADD r9 r11 r9   // Error += A
+            CMP r13 r0      // Check interchange flag
+            BRH eq .inc_x   // ... and increment either x or y
+            .inc_y
+                ADD r2 r8 r2    // y += sy
+                JMP .draw_pixel
+            .inc_x
+                ADD r1 r7 r1    // x += sx
+                JMP .draw_pixel
+        .error_ge_zero
+            ADD r2 r8 r2    // y += sy
+            ADD r1 r7 r1    // x += sx
+            ADD r9 r12 r9   // Error += B
+
+        .draw_pixel
+            // Make sure we're in the range of the screen before
+            // drawing a pixel (unsigned compare, so negatives are skipped too).
+            LDI r10 32
+            CMP r1 r10
+            BRH ge .next_pixel
+            CMP r2 r10
+            BRH ge .next_pixel
+            STR r15 r1 pixel_x_offset
+            STR r15 r2 pixel_y_offset
+            STR r15 r0 draw_pixel_offset
+
+        .next_pixel
+            INC r14       // Increment loop counter
+            CMP r14 r5    // Keep looping while i < dx
+            BRH lt .draw_line_loop
+
+    .buffer_screen
+        STR r15 r0 buffer_screen_offset
+        RET
+
+
+.mult
+// This function multiplies two signed 8-bit numbers together.  The result is a
+// signed 16-bit product, which gets stored in two registers.
+// Inputs:
+//    r1 = multiplicand
+//    r2 = multiplier
+// Outputs:
+//    r5:r4 = 16-bit product
+// Clobbers r1-r9 (both inputs are consumed by the shifting).
+// Register usage:
+//    r3 = Sign bit mask, then upper bits of multiplicand (as it gets shifted left)
+//    r6 = LSB mask
+//    r7 = carry flag (need separate flag because of oddity with LSH pseudo instruction)
+//    r8 = loop counter
+//    r9 = product sign flag
+
+    // First things first... convert multiplicand and multiplier to positive
+    // values since this routine doesn't handle negative 2's complement values
+    // properly.  The product is negated at the end if exactly one was negative.
+
+    LDI r3 128        // Sign bit mask
+    LDI r9 0          // Set product sign to 0 (positive)
+    LDI r6 1          // LSB mask
+    AND r1 r3 r0      // Determine if r1 is negative
+    BRH z .r1_pos
+        XOR r9 r6 r9  // Toggle the product sign flag
+        SUB r0 r1 r1  // And negate it
+    .r1_pos
+    AND r2 r3 r0      // Determine if r2 is negative
+    BRH z .r2_pos
+        XOR r9 r6 r9  // Toggle the product sign flag
+        SUB r0 r2 r2  // And negate it
+    .r2_pos
+
+    LDI r3 0          // Clear upper 8-bit of multiplicand (for later shifting)
+    LDI r4 0          // Clear the product registers
+    LDI r5 0
+    LDI r8 8          // Loop counter (8-bits)
+
+    .mult_loop
+        AND r2 r6 r0            // Check least significant bit of multiplier
+        RSH r2 r2               // and shift it to the right by 1 (NOTE(review): the BRH below needs the Z flag from the AND, so this assumes RSH leaves flags untouched - confirm against the ISA)
+        BRH zero .mult_no_add   // If least significant bit is 0, skip addition
+        .mult_add
+            ADD r1 r4 r4        // otherwise add multiplicand to product
+            BRH nc .prod_nc
+            ADI r5 1            // And handle carries into the upper 8-bits if needed
+            .prod_nc
+                ADD r3 r5 r5    // Add the upper multiplicand byte to the upper product byte
+        .mult_no_add
+        LDI r7 0                // Initialize carry flag to 0
+        LSH r1 r1               // Shift multiplicand left to prep for next round
+        BRH nc  .multiplicand_nc
+            LDI r7 1            // If a carry occurs, flag it
+        .multiplicand_nc
+        LSH r3 r3               // Now shift the upper 8-bits of the multiplicand
+        ADD r3 r7 r3            // And add back the carry bit
+        DEC r8                  // Decrement the loop counter
+        BRH nz .mult_loop
+
+    CMP r9 r0         // Determine if product sign flag is set
+    BRH z .mult_done
+        LDI r3 0xFF
+        XOR r3 r4 r4  // Negate the product
+        XOR r3 r5 r5
+        INC r4        // And add 1 to low byte after negation (2's complement)
+        BRH nc .mult_done
+        INC r5        // Handle carry into high byte
+    .mult_done
+    RET
+
+
+.div
+// This function divides a signed 16-bit dividend by a signed 16-bit divisor
+// and returns a signed 16-bit quotient plus the low byte of the remainder.
+// Inputs:
+//    r2:r1 = Dividend (numerator)
+//    r4:r3 = Divisor  (denominator)
+// Outputs:
+//    r2:r1 = Quotient
+//    r3    = Remainder (low byte only, always left unsigned)
+// Register usage:
+//    r6:r5 = Remainder temp
+//    r8:r7 = Quotient temp (TBD... can use dividend register to be more efficient
+//    r9-10 = scratch
+//    r11   = loop counter
+//    r12   = quotient_sign
+// The CPU is halted (HLT) if the divisor is zero.  r4-r12 may be clobbered.
+//
+// The algorithm implemented below is detailed in the following
+// video:
+//    www.youtube.com/watch?v=7m6I7_3XdZ8
+//
+    // Check to see if both bytes of divisor are zero; if so,
+    // a divide-by-zero error occurred.  Use of the NOR instruction
+    // here necessitates a comparison to all ones.
+    LDI r8 0xFF
+    NOR r3 r4 r6
+    CMP r8 r6
+    BRH z .div_by_zero
+
+    // This divide algorithm only works for unsigned values.  Therefore,
+    // if the dividend and/or divisor are negative, convert them
+    // to positive numbers and set a flag to convert the results
+    // appropriately at the end.
+    LDI r9 128        // Sign bit mask
+    LDI r12 0         // Set quotient sign to 0 (positive by default)
+    LDI r6 1          // LSB mask
+
+    // abs(Dividend)
+    AND r2 r9 r0      // Check dividend sign bit
+    BRH z .dividend_pos
+        XOR r12 r6 r12 // Toggle the quotient sign flag
+        XOR r8 r1 r1  // Negate the dividend by inverting all bits...
+        XOR r8 r2 r2
+        INC r1        // ...and add 1 to low byte (2's complement)
+        BRH nc .dividend_pos
+        INC r2        // Handle carry into high byte
+    .dividend_pos
+
+    // abs(Divisor)
+    AND r4 r9 r0      // Check divisor sign bit
+    BRH z .divisor_pos
+        XOR r12 r6 r12 // Toggle the quotient sign flag
+        XOR r8 r3 r3  // Negate the divisor by inverting all bits...
+        XOR r8 r4 r4
+        INC r3        // ...and add 1 to low byte (2's complement)
+        BRH nc .divisor_pos
+        INC r4        // Handle carry into high byte
+    .divisor_pos
+
+    // Initialize
+    LDI r5  0      // Clear registers utilized for calculations
+    LDI r6  0
+    LDI r7  0
+    LDI r8  0
+    LDI r9  0      // Carry flag for low byte shifts
+    LDI r10 0      // Carry flag for high byte shifts
+    LDI r11 16     // Initialize loop counter to 16 (TBD)
+
+    .div_loop
+        // Shift dividend left
+        LDI r9 0                // Clear low byte carry flag
+        LSH r1 r1               // Shift dividend low byte left
+        BRH nc .dividend_lh_nc
+            LDI r9 1            // If a carry occurs, flag it
+        .dividend_lh_nc
+        LDI r10 0                // Clear high byte carry flag
+        LSH r2 r2               // Shift dividend high byte left
+        BRH nc .dividend_h_nc
+            LDI r10 1            // If a carry occurs, flag it
+        .dividend_h_nc
+        ADD r2 r9 r2            // And add the carry bit
+
+        // Shift the remainder left with carry out from dividend shift
+        LDI r9 0                // Clear carry flag
+        LSH r5 r5               // Shift remainder low byte left
+        BRH nc .remainder_lh_nc
+            LDI r9 1            // If a carry occurs, flag it
+        .remainder_lh_nc
+        ADD r5 r10 r5            // And add carry from dividend high byte
+        LDI r10 0
+        LSH r6 r6               // Shift remainder high byte left
+        ADD r6 r9 r6            // And add the carry bit
+
+        // Compare the remainder with the divisor to determine if
+        // a subtraction is possible.  First, compare the high
+        // bytes
+        CMP r6 r4
+        BRH lt .div_no_subtract // If remainder high byte < divisor high byte, skip subtract
+        BRH eq .cmp_low         // If equal, compare the low bytes
+        JMP .div_subtract
+
+        // The upper bytes are equal so the lower bytes need to be compared
+        // Note: I've seen code that skips the low byte comparison and proceeds
+        // with subtraction anyway potentially resulting in a negative remainder.
+        // Apparently, this negative remainder handles itself later but I had
+        // trouble getting it to work (and it's confusing).  I'll take a small
+        // performance hit with the low byte comparison to keep things simple.
+        .cmp_low
+        CMP r5 r3
+        BRH lt .div_no_subtract // If remainder low byte < divisor low byte, skip subtract
+
+            // Perform remainder - divisor 16-bit subtraction (emulating SBB)
+            .div_subtract
+            LDI r9 0                // Clear borrow flag
+            SUB r5 r3 r5            // Subtract low bytes
+            BRH c .div_no_borrow
+                LDI r9 1            // If a borrow occurs, flag it
+            .div_no_borrow
+            SUB r6 r4 r6            // Subtract high bytes
+            SUB r6 r9 r6            // Handle borrow
+
+            INC r7                  // Increment the quotient
+                                    // Note: it'll never carry into the higher byte
+                                    // because there will always be 'room' to add 1 due to
+                                    // the shift from the previous cycle.
+
+        .div_no_subtract
+        // Shift Quotient left (TBD - again... I'll likely get rid of this since I'll use dividend instead but...
+        LDI r9 1                    // The final iteration is the one where the loop counter is 1
+        CMP r11 r9
+        BRH z .no_quotient_shift    // Skip quotient shift if final iteration (i.e. i=1)
+        LDI r9 0                    // Clear low byte carry flag
+        LSH r7 r7                   // Shift quotient low byte left
+        BRH nc .quotient_lh_nc
+            LDI r9 1                // If a carry occurs, flag it
+        .quotient_lh_nc
+        LSH r8 r8                   // Shift quotient high byte left
+        ADD r8 r9 r8
+        .no_quotient_shift
+
+        // Next iteration
+        DEC r11                  // Decrement loop counter
+        BRH nz .div_loop
+
+    // Move results to appropriate output registers
+    MOV r8 r2
+    MOV r7 r1
+    MOV r5 r3
+
+    // Determine if the final quotient result should be negative.
+    // (We'll go ahead and leave the remainder as an unsigned value)
+    CMP r12 r0         // Determine if quotient sign flag is set
+    BRH z .div_done    // Positive quotient: nothing left to do
+        LDI r9 0xFF
+        XOR r9 r2 r2  // Negate the quotient
+        XOR r9 r1 r1
+        INC r1        // And add 1 to low byte after negation (2's complement)
+        BRH nc .div_done
+        INC r2        // Handle carry into high byte
+    .div_done
+
+    RET
+
+    // Halt when divide by zero encountered
+    .div_by_zero
+        HLT
+
+
+.pixel_projection
+// Projects a 3D point [x,y,z] in space onto a 2D plane.  It returns
+// x_projected and y_projected by calculating the following:
+// x_projected = (focal_length * x)/(focal length + z)
+// y_projected = (focal_length * y)/(focal length + z)
+// Inputs:
+// r1 = x
+// r2 = y
+// r3 = z
+// r4 = focal_length
+//
+// Outputs:
+// r1 = x_projected (low byte of the .div quotient)
+// r2 = y_projected (low byte of the .div quotient)
+// Clobbers r3-r12 and r15 (.mult and .div are called) and uses 4 bytes of RAM
+// at register_stack_pointer.  .div halts if focal_length + z is zero.
+
+    // TODO - the sum is used as an unsigned 8-bit divisor (carry is ignored)
+    ADD r4 r3 r6    // r6 = focal_length + z
+
+    // .mult and .div clobber r1-r12, so keep what is needed later in RAM
+    LDI r15 register_stack_pointer
+    STR r15 r2 0    // y
+    STR r15 r4 1    // focal_length
+    STR r15 r6 2    // focal_length + z
+
+    // x_projected = (focal_length * x)/(focal_length + z)
+    MOV r4 r2       // .mult expects the multiplier in r2 (x is already in r1)
+    CAL .mult       // r5:r4 = focal_length * x
+    MOV r5 r2       // The product becomes the dividend r2:r1
+    MOV r4 r1
+    LOD r15 r3 2    // Divisor low byte = focal_length + z
+    LDI r4 0        // Divisor high byte
+    CAL .div        // r2:r1 = quotient
+    STR r15 r1 3    // Park x_projected while y is processed
+
+    // y_projected = (focal_length * y)/(focal_length + z)
+    LOD r15 r1 0    // y
+    LOD r15 r2 1    // focal_length
+    CAL .mult       // r5:r4 = focal_length * y
+    MOV r5 r2       // The product becomes the dividend r2:r1
+    MOV r4 r1
+    LOD r15 r3 2    // Divisor low byte = focal_length + z
+    LDI r4 0        // Divisor high byte
+    CAL .div        // r2:r1 = quotient
+    MOV r1 r2       // r2 = y_projected
+    LOD r15 r1 3    // r1 = x_projected
+    RET
+
+
+// Load the arctangent look-up table into RAM (7 entries at atan_LUT_strt_addr).
+// Values are in the form s1.6, i.e. arctan(2^-i)*2^6 rounded.  Clobbers r14/r15.
+.load_atan_lut
+    LDI r15 atan_LUT_strt_addr
+    LDI r14 50     // arctan(2^0)  = ~50/2^6
+    STR r15 r14 0
+    LDI r14 30     // arctan(2^-1) = ~30/2^6
+    STR r15 r14 1
+    LDI r14 16     // arctan(2^-2) = ~16/2^6
+    STR r15 r14 2
+    LDI r14 8      // arctan(2^-3) = ~8/2^6
+    STR r15 r14 3
+    LDI r14 4      // arctan(2^-4) = ~4/2^6
+    STR r15 r14 4
+    LDI r14 2      // arctan(2^-5) = ~2/2^6
+    STR r15 r14 5
+    LDI r14 1      // arctan(2^-6) = ~1/2^6
+    STR r15 r14 6
+    RET

From f77d1c057786d587733be43a839baac6a981d085 Mon Sep 17 00:00:00 2001
From: Dave <83719612+DaveJWalker@users.noreply.github.com>
Date: Fri, 13 Sep 2024 13:23:14 -0700
Subject: [PATCH 03/11] Fixed some comments.

---
 programs/cordic.as | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/programs/cordic.as b/programs/cordic.as
index cd39d45..eb244dd 100644
--- a/programs/cordic.as
+++ b/programs/cordic.as
@@ -188,7 +188,7 @@ HLT
         SUB r5 r1 r1
         JMP .cordic_setup
     .quadrant_2
-        LDI r12 0b11    // Set quadrant flag to negative x&y
+        LDI r12 0b11    // Set quadrant flag to negate x&y
         LDI r5 100      // Load +pi (3.1416*2^5 = ~100)
         SUB r1 r5 r1
         JMP .cordic_setup
@@ -216,8 +216,12 @@ HLT
             // with RSH instruction, which presents a problem for negative
             // numbers.  When shifting a negative value right, ones should
             // get shifted into the sign bit.  That doesn't happen so
-            // negative shifts aren't handled properly.  To get around this problem,
-            // for now, I'll OR in the sign bit after shifting.
+            // negative shifts aren't handled properly.  To get around this
+            // problem, I'll ADD in the sign bit after shifting.
+            // Note: The fact that the ALU only supports single bit shifts
+            // necessitates a loop and therefore slows down the CORDIC
+            // significantly.  Multi-bit shifts are certainly possible but
+            // would make the ALU much larger... as always, tradeoffs. :)
             AND r6 r10 r14  // Grab the sign bit for x
             AND r7 r10 r10  // Grab the sign bit for y
             .shift_loop

From 77293481232f53e57248b2807c2b8a04877c562a Mon Sep 17 00:00:00 2001
From: Dave <83719612+DaveJWalker@users.noreply.github.com>
Date: Sat, 14 Sep 2024 23:05:32 -0700
Subject: [PATCH 04/11] Initial wireframe animation working

The code is still quite messy and several optimizations can be made, but I wanted capture the working copy.
---
 programs/wireframe.as | 805 +++++++++++++++++++++++++++++++++---------
 1 file changed, 637 insertions(+), 168 deletions(-)

diff --git a/programs/wireframe.as b/programs/wireframe.as
index 920d9de..2c11a09 100644
--- a/programs/wireframe.as
+++ b/programs/wireframe.as
@@ -1,4 +1,4 @@
-// Cordic Demo by Dave Walker
+// Wireframe Demo by Dave Walker
 
 // A basic implementation of a CORDIC function operating in rotation mode.  A CORDIC can be
 // used to iteratively calculate sine and cosine of an angle. Due to the limitations of this
@@ -38,55 +38,24 @@ define controller_input_offset     7
 // Various RAM addresses
 define x2_coord                    0
 define y2_coord                    1
-define register_stack_pointer      100
+define register_stack_pointer      50
+define shape_vertices_edges_addr   100
+define projected_points_addr       150
 define atan_LUT_strt_addr          232
 
-//LDI r1 -8
-//LDI r2 2
-//CAL .mult
+// Other constants
+define y_axis                      0
+define x_axis                      1
+define z_axis                      2
+define focal_length               127
 
 
-LDI r2 18
-LDI r1 101
-LDI r4 0
-LDI r3 -45
-CAL .div
-STR r15 r1 4
-STR r15 r2 5
-
-LDI r2 127
-LDI r1 255
-LDI r4 0
-LDI r3 3
-CAL .div
-STR r15 r1 0
-STR r15 r2 1
-
-HLT
-
-LDI r1 0
-LDI r2 81
-LDI r3 3
-CAL .div
-STR r15 r1 2
-STR r15 r2 3
-
-LDI r1 68
-LDI r2 214
-LDI r3 99
-CAL .div
-STR r15 r1 6
-STR r15 r2 7
-
-LDI r1 64
-LDI r2 16
-LDI r3 32
-LDI r4 64
-CAL .pixel_projection
-
 // Load the arctan LUT into RAM.
 CAL .load_atan_lut
 
+// Load shape data into RAM.
+CAL .load_shape_vertices_edges
+
 // Clear the screen and number display
 LDI r15 memory_mapped_io_addr
 STR r15 r0 clear_screen_buffer_offset
@@ -95,95 +64,119 @@ STR r15 r0 unsigned_mode_offset
 STR r15 r0 clear_chars_buffer_offset
 STR r15 r0 buffer_chars_offset
 
-// Write "CORDICDEMO"
+// Write "3DROTATION"
 STR r15 r0 clear_chars_buffer_offset
-LDI r14 "C"
-STR r15 r14 write_char_offset
-LDI r14 "O"
+LDI r14 " "
 STR r15 r14 write_char_offset
 LDI r14 "R"
 STR r15 r14 write_char_offset
-LDI r14 "D"
-STR r15 r14 write_char_offset
-LDI r14 "I"
+LDI r14 "O"
 STR r15 r14 write_char_offset
-LDI r14 "C"
+LDI r14 "T"
 STR r15 r14 write_char_offset
-LDI r14 "D"
+LDI r14 "A"
 STR r15 r14 write_char_offset
-LDI r14 "E"
+LDI r14 "T"
 STR r15 r14 write_char_offset
-LDI r14 "M"
+LDI r14 "I"
 STR r15 r14 write_char_offset
 LDI r14 "O"
 STR r15 r14 write_char_offset
+LDI r14 "N"
+STR r15 r14 write_char_offset
+LDI r14 " "
+STR r15 r14 write_char_offset
 STR r15 r0 buffer_chars_offset
 
-// Store initial point of circle in RAM
-LDI r15 x2_coord
-LDI r14 31
-STR r15 r14 x2_coord
-LDI r14 16
 
-// Draw a circle using the CORDIC function
-// as a simple demonstration.  The CORDIC
-// is used to generate points on the circle
-// and lines are drawn between each point.
+// Initialize rotation angle
+LDI r1 28
+.main_loop
+
+    // Point to 3D shape vertex/edge table
+    LDI r15 shape_vertices_edges_addr
+    LOD r15 r14 0   // Load number of vertices in r14
+    INC r15         // And point to first vertice
+
+    // Load 2D projected points table address
+    LDI r13 projected_points_addr
+
+    // Now loop through all of the 3D vertices in memory to
+    // rotate and project them onto a 2D plane for display.
+    .vertice_loop
+        LOD r15 r2 0
+        LOD r15 r3 1
+        LOD r15 r4 2
+        ADI r15 3
+        LDI r5 y_axis      // Rotation axis
+
+        LDI r12 0
+        STR r12 r1  0 // Store angle in RAM
+        STR r12 r14 1 // Store number of vertices RAM
+        STR r12 r15 2 // Store vertice pointer in RAM
+        // TODO: Change rotation function so it doesn't call CORDIC
+        // i.e. move CORDIC call outside of vertice loop since I don't
+        // need to calculate it again each time
+        CAL .rotation
+        LDI r7 focal_length
+        CAL .pixel_projection
+        ADI r1 16
+        ADI r2 16
+
+        LDI r15 memory_mapped_io_addr
+        STR r15 r1 pixel_x_offset
+        STR r15 r2 pixel_y_offset
+        STR r15 r0 draw_pixel_offset
+
+        LDI r12 0
+        LOD r12 r1  0 // Store angle in RAM
+        LOD r12 r14 1 // Store number of vertices RAM
+        LOD r12 r15 2 // Store vertice pointer in RAM
+
+        DEC r14
+        BRH nz .vertice_loop
+
+    LDI r12 0
+    STR r12 r1  0 // Store angle in RAM
+    STR r12 r14 1 // Store number of vertices RAM
+    STR r12 r15 2 // Store vertice pointer in RAM
 
-// Go through angles from 0 to 200 (0 to 2*pi radians)
-LDI r13 0   // Starting angle
+    LDI r15 memory_mapped_io_addr
+    STR r15 r0 buffer_screen_offset
+    STR r15 r0 clear_screen_buffer_offset
 
-.circle_loop
-    MOV r13 r1  // Store the angle in r13 since r1 is modified by the CORDIC function
+    LDI r12 0
+    LOD r12 r1  0 // Store angle in RAM
+    LOD r12 r14 1 // Store number of vertices RAM
+    LOD r12 r15 2 // Store vertice pointer in RAM
 
-    // Use the CORDIC to calculate sine and cosine of angle (r1)
-    CAL .cordic
 
-    // Scale sine/cosine values and center on screen
-    // also move them to r3/r4 for use in draw_line function
-    LDI r10 128     // Sign bit mask
-    AND r3 r10 r5   // Grab the sign bit for y
-    RSH r3 r4
-    ADD r4 r5 r4
-    RSH r4 r4
-    ADD r4 r5 r4
-    AND r2 r10 r5   // Grab the sign bit for x
-    RSH r2 r3
-    ADD r3 r5 r3
-    RSH r3 r3
-    ADD r3 r5 r3
-    ADI r3 16
-    ADI r4 16
-
-    // Grab xy coordinates from previous iteration from RAM
-    LDI r15 x2_coord
-    LOD r15 r1 x2_coord
-    LOD r15 r2 y2_coord
-
-    // Push the r13 angle value to RAM since it gets modified inside the draw_line function
-    LDI r15 register_stack_pointer
-    STR r15 r13
-    CAL .draw_line
-    // And pop it back off when finished
-    LDI r15 register_stack_pointer
-    LOD r15 r13
 
-    // Store the x1/y1 coordinates to RAM so they can be x2/y2 next iteration
-    LDI r15 x2_coord
-    STR r15 r3 x2_coord
-    STR r15 r4 y2_coord
+
+//    // Push the r13 angle value to RAM since it gets modified inside the draw_line function
+//    LDI r15 register_stack_pointer
+//    STR r15 r13
+//    CAL .draw_line
+//    // And pop it back off when finished
+//    LDI r15 register_stack_pointer
+//    LOD r15 r13
+
+//    // Store the x1/y1 coordinates to RAM so they can be x2/y2 next iteration
+//    LDI r15 x2_coord
+//    STR r15 r3 x2_coord
+//    STR r15 r4 y2_coord
 
     // Display current angle
     LDI r15 memory_mapped_io_addr
-    STR r15 r13 show_number_offset
+    STR r15 r1 show_number_offset
 
     // Increment the angle and loop
-    ADI r13 10
+    ADI r1 1
     LDI r14 201  // Ending angle
-    CMP r13 r14
-    BRH lt .circle_loop
-    SUB r13 r14 r13
-    JMP .circle_loop
+    CMP r1 r14
+    BRH lt .main_loop
+    SUB r1 r14 r1
+    JMP .main_loop
 HLT
 
 
@@ -208,7 +201,7 @@ HLT
 //  r10 - sign bit mask/scatch
 //  r11 - pointer to arctan table
 //  r12 - quadrant flag (determines whether to negate x and/or y result)
-//
+//  r13 - scratch
 // The input angle comes in the form u2.5 with a
 // range of 0 to 2*pi. CORDICs only work for
 // +pi/2 to -pi/2 angles.  To keep things simple,
@@ -273,13 +266,13 @@ HLT
             // necessitates a loop and therefore slows down the CORDIC
             // significantly.  Multi-bit shifts are certainly possible but
             // would make the ALU much larger... as always, tradeoffs. :)
-            AND r6 r10 r14  // Grab the sign bit for x
+            AND r6 r10 r13  // Grab the sign bit for x
             AND r7 r10 r10  // Grab the sign bit for y
             .shift_loop
                 CMP r5 r0
                 BRH eq .shift_done
                 RSH r6 r6
-                ADD r6 r14 r6   // Add the sign bit after shifting
+                ADD r6 r13 r6   // Add the sign bit after shifting
                 RSH r7 r7
                 ADD r7 r10 r7   // Add the sign bit after shifting
                 DEC r5
@@ -444,16 +437,15 @@ HLT
 
 
 .mult
-// This function multiplies two 8-bit numbers together.  The result is a 16-bit
-// product, which gets stored in two registers.
+// This function multiplies a 16-bit multiplicand with an
+// 8-bit multiplier resulting in a 16-bit product.
 // Inputs:
-//    r1 = multiplicand
-//    r2 = multiplier
+//    r2:r1 = multiplicand
+//    r3 = multiplier
 // Outputs:
 //    r5:r4 = 16-bit product
 //
 // Register usage:
-//    r3 = Upper bits of multiplicand (as it gets shifted left)
 //    r6 = LSB mask
 //    r7 = carry flag (need separate flag because of oddity with LSH pseudo instruction)
 //    r8 = loop counter
@@ -462,52 +454,59 @@ HLT
     // First things first... convert multiplicand and multiplier to positive
     // values since this routinee doesn't handle negative 2's complement values
     // properly.
+// (TODO... add flag for signed vs unsigned operation)
 
-    LDI r3 128        // Sign bit mask
+    LDI r4 128        // Sign bit mask
+    LDI r5 0xFF       // All ones mask 
     LDI r9 0          // Set product sign to 0 (positive)
     LDI r6 1          // LSB mask
-    AND r1 r3 r0      // Determine if r1 is negative
-    BRH z .r1_pos
+    AND r2 r4 r0      // Determine if r2 is negative
+    BRH z .multiplicand_pos
         XOR r9 r6 r9  // Toggle the product sign flag
-        SUB r0 r1 r1  // And negate it
-    .r1_pos
-    AND r2 r3 r0      // Determine if r2 is negative
-    BRH z .r2_pos
+        XOR r5 r2 r2  // Invert all bits
+        XOR r5 r1 r1
+        INC r1        // And add 1 to low byte after negation (2's complement)
+        BRH nc .multiplicand_pos
+            INC r2    // Handle carry into high byte
+    .multiplicand_pos
+    AND r3 r4 r0      // Determine if r3 is negative
+    BRH z .multiplier_pos
         XOR r9 r6 r9  // Toggle the product sign flag
-        SUB r0 r2 r2  // And negate it
-    .r2_pos
+        SUB r0 r3 r3  // And negate it
+    .multiplier_pos
 
-    LDI r3 0          // Clear upper 8-bit of multiplicand (for later shifting)
     LDI r4 0          // Clear the product registers
     LDI r5 0
-    LDI r8 8          // Loop counter (8-bits)
+    LDI r8 8          // Initialize loop counter
 
     .mult_loop
-        AND r2 r6 r0            // Check least significant bit of multiplier
-        RSH r2 r2               // and shift it to the right by 1
+        AND r3 r6 r0            // Check least significant bit of multiplier
+        RSH r3 r3               // and shift it to the right by 1
         BRH zero .mult_no_add   // If least significant bit is 0, skip addition
         .mult_add
             ADD r1 r4 r4        // otherwise add multiplicand to product
             BRH nc .prod_nc
             ADI r5 1            // And handle carries into the upper 8-bits if needed
             .prod_nc
-                ADD r3 r5 r5
+                ADD r2 r5 r5
         .mult_no_add
         LDI r7 0                // Initialize carry flag to 0
         LSH r1 r1               // Shift multiplicand left to prep for next round
         BRH nc  .multiplicand_nc
             LDI r7 1            // If a carry occurs, flag it
         .multiplicand_nc
-        LSH r3 r3               // Now shift the upper 8-bits of the multiplicand
-        ADD r3 r7 r3            // And add back the carry bit
+        LSH r2 r2               // Now shift the upper 8-bits of the multiplicand
+        ADD r2 r7 r2            // And add back the carry bit
         DEC r8                  // Decrement the loop counter
         BRH nz .mult_loop
 
+    // TODO - add logic to handle multiplicand saturation
+
     CMP r9 r0         // Determine if product sign flag is set
     BRH z .mult_done
-        LDI r3 0xFF
-        XOR r3 r4 r4  // Negate the product
-        XOR r3 r5 r5
+        LDI r2 0xFF
+        XOR r2 r4 r4  // Negate the product
+        XOR r2 r5 r5
         INC r4        // And add 1 to low byte after negation (2's complement)
         BRH nc .mult_done
         INC r5        // Handle carry into high byte
@@ -516,8 +515,8 @@ HLT
 
 
 .div
-// This function divides a 16-bit dividend by an 8-bit divisor.
-// It results in an 8-bit quotient and remainder.
+// This function divides a 16-bit dividend by a 16-bit divisor.
+// It results in a 16-bit quotient and an 8-bit remainder.
 // Inputs:
 //    r2:r1 = Dividend (numerator)
 //    r4:r3 = Divisor  (denominator)
@@ -526,14 +525,32 @@ HLT
 //    r3    = Remainder
 // Register usage:
 //    r6:r5 = Remainder temp
-//    r8:r7 = Quotient temp (TBD... can use dividend register to be more efficient
+//    r8:r7 = Quotient temp (TODO... can use dividend register to use less resources)
 //    r9-10 = scratch
 //    r11   = loop counter
 //    r12   = quotient_sign
 //
-// The algorithm implemented below is detailed in the following
-// video:
+// This code implements a non-restoring division algorithm, which is detailed in the
+// following video:
 //    www.youtube.com/watch?v=7m6I7_3XdZ8
+//
+// Below is a rough block diagram:
+//
+//        +---------------------+
+//        |   16-bit Divisor    | (r4:r3)
+//        +---------------------+
+//                |
+//                |    +------------+
+//                +--->|     ALU    |
+//    +--------------->| (subtract) |
+//    |                +------------+
+//    |                    |       
+//    |                    |       
+//    |   +-----------------------------------------------+
+//    +---| Remainder Reg (r6:r5) | Dividend Reg (r2:r1)  |<-- Quotient (shifted in)
+//        +-----------------------------------------------+
+//                       <-- shifted left  
+//        
 //
     // Check to see if both bytes of divisor are zero; if so,
     // a divide-by-zero error occurred.  Use of the NOR instruction
@@ -580,7 +597,7 @@ HLT
     LDI r8  0
     LDI r9  0      // Carry flag for low byte shifts
     LDI r10 0      // Carry flag for high byte shifts
-    LDI r11 16     // Initialize loop counter to 16 (TBD)
+    LDI r11 16     // Initialize loop counter to 16
 
     .div_loop
         // Shift dividend left
@@ -667,7 +684,7 @@ HLT
     // Determine if the final quotient result should be negative.
     // (We'll go ahead and leave the remainder as an unsigned value)
     CMP r12 r0         // Determine if quotient sign flag is set
-    BRH z .div_done_done
+    BRH z .div_done
         LDI r9 0xFF
         XOR r9 r2 r2  // Negate the quotient
         XOR r9 r1 r1
@@ -685,6 +702,7 @@ HLT
 
 .pixel_projection
 // The following function is used to project a 3D point in space onto a 2D plane.
+//
 // Given a 3D coordinate [x,y,z], it'll return x_projected and y_projected by
 // calculating the following:
 //
@@ -692,45 +710,392 @@ HLT
 // y_projected = (focal_length * y)/(focal length + z)
 //
 // Inputs:
-// r1 = x
-// r2 = y
-// r3 = z
-// r4 = focal_length
+//    r2:r1 = x
+//    r4:r3 = y
+//    r6:r5 = z
+//    r7 = focal_length
 //
 // Outputs:
-// r1 = x_projected
-// r2 = y_projected
+//    r1 = x_projected
+//    r2 = y_projected
 //
 // Registers:
-// r7:r6 = focal_length + z
-// r9:r8 = focal_length * [x,y]
-
-    LDI r7 0        // TODO - I probably should handle carries here... though it's unlikely
-    ADD r4 r3 r6    // r6 = focal_length + z
+//    r9:r8 = focal_length + z
+//    r15 = base pointer for the temporary RAM storage defined below
+//          (register_stack_pointer + 8; r14 is not referenced by this routine)
+
+    // RAM offsets for temporary RAM storage (from r15)
+    define pp_x_low           -8  // x_low
+    define pp_x_high          -7  // x_high
+    define pp_y_low           -6  // y_low
+    define pp_y_high          -5  // y_high
+    define pp_z_low           -4  // z_low
+    define pp_z_high          -3  // z_high
+    define pp_x_projected     -2  // x projected
+    define pp_y_projected     -1  // y projected
+    define pp_fl               0   // focal_length
+    define pp_fl_plus_z_low    1   // focal_length + z (low)
+    define pp_fl_plus_z_high   2   // focal_length + z (high)
+
+    MOV r6 r9       // Move z high byte to r9
+    ADD r7 r5 r8    // r8 = focal_length + z
+    BRH nc .fl_z_nc  // Handle carries into upper byte (TODO)
+        INC r9
+    .fl_z_nc
 
     // Push registers on the stack
     LDI r15 register_stack_pointer
-    STR r15 r1 0
-    STR r15 r2 1
-    STR r15 r3 2
-    STR r15 r4 3
-    STR r15 r6 4
-    STR r15 r7 5
-
-    // The mult function expects multiplier and multiplicand to be in r1 and r2
-    // so move stuff around as required.
-    MOV r4 r2   // Move focal_length to r2
+    ADI r15 8
+    STR r15 r1 pp_x_low
+    STR r15 r2 pp_x_high
+    STR r15 r3 pp_y_low
+    STR r15 r4 pp_y_high
+    STR r15 r7 pp_fl
+    STR r15 r8 pp_fl_plus_z_low
+    STR r15 r9 pp_fl_plus_z_high
+
+    // Calculate x_projected
+    LOD r15 r3 pp_fl
     CAL .mult
-    // Move results for div function.
     MOV r5 r2
     MOV r4 r1
-    LOD r15 r3 4
-    LOD r15 r4 5
+    LOD r15 r3 pp_fl_plus_z_low
+    LOD r15 r4 pp_fl_plus_z_high
     CAL .div
-    NOP
 
-HLT
-RET
+    // Trunc the results
+    LDI r3 2
+    CAL .trunc
+    STR r15 r1 pp_x_projected
+
+    // Calculate y_projected
+    LOD r15 r1 pp_y_low
+    LOD r15 r2 pp_y_high
+    LOD r15 r3 pp_fl
+    CAL .mult
+    MOV r5 r2
+    MOV r4 r1
+    LOD r15 r3 pp_fl_plus_z_low
+    LOD r15 r4 pp_fl_plus_z_high
+    CAL .div
+
+    // Now truncate the results
+    LDI r3 2
+    CAL .trunc
+    STR r15 r1 pp_y_projected
+
+    // Move x_projected and y_projected to output registers
+    MOV r1 r2                   // y_projected -> r2
+    LOD r15 r1 pp_x_projected  // x_projected -> r1
+    RET
+
+
+.rotation
+// This function applies the following rotation matrix to
+// a set of two coordinates:
+//
+//           +-               -+
+//           | cos(A)  -sin(A) |
+//      Rz = |                 |
+//           | sin(A)   cos(A) |
+//           +-               -+
+//
+// Given a 3D coordinate (composed of xyz values), the
+// coordinate pair utilized can be selected based on the
+// coordinate pair selection input as follows:
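+//    y_axis = [x,z]  (also the default for any unrecognized value)
+//    x_axis = [y,z]  NOTE(review): .rstr_yz_sel loads coord2 into x and coord3 into z; looks swapped -- confirm
+//    z_axis = [x,y]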
+//
+// Inputs:
+//    r1  = rotation angle in radians (fixed point in the form u2.5)
+//          (Values between 0 and 2*pi are supported.)  // TODO... make input and outputs consistent
+//    r2 = x
+//    r3 = y
+//    r4 = z
+//    r5 = rotation axis
+//
+// Outputs:   
+//    r2:r1 = rotated_x
+//    r4:r3 = rotated_y
+//    r6:r5 = rotated_z
+// Registers:
+//    r1 - r9, r12 = scratch (called routines clobber more)
+//    r14 = stack pointer (offset 16 from r15)
+//    r15 = stack pointer
+
+    // RAM offsets for temporary storage (offset from r15)
+    define rot_coord1          0 // 1st rotation coordinate
+    define rot_coord2          1 // 2nd rotation coordinate
+    define rot_coord3          2 // 3rd rotation coordinate (fixed)
+    define rot_axis_sel        3 // rotation axis
+    define rot_cosine_high     4 // Cosine
+    define rot_cosine_low      5
+    define rot_sine_high       6 // Sine
+    define rot_sine_low        7
+
+    define coord1_x_cosine_low  -1
+    define coord1_x_cosine_high -2
+    define coord2_x_sine_low    -3
+    define coord2_x_sine_high   -4
+    define coord1_x_sine_low    -5
+    define coord1_x_sine_high   -6
+    define coord2_x_cosine_low  -7
+    define coord2_x_cosine_high -8
+
+    // RAM offset for temp storage (offset from r14)
+    define rot_x_high       -8
+    define rot_x_low        -7
+    define rot_y_high       -6
+    define rot_y_low        -5
+    define rot_z_high       -4
+    define rot_z_low        -3
+    define rot_coord1_high  -2
+    define rot_coord1_low   -1
+    define rot_coord2_high   0
+    define rot_coord2_low    1
+    define rot_coord3_high   2
+    define rot_coord3_low    3
+
+    // Set up pointer for temporary storage
+    LDI r15 register_stack_pointer
+    ADI r15 8
+    MOV r15 r14
+    ADI r14 8
+    ADI r14 8
+
+    // Organize input coordinates in RAM according to the
+    // rotation axis selection input (r5)
+    STR r15 r5 rot_axis_sel
+    LDI r6 x_axis
+    CMP r5 r6
+    BRH z .go_x_axis_sel
+    LDI r6 z_axis
+    CMP r5 r6
+    BRH z .go_z_axis_sel
+    .go_y_axis_sel  // Default
+        STR r15 r2 rot_coord1 
+        STR r15 r4 rot_coord2
+        STR r15 r3 rot_coord3 
+        JMP .calc_cordic
+    .go_z_axis_sel
+        STR r15 r2 rot_coord1 
+        STR r15 r3 rot_coord2
+        STR r15 r4 rot_coord3
+        JMP .calc_cordic
+    .go_x_axis_sel
+        STR r15 r3 rot_coord1 
+        STR r15 r4 rot_coord2
+        STR r15 r2 rot_coord3
+
+    .calc_cordic
+    // Call the CORDIC function to calculate sine and cosine for the
+    // rotation angle
+    CAL .cordic
+
+    // Convert them to 16-bit values for the multiplications below.
+    LDI r12 128
+    LDI r1 0
+    AND r12 r2 r0     // Check sign bit
+    BRH z .cosine_pos
+        LDI r1 0xFF
+    .cosine_pos
+    STR r15 r1 rot_cosine_high 
+    STR r15 r2 rot_cosine_low
+    LDI r1 0
+    AND r12 r3 r0     // Check sign bit
+    BRH z .sine_pos
+        LDI r1 0xFF
+    .sine_pos
+    STR r15 r1 rot_sine_high 
+    STR r15 r3 rot_sine_low
+
+    // Now... calculate the four terms in the rotation and store in RAM
+    // Calculate coord1 * cosine
+    LOD r15 r1 rot_cosine_low
+    LOD r15 r2 rot_cosine_high
+    LOD r15 r3 rot_coord1
+    CAL .mult
+    STR r15 r4 coord1_x_cosine_low
+    STR r15 r5 coord1_x_cosine_high
+
+    // Calculate coord1 * sine
+    LOD r15 r1 rot_sine_low
+    LOD r15 r2 rot_sine_high
+    LOD r15 r3 rot_coord1
+    CAL .mult
+    STR r15 r4 coord1_x_sine_low
+    STR r15 r5 coord1_x_sine_high
+
+    // Calculate coord2 * sine
+    LOD r15 r1 rot_sine_low
+    LOD r15 r2 rot_sine_high
+    LOD r15 r3 rot_coord2
+    CAL .mult
+    STR r15 r4 coord2_x_sine_low
+    STR r15 r5 coord2_x_sine_high
+
+    // Calculate coord2 * cosine
+    LOD r15 r1 rot_cosine_low
+    LOD r15 r2 rot_cosine_high
+    LOD r15 r3 rot_coord2
+    CAL .mult
+    STR r15 r4 coord2_x_cosine_low
+    STR r15 r5 coord2_x_cosine_high
+
+    // Calculate coord1_rotation = coord1*cosine(A) +/- coord2*sine(A)
+    LOD r15 r3 coord1_x_cosine_low
+    LOD r15 r4 coord1_x_cosine_high
+    LOD r15 r5 coord2_x_sine_low
+    LOD r15 r6 coord2_x_sine_high
+
+    // Perform 16-bit addition/subtraction (emulating SBB)
+    // Addition/subtraction depends on the axis of rotation TODO better comments
+    LOD r15 r8 rot_axis_sel   // Restore axis selection
+    LDI r9 y_axis
+    CMP r8 r9
+    BRH z .coord1_add
+    .coord1_sub           // For rotations around axis x & z, subtraction is performed
+        LDI r7 0
+        SUB r3 r5 r3      // Subtract low bytes
+        BRH c .rotx_no_borrow
+            LDI r7 1
+        .rotx_no_borrow
+        SUB r4 r6 r4      // Subtract high bytes
+        SUB r4 r7 r4      // Handle borrow
+        JMP .coord1_trunc
+    .coord1_add           // For rotations around y, the two terms are added together
+        LDI r7 0
+        ADD r3 r5 r3      // Add low bytes
+        BRH nc .rotx_no_carry
+            LDI r7 1
+        .rotx_no_carry
+        ADD r4 r6 r4      // Add the high bytes
+        ADD r4 r7 r4      // Handle carry
+
+    .coord1_trunc
+    // Move and truncate the results (TODO remove truncation here)
+    MOV r3 r1
+    MOV r4 r2
+    LDI r3 6
+    CAL .trunc
+    STR r14 r1 rot_coord1_low
+    STR r14 r2 rot_coord1_high
+
+    // Calculate coord2_rotation = coord2*cosine(A) +- coord1*sine(A)
+    LOD r15 r3 coord1_x_sine_low
+    LOD r15 r4 coord1_x_sine_high
+    LOD r15 r5 coord2_x_cosine_low
+    LOD r15 r6 coord2_x_cosine_high
+
+    // Perform 16-bit addition/subtraction TODO better comments
+    LOD r15 r8 rot_axis_sel   // Restore axis selection
+    LDI r9 y_axis
+    CMP r8 r9
+    BRH z .coord2_sub
+    .coord2_add     // For rotations around axis x & z, adding is performed
+        LDI r7 0
+        ADD r3 r5 r3      // Add low bytes
+        BRH nc .roty_no_carry
+            LDI r7 1
+        .roty_no_carry
+        ADD r4 r6 r4      // Add high bytes
+        ADD r4 r7 r4      // And handle carry
+        JMP .coord2_trunc
+    .coord2_sub           // For rotations around axis y, subtraction is performed
+        LDI r7 0
+        SUB r5 r3 r3      // Subtract low bytes
+        BRH c .roty_no_borrow
+            LDI r7 1
+        .roty_no_borrow
+        SUB r6 r4 r4      // Subtract high bytes
+        SUB r4 r7 r4      // Handle borrow
+    .coord2_trunc
+
+    // Move and truncate the results (TODO remove truncation here)
+    MOV r3 r1
+    MOV r4 r2
+    LDI r3 6
+    CAL .trunc
+    STR r14 r1 rot_coord2_low
+    STR r14 r2 rot_coord2_high
+
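+    // Widen the fixed coordinate to a signed 16-bit value with a multiply by 1.  It is not scaled by 2^6
+    // (see the commented-out LDI), presumably because the rotated coordinates are shifted right by 6 above. TODO FIX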
+//    LDI r1 64
+    LDI r1 1
+    LDI r2 0
+    LOD r15 r3 rot_coord3
+    CAL .mult
+    STR r14 r4 rot_coord3_low
+    STR r14 r5 rot_coord3_high
+
+    // Organize outputs coordinates based on the rotation axis selection.
+    LOD r15 r5 rot_axis_sel   // Restore axis selection
+    LDI r6 y_axis
+    CMP r5 r6
+    BRH z .rstr_xz_sel
+    LDI r6 x_axis
+    CMP r5 r6
+    BRH z .rstr_yz_sel
+    .rstr_x_axis // Default selection
+        LOD r14 r2 rot_coord1_high
+        LOD r14 r1 rot_coord1_low
+        LOD r14 r4 rot_coord2_high
+        LOD r14 r3 rot_coord2_low
+        LOD r14 r6 rot_coord3_high 
+        LOD r14 r5 rot_coord3_low 
+        JMP .rot_exit
+    .rstr_xz_sel
+        LOD r14 r2 rot_coord1_high
+        LOD r14 r1 rot_coord1_low
+        LOD r14 r4 rot_coord3_high 
+        LOD r14 r3 rot_coord3_low 
+        LOD r14 r6 rot_coord2_high
+        LOD r14 r5 rot_coord2_low
+        JMP .rot_exit
+    .rstr_yz_sel
+        LOD r14 r2 rot_coord2_high
+        LOD r14 r1 rot_coord2_low
+        LOD r14 r6 rot_coord3_high
+        LOD r14 r5 rot_coord3_low
+        LOD r14 r4 rot_coord1_high 
+        LOD r14 r3 rot_coord1_low 
+    .rot_exit
+    RET
+
+
+.trunc
+// The following function performs simple truncation of a
+// 16-bit input value.  The number of bits truncated off
+// is configurable.
+// Inputs:
+//    r2:r1 = 16-bit input
+//    r3    = Number of bits to truncate
+// Registers:
+//    r4    = Bit shifted from high to low
+//    r5    = Sign bit
+//    r6    = scratch
+//    TODO consider adding rounding to this logic and change name from trunc to round if so
+
+    LDI r6 128
+    AND r2 r6 r5        // Grab the sign bit
+    LDI r6 1
+    .trunc_loop
+        CMP r3 r0
+        BRH eq .trunc_done
+        AND r2 r6 r4    // Grab bit shifted from high to low byte
+        RSH r2 r2
+        ADD r2 r5 r2    // Add the sign bit after shifting
+        RSH r1 r1
+        CMP r4 r6       // Did a '1' move from high to low byte?
+        BRH ne .next_trunc_iteration
+            ADI r1 128  // If so, add it back
+        .next_trunc_iteration
+        DEC r3
+        JMP .trunc_loop
+    .trunc_done
+    RET
 
 
 // Load the arctangent look-up table into RAM.
@@ -752,3 +1117,107 @@ RET
     LDI r14 1      // arctan(2^-6) = ~1/2^6
     STR r15 r14 6
     RET
+
+.load_shape_vertices_edges
+    // Cube shape: 8 vertices at (+/-32, +/-32, +/-32)
+    LDI r15 shape_vertices_edges_addr
+    LDI r14 8     // Number of vertices
+    STR r15 r14 0 
+    LDI r14 32    // x0
+    STR r15 r14 1
+    LDI r14 32    // y0
+    STR r15 r14 2
+    LDI r14 32   // z0
+    STR r15 r14 3
+    LDI r14 32   // x1
+    STR r15 r14 4
+    LDI r14 32    // y1
+    STR r15 r14 5
+    LDI r14 -32    // z1
+    STR r15 r14 6
+    LDI r14 32   // x2
+    STR r15 r14 7
+
+    ADI r15 8
+    LDI r14 -32   // y2
+    STR r15 r14 0
+    LDI r14 32    // z2
+    STR r15 r14 1
+    LDI r14 32    // x3
+    STR r15 r14 2
+    LDI r14 -32   // y3
+    STR r15 r14 3
+    LDI r14 -32    // z3
+    STR r15 r14 4
+    LDI r14 -32     // x4
+    STR r15 r14 5
+    LDI r14 32     // y4
+    STR r15 r14 6
+    LDI r14 32  // z4
+    STR r15 r14 7
+
+    ADI r15 8
+    LDI r14 -32   // x5
+    STR r15 r14 0
+    LDI r14 32    // y5
+    STR r15 r14 1
+    LDI r14 -32    // z5
+    STR r15 r14 2
+    LDI r14 -32   // x6
+    STR r15 r14 3
+    LDI r14 -32    // y6
+    STR r15 r14 4
+    LDI r14 32     // z6
+    STR r15 r14 5
+    LDI r14 -32     // x7
+    STR r15 r14 6
+    LDI r14 -32  // y7
+    STR r15 r14 7
+
+    ADI r15 8
+    LDI r14 -32  // z7
+    STR r15 r14 0
+
+//
+//
+//
+//    ADI r15 8
+//    LDI r14 8     // Number of edges
+//    STR r15 r14 0
+//    LDI r14 0     // Edge 0 (vertices 0,1)
+//    STR r15 r14 1
+//    LDI r14 1
+//    STR r15 r14 2
+//    LDI r14 1     // Edge 1 (vertices 1,2)
+//    STR r15 r14 3
+//    LDI r14 2
+//    STR r15 r14 4
+//    LDI r14 2     // Edge 2 (vertices 2,3)
+//    STR r15 r14 5
+//    LDI r14 3
+//    STR r15 r14 6
+//    LDI r14 3     // Edge 3 (vertices 3,0)
+//    STR r15 r14 7
+//
+//    ADI r15 8
+//    LDI r14 0
+//    STR r15 r14 0
+//    LDI r14 0     // Edge 4 (vertices 0,4)
+//    STR r15 r14 1
+//    LDI r14 4
+//    STR r15 r14 2
+//    LDI r14 1     // Edge 5 (vertices 1,4)
+//    STR r15 r14 3
+//    LDI r14 4
+//    STR r15 r14 4
+//    LDI r14 2     // Edge 6 (vertices 2,4)
+//    STR r15 r14 5
+//    LDI r14 4
+//    STR r15 r14 6
+//    LDI r14 3     // Edge 7 (vertices 3,4)
+//    STR r15 r14 7
+//
+//    ADI r15 8
+//    LDI r14 4
+//    STR r15 r14 0
+    RET

From b5ab06b81fb699b63ac0b5a1facf754e9fd0b778 Mon Sep 17 00:00:00 2001
From: Dave <83719612+DaveJWalker@users.noreply.github.com>
Date: Sun, 15 Sep 2024 13:06:30 -0700
Subject: [PATCH 05/11] Added edge line draw functionality.

The code still requires a lot of cleanup and some optimization.  In particular, the CORDIC function call needs to move out of the per-vertex loop (.vertice_loop) to improve performance, since it does not need to be recalculated for every vertex.
---
 programs/wireframe.as | 333 +++++++++++++++++++++++++++---------------
 1 file changed, 215 insertions(+), 118 deletions(-)

diff --git a/programs/wireframe.as b/programs/wireframe.as
index 2c11a09..ca42932 100644
--- a/programs/wireframe.as
+++ b/programs/wireframe.as
@@ -47,7 +47,7 @@ define atan_LUT_strt_addr          232
 define y_axis                      0
 define x_axis                      1
 define z_axis                      2
-define focal_length               127
+define focal_length                127
 
 
 // Load the arctan LUT into RAM.
@@ -110,10 +110,12 @@ LDI r1 28
         ADI r15 3
         LDI r5 y_axis      // Rotation axis
 
-        LDI r12 0
-        STR r12 r1  0 // Store angle in RAM
-        STR r12 r14 1 // Store number of vertices RAM
-        STR r12 r15 2 // Store vertice pointer in RAM
+// TODO define all of these offsets as constants
+        STR r0 r1  0 // Store angle in RAM
+        STR r0 r14 1 // Store number of vertices RAM
+        STR r0 r15 2 // Store vertice pointer in RAM
+        STR r0 r13 3 // Store the projected points pointer in RAM
+
         // TODO: Change rotation function so it doesn't call CORDIC
         // i.e. move CORDIC call outside of vertice loop since I don't
         // need to calculate it again each time
@@ -123,34 +125,83 @@ LDI r1 28
         ADI r1 16
         ADI r2 16
 
-        LDI r15 memory_mapped_io_addr
-        STR r15 r1 pixel_x_offset
-        STR r15 r2 pixel_y_offset
-        STR r15 r0 draw_pixel_offset
+        // Store the projected points in RAM
+        LDI r12 0
+        LOD r12 r13 3
+        STR r13 r1 0
+        STR r13 r2 1
+        ADI r13 2
+        STR r12 r13 3
+
+        // TODO get rid of this...
+//        LDI r15 memory_mapped_io_addr
+//        STR r15 r1 pixel_x_offset
+//        STR r15 r2 pixel_y_offset
+//        STR r15 r0 draw_pixel_offset
+//        STR r15 r0 buffer_screen_offset
 
         LDI r12 0
-        LOD r12 r1  0 // Store angle in RAM
-        LOD r12 r14 1 // Store number of vertices RAM
-        LOD r12 r15 2 // Store vertice pointer in RAM
+        LOD r12 r1  0 // Recall angle from RAM
+        LOD r12 r14 1 // Restore number of vertices from RAM
+        LOD r12 r15 2 // Restore vertice pointer from RAM
 
         DEC r14
         BRH nz .vertice_loop
 
-    LDI r12 0
-    STR r12 r1  0 // Store angle in RAM
-    STR r12 r14 1 // Store number of vertices RAM
-    STR r12 r15 2 // Store vertice pointer in RAM
-
-    LDI r15 memory_mapped_io_addr
-    STR r15 r0 buffer_screen_offset
-    STR r15 r0 clear_screen_buffer_offset
-
-    LDI r12 0
-    LOD r12 r1  0 // Store angle in RAM
-    LOD r12 r14 1 // Store number of vertices RAM
-    LOD r12 r15 2 // Store vertice pointer in RAM
-
+    // Now loop through all shape edges to draw lines
+    // First grab the number of edges
+    // r15 - pointer to shape table
+    // r6:r5 - vertice pair for each edge
+    // r12 - pointer to projected points table
+    // r13 - number of vertices
+    // r11 - number of projected points left
+    // r10 - Index into projected points table
+    // r14 - Number of shape edges
+    LOD r15 r14 0
+    INC r15
+    .edge_loop
+
+//CAL .wait_for_user
+        // Load vertice pair index for edge
+        LOD r15 r5 0
+        LOD r15 r6 1
+        ADI r15 2
+        LOD r0 r13 1 // Get number of vertices
+        STR r0 r15 2
+        STR r0 r14 7
+        LDI r11 2    // Keep track of number of projected points to grab
+        LDI r10 0
+        LDI r12 projected_points_addr
+        .get_projected_xy
+            CMP r5 r10
+            BRH eq .store_projected_x0y0
+            CMP r6 r10
+            BRH eq .store_projected_x1y1
+            JMP .next_projected_xy
+            .store_projected_x0y0
+                LOD r12 r1 0
+                LOD r12 r2 1
+                DEC r11
+                JMP .next_projected_xy
+            .store_projected_x1y1
+                LOD r12 r3 0
+                LOD r12 r4 1
+                DEC r11
+            .next_projected_xy
+            ADI r12 2
+            INC r10
+            CMP r11 r0
+            BRH nz .get_projected_xy
+
+            CAL .draw_line
+//          LDI r12 0
+            LOD r0 r15 2  // TODO... everywhere I use zero as an address needs r0
+            LOD r0 r14 7
+        DEC r14
+        BRH nz .edge_loop
 
+        LDI r12 0
+        LOD r12 r15 2
 
 
 //    // Push the r13 angle value to RAM since it gets modified inside the draw_line function
@@ -169,8 +220,13 @@ LDI r1 28
     // Display current angle
     LDI r15 memory_mapped_io_addr
     STR r15 r1 show_number_offset
+    STR r15 r0 buffer_screen_offset
+    STR r15 r0 clear_screen_buffer_offset
+
+//CAL .wait_for_user
 
     // Increment the angle and loop
+    LOD r0 r1  0 // Load angle
     ADI r1 1
     LDI r14 201  // Ending angle
     CMP r1 r14
@@ -180,6 +236,18 @@ LDI r1 28
 HLT
 
 
+.wait_for_user
+// This function waits until the user presses one of the controller inputs.
+// Since the current VM doesn't have breakpoints, I use this function to
+// effectively add breakpoints to the code.
+    LDI r10 memory_mapped_io_addr
+    .wait_for_user_loop
+        LOD r10 r9 controller_input_offset
+        CMP r9 r0
+        BRH eq .wait_for_user_loop
+    RET
+
+
 .cordic
 // CORDIC function computes sine and cosine of angle.
 // Input:
@@ -303,8 +371,6 @@ HLT
             BRH nz .cordic_loop
 
         // Adjust xy outputs accordingly based on quadrant
-
-        // Adjust x?
         .check_x_negate
         LDI r5 0b10
         AND r12 r5 r0
@@ -428,12 +494,11 @@ HLT
         .next_pixel
             INC r14       // Increment loop counter
             CMP r14 r5    // Exit loop when i > dx
-            BRH ge .buffer_screen
+            BRH ge .draw_line_exit
             JMP .draw_line_loop
 
-    .buffer_screen
-        STR r15 r0 buffer_screen_offset
-        RET
+    .draw_line_exit
+    RET
 
 
 .mult
@@ -457,7 +522,7 @@ HLT
 // (TODO... add flag for signed vs unsigned operation)
 
     LDI r4 128        // Sign bit mask
-    LDI r5 0xFF       // All ones mask 
+    LDI r5 0xFF       // All ones mask
     LDI r9 0          // Set product sign to 0 (positive)
     LDI r6 1          // LSB mask
     AND r2 r4 r0      // Determine if r2 is negative
@@ -544,13 +609,13 @@ HLT
 //                +--->|     ALU    |
 //    +--------------->| (subtract) |
 //    |                +------------+
-//    |                    |       
-//    |                    |       
+//    |                    |
+//    |                    |
 //    |   +-----------------------------------------------+
 //    +---| Remainder Reg (r6:r5) | Dividend Reg (r2:r1)  |<-- Quotient (shifted in)
 //        +-----------------------------------------------+
-//                       <-- shifted left  
-//        
+//                       <-- shifted left
+//
 //
     // Check to see if both bytes of divisor are zero; if so,
     // a divide-by-zero error occurred.  Use of the NOR instruction
@@ -763,9 +828,9 @@ HLT
     LOD r15 r4 pp_fl_plus_z_high
     CAL .div
 
-    // Trunc the results
+    // round the results
     LDI r3 2
-    CAL .trunc
+    CAL .round
     STR r15 r1 pp_x_projected
 
     // Calculate y_projected
@@ -779,9 +844,9 @@ HLT
     LOD r15 r4 pp_fl_plus_z_high
     CAL .div
 
-    // Now truncate the results
+    // Now round the results
     LDI r3 2
-    CAL .trunc
+    CAL .round
     STR r15 r1 pp_y_projected
 
     // Move x_projected and y_projected to output registers
@@ -815,7 +880,7 @@ HLT
 //    r4 = z
 //    r5 = rotation axis
 //
-// Outputs:   
+// Outputs:
 //    r2:r1 = rotated_x
 //    r4:r3 = rotated_y
 //    r6:r5 = rotated_z
@@ -874,17 +939,17 @@ HLT
     CMP r5 r6
     BRH z .go_z_axis_sel
     .go_y_axis_sel  // Default
-        STR r15 r2 rot_coord1 
+        STR r15 r2 rot_coord1
         STR r15 r4 rot_coord2
-        STR r15 r3 rot_coord3 
+        STR r15 r3 rot_coord3
         JMP .calc_cordic
     .go_z_axis_sel
-        STR r15 r2 rot_coord1 
+        STR r15 r2 rot_coord1
         STR r15 r3 rot_coord2
         STR r15 r4 rot_coord3
         JMP .calc_cordic
     .go_x_axis_sel
-        STR r15 r3 rot_coord1 
+        STR r15 r3 rot_coord1
         STR r15 r4 rot_coord2
         STR r15 r2 rot_coord3
 
@@ -900,14 +965,14 @@ HLT
     BRH z .cosine_pos
         LDI r1 0xFF
     .cosine_pos
-    STR r15 r1 rot_cosine_high 
+    STR r15 r1 rot_cosine_high
     STR r15 r2 rot_cosine_low
     LDI r1 0
     AND r12 r3 r0     // Check sign bit
     BRH z .sine_pos
         LDI r1 0xFF
     .sine_pos
-    STR r15 r1 rot_sine_high 
+    STR r15 r1 rot_sine_high
     STR r15 r3 rot_sine_low
 
     // Now... calculate the four terms in the rotation and store in RAM
@@ -963,7 +1028,7 @@ HLT
         .rotx_no_borrow
         SUB r4 r6 r4      // Subtract high bytes
         SUB r4 r7 r4      // Handle borrow
-        JMP .coord1_trunc
+        JMP .coord1_round
     .coord1_add           // For rotations around y, the two terms are added together
         LDI r7 0
         ADD r3 r5 r3      // Add low bytes
@@ -973,12 +1038,12 @@ HLT
         ADD r4 r6 r4      // Add the high bytes
         ADD r4 r7 r4      // Handle carry
 
-    .coord1_trunc
-    // Move and truncate the results (TODO remove truncation here)
+    .coord1_round
+    // Move and round the results
     MOV r3 r1
     MOV r4 r2
     LDI r3 6
-    CAL .trunc
+    CAL .round
     STR r14 r1 rot_coord1_low
     STR r14 r2 rot_coord1_high
 
@@ -1001,7 +1066,7 @@ HLT
         .roty_no_carry
         ADD r4 r6 r4      // Add high bytes
         ADD r4 r7 r4      // And handle carry
-        JMP .coord2_trunc
+        JMP .coord2_round
     .coord2_sub           // For rotations around axis y, subtraction is performed
         LDI r7 0
         SUB r5 r3 r3      // Subtract low bytes
@@ -1010,13 +1075,13 @@ HLT
         .roty_no_borrow
         SUB r6 r4 r4      // Subtract high bytes
         SUB r4 r7 r4      // Handle borrow
-    .coord2_trunc
+    .coord2_round
 
-    // Move and truncate the results (TODO remove truncation here)
+    // Move and round the results
     MOV r3 r1
     MOV r4 r2
     LDI r3 6
-    CAL .trunc
+    CAL .round
     STR r14 r1 rot_coord2_low
     STR r14 r2 rot_coord2_high
 
@@ -1043,14 +1108,14 @@ HLT
         LOD r14 r1 rot_coord1_low
         LOD r14 r4 rot_coord2_high
         LOD r14 r3 rot_coord2_low
-        LOD r14 r6 rot_coord3_high 
-        LOD r14 r5 rot_coord3_low 
+        LOD r14 r6 rot_coord3_high
+        LOD r14 r5 rot_coord3_low
         JMP .rot_exit
     .rstr_xz_sel
         LOD r14 r2 rot_coord1_high
         LOD r14 r1 rot_coord1_low
-        LOD r14 r4 rot_coord3_high 
-        LOD r14 r3 rot_coord3_low 
+        LOD r14 r4 rot_coord3_high
+        LOD r14 r3 rot_coord3_low
         LOD r14 r6 rot_coord2_high
         LOD r14 r5 rot_coord2_low
         JMP .rot_exit
@@ -1059,42 +1124,61 @@ HLT
         LOD r14 r1 rot_coord2_low
         LOD r14 r6 rot_coord3_high
         LOD r14 r5 rot_coord3_low
-        LOD r14 r4 rot_coord1_high 
-        LOD r14 r3 rot_coord1_low 
+        LOD r14 r4 rot_coord1_high
+        LOD r14 r3 rot_coord1_low
     .rot_exit
     RET
 
 
-.trunc
-// The following function performs simple truncation of a
-// 16-bit input value.  The number of bits truncated off
-// is configurable.
+.round
+// The following function performs rounding of the 16-bit input value.
+// Round is done by right shifting the value for one minus the total
+// number of bits to drop (held in r3).  Then, 1 is added/subtracted to
+// the value before truncating off the final bit, which effectively rounds
+// to the nearest integer.
 // Inputs:
 //    r2:r1 = 16-bit input
-//    r3    = Number of bits to truncate
+//    r3    = Number of bits to round off
 // Registers:
 //    r4    = Bit shifted from high to low
 //    r5    = Sign bit
-//    r6    = scratch
-//    TODO consider adding rounding to this logic and change name from trunc to round if so
+//    r7:r6 = scratch
 
     LDI r6 128
     AND r2 r6 r5        // Grab the sign bit
     LDI r6 1
-    .trunc_loop
+    .round_loop
         CMP r3 r0
-        BRH eq .trunc_done
+        BRH eq .round_done
+        CMP r3 r6
+        BRH ne .round_shift
+        .round_final_bit
+            CMP r5 r6
+            BRH eq .round_neg
+            .round_pos
+                INC r1
+                BRH nc .round_no_carry
+                    INC r2
+                .round_no_carry
+                JMP .round_shift
+            .round_neg
+                DEC r1
+                BRH c .round_no_borrow
+                    DEC r2
+                .round_no_borrow
+                JMP .round_shift
+        .round_shift
         AND r2 r6 r4    // Grab bit shifted from high to low byte
         RSH r2 r2
         ADD r2 r5 r2    // Add the sign bit after shifting
         RSH r1 r1
         CMP r4 r6       // Did a '1' move from high to low byte?
-        BRH ne .next_trunc_iteration
+        BRH ne .next_round_iteration
             ADI r1 128  // If so, add it back
-        .next_trunc_iteration
+        .next_round_iteration
         DEC r3
-        JMP .trunc_loop
-    .trunc_done
+        JMP .round_loop
+    .round_done
     RET
 
 
@@ -1122,7 +1206,7 @@ HLT
     // Square pyramid shape
     LDI r15 shape_vertices_edges_addr
     LDI r14 8     // Number of vertices
-    STR r15 r14 0 
+    STR r15 r14 0
     LDI r14 32    // x0
     STR r15 r14 1
     LDI r14 32    // y0
@@ -1177,47 +1261,60 @@ HLT
     ADI r15 8
     LDI r14 -32  // z7
     STR r15 r14 0
+    LDI r14 12     // Number of edges
+    STR r15 r14 1
+    LDI r14 0     // Edge 0 (vertices 0,1)
+    STR r15 r14 2
+    LDI r14 1
+    STR r15 r14 3
+    LDI r14 1     // Edge 1 (vertices 1,3)
+    STR r15 r14 4
+    LDI r14 3
+    STR r15 r14 5
+    LDI r14 2     // Edge 2 (vertices 2,3)
+    STR r15 r14 6
+    LDI r14 3
+    STR r15 r14 7
+
+    ADI r15 8
+    LDI r14 2     // Edge 3 (vertices 2,0)
+    STR r15 r14 0
+    LDI r14 0
+    STR r15 r14 1
+    LDI r14 4     // Edge 4 (vertices 4,5)
+    STR r15 r14 2
+    LDI r14 5
+    STR r15 r14 3
+    LDI r14 5     // Edge 5 (vertices 5,7)
+    STR r15 r14 4
+    LDI r14 7
+    STR r15 r14 5
+    LDI r14 6     // Edge 6 (vertices 6,7)
+    STR r15 r14 6
+    LDI r14 7
+    STR r15 r14 7
 
-//
-//
-//
-//    ADI r15 8
-//    LDI r14 8     // Number of edges
-//    STR r15 r14 0
-//    LDI r14 0     // Edge 0 (vertices 0,1)
-//    STR r15 r14 1
-//    LDI r14 1
-//    STR r15 r14 2
-//    LDI r14 1     // Edge 1 (vertices 1,2)
-//    STR r15 r14 3
-//    LDI r14 2
-//    STR r15 r14 4
-//    LDI r14 2     // Edge 2 (vertices 2,3)
-//    STR r15 r14 5
-//    LDI r14 3
-//    STR r15 r14 6
-//    LDI r14 3     // Edge 3 (vertices 3,0)
-//    STR r15 r14 7
-//
-//    ADI r15 8
-//    LDI r14 0
-//    STR r15 r14 0
-//    LDI r14 0     // Edge 4 (vertices 0,4)
-//    STR r15 r14 1
-//    LDI r14 4
-//    STR r15 r14 2
-//    LDI r14 1     // Edge 5 (vertices 1,4)
-//    STR r15 r14 3
-//    LDI r14 4
-//    STR r15 r14 4
-//    LDI r14 2     // Edge 6 (vertices 2,4)
-//    STR r15 r14 5
-//    LDI r14 4
-//    STR r15 r14 6
-//    LDI r14 3     // Edge 7 (vertices 3,4)
-//    STR r15 r14 7
-//
-//    ADI r15 8
-//    LDI r14 4
-//    STR r15 r14 0
+    ADI r15 8
+    LDI r14 4     // Edge 7 (vertices 4,6)
+    STR r15 r14 0
+    LDI r14 6
+    STR r15 r14 1
+    LDI r14 0     // Edge 8 (vertices 0,4)
+    STR r15 r14 2
+    LDI r14 4
+    STR r15 r14 3
+    LDI r14 1     // Edge 9 (vertices 1,5)
+    STR r15 r14 4
+    LDI r14 5
+    STR r15 r14 5
+    LDI r14 2     // Edge 10 (vertices 2,6)
+    STR r15 r14 6
+    LDI r14 6
+    STR r15 r14 7
+
+    ADI r15 8
+    LDI r14 3     // Edge 11 (vertices 3,7)
+    STR r15 r14 0
+    LDI r14 7
+    STR r15 r14 1
     RET

From fb4190154d76ba83c47d5555e8524d8231f0b2f6 Mon Sep 17 00:00:00 2001
From: Dave <83719612+DaveJWalker@users.noreply.github.com>
Date: Sun, 15 Sep 2024 23:15:21 -0700
Subject: [PATCH 06/11] Added some comments.

---
 programs/wireframe.as | 39 +++++++++++++++++++++++----------------
 1 file changed, 23 insertions(+), 16 deletions(-)

diff --git a/programs/wireframe.as b/programs/wireframe.as
index ca42932..1bb6f40 100644
--- a/programs/wireframe.as
+++ b/programs/wireframe.as
@@ -1,18 +1,27 @@
 // Wireframe Demo by Dave Walker
 
-// A basic implementation of a CORDIC function operating in rotation mode.  A CORDIC can be
-// used to iteratively calculate sine and cosine of an angle. Due to the limitations of this
-// 8-bit computer, the CORDIC isn't particularly accurate.  A number of values in this code are
-// represented as fixed point representations.  Therefore, you'll see notations like u2.5 and
-// s1.6.  These notations denote signed/unsigned, the number of integer bits, and the number of
-// fractional bits.  For example, the sine/cosine outputs are all s1.6.
+// This program is an implementation of a 3D wireframe renderer on MattBatWing's BatPU-2
+// Minecraft computer. I created it because I wanted to try to reproduce the one Matt created
+// here:
 //
-// In addition to the CORDIC, a draw_line function is included based on Bresenham's Algorithm.
+// https://www.youtube.com/watch?v=hFRlnNci3Rs
 //
-// (Note: I'm not crazy with how I pass inputs to functions in this code using registers.  I think
-// I'd prefer to pass them via memory (i.e. a stack).  I'd also possibly like to dedicate a
-// register as a stack pointer.  I haven't written assembly in years so I'm not accustomed
-// to dealing with this stuff directly.  Perhaps I'll change it later... perhaps not.)
+// It began as a simple demo of a CORDIC function operating in rotation mode.  A CORDIC can be
+// used to iteratively calculate sine and cosine of an angle.  After the CORDIC, I "simply" had
+// to add Bresenham's line drawing algorithm, a 16-bit multiplier, 16-bit divider, a 3D rotation
+// function, a 3D-to-2D project function, and some other bits and bobs. :)
+//
+// It was fun to make, and I'm reasonably happy with it.  However, it's SLOW.  Way slower than
+// Matt's hardware implementation shown in the video above, which is totally expected.
+//
+// Note: The code is still pretty messy. I need to do a clean-up pass on it and make a few
+// more performance improvements (like moving the CORDIC function call outside of the vertice
+// loop).  I'm also considering modifying the CORDIC to use 16-bit math to improve its
+// accuracy, which should make the cube animation a bit smoother.  Also, I'm not crazy with
+// how I pass inputs to functions in this code using registers.  I think I'd prefer to pass
+// them via memory (i.e. a stack).  I'd also possibly like to dedicate a register as a
+// stack pointer.  I haven't written assembly in years so I'm not accustomed to dealing with
+// this stuff directly.  Perhaps I'll change it later... perhaps not.)
 
 
 // Memory mapped IO port mapping offsets
@@ -36,8 +45,6 @@ define rng_offset                  6
 define controller_input_offset     7
 
 // Various RAM addresses
-define x2_coord                    0
-define y2_coord                    1
 define register_stack_pointer      50
 define shape_vertices_edges_addr   100
 define projected_points_addr       150
@@ -90,7 +97,7 @@ STR r15 r0 buffer_chars_offset
 
 
 // Initialze rotation angle
-LDI r1 28
+LDI r1 0
 .main_loop
 
     // Point to 3D share vertice/edge table
@@ -216,6 +223,7 @@ LDI r1 28
 //    LDI r15 x2_coord
 //    STR r15 r3 x2_coord
 //    STR r15 r4 y2_coord
+    LOD r0 r1  0 // Load angle
 
     // Display current angle
     LDI r15 memory_mapped_io_addr
@@ -226,8 +234,7 @@ LDI r1 28
 //CAL .wait_for_user
 
     // Increment the angle and loop
-    LOD r0 r1  0 // Load angle
-    ADI r1 1
+    ADI r1 5
     LDI r14 201  // Ending angle
     CMP r1 r14
     BRH lt .main_loop

From b97953ad3e89ccc33745d85ac1c2c23cbcb22d87 Mon Sep 17 00:00:00 2001
From: Dave <83719612+DaveJWalker@users.noreply.github.com>
Date: Mon, 16 Sep 2024 00:26:29 -0700
Subject: [PATCH 07/11] Cleaned up main loop.

---
 programs/wireframe.as | 179 +++++++++++++++++++++---------------------
 1 file changed, 90 insertions(+), 89 deletions(-)

diff --git a/programs/wireframe.as b/programs/wireframe.as
index 1bb6f40..454eecd 100644
--- a/programs/wireframe.as
+++ b/programs/wireframe.as
@@ -1,18 +1,20 @@
 // Wireframe Demo by Dave Walker
 
-// This program is an implementation of a 3D wireframe renderer on MattBatWing's BatPU-2
-// Minecraft computer. I created it because I wanted to try to reproduce the one Matt created
-// here:
+// This program is an implementation of a 3D wireframe renderer running on MattBatWing's
+// BatPU-2 Minecraft computer. I created it because I wanted to try to reproduce the renderer
+// Matt created here:
 //
 // https://www.youtube.com/watch?v=hFRlnNci3Rs
 //
 // It began as a simple demo of a CORDIC function operating in rotation mode.  A CORDIC can be
-// used to iteratively calculate sine and cosine of an angle.  After the CORDIC, I "simply" had
-// to add Bresenham's line drawing algorithm, a 16-bit multiplier, 16-bit divider, a 3D rotation
-// function, a 3D-to-2D project function, and some other bits and bobs. :)
+// used to iteratively calculate sine and cosine of an angle.  After the CORDIC, I "only" had
+// to add Bresenham's line drawing algorithm, a 16-bit multiplier, a 16-bit divider, a 3D
+// rotation function, a 3D-to-2D projection function, and some other bits and bobs. :)
 //
-// It was fun to make, and I'm reasonably happy with it.  However, it's SLOW.  Way slower than
-// Matt's hardware implementation shown in the video above, which is totally expected.
+// It was fun to make, and I'm reasonably happy with it.  However, it's SLOW... way slower than
+// Matt's hardware implementation shown in the video above, which is totally expected.  I mean...
+// dedicated hardware is always going to be much faster than software running on a general
+// purpose processor.
 //
 // Note: The code is still pretty messy. I need to do a clean-up pass on it and make a few
 // more performance improvements (like moving the CORDIC function call outside of the vertice
@@ -44,10 +46,25 @@ define unsigned_mode_offset        5
 define rng_offset                  6
 define controller_input_offset     7
 
+// Shape Table Offsets
+define x_offset                    0
+define y_offset                    1
+define z_offset                    2
+define number_of_edges             0
+define edge_vertice_0              0
+define edge_vertice_1              1
+
+// Various addresses for storing values.  All are offset from r0.
+define rotation_angle              0
+define number_of_vertices          1
+define shape_table_pointer         2
+define projected_xy_pointer        3
+define edges_remaining             4
+
 // Various RAM addresses
 define register_stack_pointer      50
-define shape_vertices_edges_addr   100
-define projected_points_addr       150
+define projected_points_addr       100
+define shape_vertices_edges_addr   150
 define atan_LUT_strt_addr          232
 
 // Other constants
@@ -55,6 +72,8 @@ define y_axis                      0
 define x_axis                      1
 define z_axis                      2
 define focal_length                127
+define rotation_angle_increment    5
+define rotation_angle_max          201
 
 
 // Load the arctan LUT into RAM.
@@ -71,7 +90,7 @@ STR r15 r0 unsigned_mode_offset
 STR r15 r0 clear_chars_buffer_offset
 STR r15 r0 buffer_chars_offset
 
-// Write "3DROTATION"
+// Write "ROTATION"
 STR r15 r0 clear_chars_buffer_offset
 LDI r14 " "
 STR r15 r14 write_char_offset
@@ -100,7 +119,7 @@ STR r15 r0 buffer_chars_offset
 LDI r1 0
 .main_loop
 
-    // Point to 3D share vertice/edge table
+    // Point to 3D shape vertice/edge table
     LDI r15 shape_vertices_edges_addr
     LOD r15 r14 0   // Load number of vertices in r14
     INC r15         // And point to first vertice
@@ -111,136 +130,118 @@ LDI r1 0
     // Now loop through all of the 3D vertices in memory to
     // rotate and project them onto a 2D plane for display.
     .vertice_loop
-        LOD r15 r2 0
-        LOD r15 r3 1
-        LOD r15 r4 2
-        ADI r15 3
-        LDI r5 y_axis      // Rotation axis
-
-// TODO define all of these offsets as constants
-        STR r0 r1  0 // Store angle in RAM
-        STR r0 r14 1 // Store number of vertices RAM
-        STR r0 r15 2 // Store vertice pointer in RAM
-        STR r0 r13 3 // Store the projected points pointer in RAM
+        // Load the 3D x,y,z coordinates from RAM
+        LOD r15 r2 x_offset
+        LOD r15 r3 y_offset
+        LOD r15 r4 z_offset
+        ADI r15 3           // Point to the next set of coordinates
+        LDI r5 y_axis       // Set the rotation axis
+
+        // Push variables into RAM
+        STR r0 r1  rotation_angle
+        STR r0 r14 number_of_vertices
+        STR r0 r15 shape_table_pointer
+        STR r0 r13 projected_xy_pointer
 
         // TODO: Change rotation function so it doesn't call CORDIC
         // i.e. move CORDIC call outside of vertice loop since I don't
-        // need to calculate it again each time
+        // need to recalculate it for each vertice.  Doing so should
+        // significantly improve performance.
         CAL .rotation
         LDI r7 focal_length
         CAL .pixel_projection
+        // Center the projected points on the screen
         ADI r1 16
         ADI r2 16
 
         // Store the projected points in RAM
-        LDI r12 0
-        LOD r12 r13 3
-        STR r13 r1 0
-        STR r13 r2 1
+        LOD r0 r13 projected_xy_pointer
+        STR r13 r1 x_offset
+        STR r13 r2 y_offset
         ADI r13 2
-        STR r12 r13 3
-
-        // TODO get rid of this...
-//        LDI r15 memory_mapped_io_addr
-//        STR r15 r1 pixel_x_offset
-//        STR r15 r2 pixel_y_offset
-//        STR r15 r0 draw_pixel_offset
-//        STR r15 r0 buffer_screen_offset
+        STR r0 r13 projected_xy_pointer
 
-        LDI r12 0
-        LOD r12 r1  0 // Recall angle from RAM
-        LOD r12 r14 1 // Restore number of vertices from RAM
-        LOD r12 r15 2 // Restore vertice pointer from RAM
+        // Recall variables from RAM
+        LOD r0 r1   rotation_angle
+        LOD r0 r14  number_of_vertices
+        LOD r0 r15  shape_table_pointer
 
-        DEC r14
+        DEC r14   // Decrement the vertice counter and loop if more exist
         BRH nz .vertice_loop
 
-    // Now loop through all shape edges to draw lines
-    // First grab the number of edges
+    // After calculating projected xy for all vertices, loop through all shape edges to
+    // draw lines between them.  Each edge is defined as a pair of vertices.
     // r15 - pointer to shape table
-    // r6:r5 - vertice pair for each edge
+    // r14 - Number of shape edges
     // r12 - pointer to projected points table
     // r13 - number of vertices
-    // r11 - number of projected points left
-    // r10 - Index into projected points table
-    // r14 - Number of shape edges
-    LOD r15 r14 0
+    // r11 - number of projected points left for the current edge
+    // r10 - Projected points table index
+    // r6:r5 - vertice pair for each edge
+
+    // First grab the number of edges from the shape table
+    LOD r15 r14 number_of_edges
     INC r15
     .edge_loop
 
-//CAL .wait_for_user
         // Load vertice pair index for edge
-        LOD r15 r5 0
-        LOD r15 r6 1
+        LOD r15 r5 edge_vertice_0
+        LOD r15 r6 edge_vertice_1
+        // And point to the next edge vertice pair
         ADI r15 2
-        LOD r0 r13 1 // Get number of vertices
-        STR r0 r15 2
-        STR r0 r14 7
-        LDI r11 2    // Keep track of number of projected points to grab
-        LDI r10 0
+        STR r0 r15 shape_table_pointer
+        STR r0 r14 edges_remaining
         LDI r12 projected_points_addr
+        LDI r11 2     // Number of projected xy points to grab for each edge
+        LDI r10 0     // Projected xy table index
         .get_projected_xy
+            // Check both edge vertices against the projected xy table index
             CMP r5 r10
             BRH eq .store_projected_x0y0
             CMP r6 r10
             BRH eq .store_projected_x1y1
             JMP .next_projected_xy
             .store_projected_x0y0
-                LOD r12 r1 0
-                LOD r12 r2 1
+                LOD r12 r1 x_offset
+                LOD r12 r2 y_offset
                 DEC r11
                 JMP .next_projected_xy
             .store_projected_x1y1
-                LOD r12 r3 0
-                LOD r12 r4 1
+                LOD r12 r3 x_offset
+                LOD r12 r4 y_offset
                 DEC r11
             .next_projected_xy
-            ADI r12 2
-            INC r10
-            CMP r11 r0
+            ADI r12 2   // Point to the next project xy point
+            INC r10     // Increment the table index
+            CMP r11 r0  // ... and check if done
             BRH nz .get_projected_xy
 
-            CAL .draw_line
-//          LDI r12 0
-            LOD r0 r15 2  // TODO... everywhere I use zero as an address needs r0
-            LOD r0 r14 7
+        // Draw a line between the two projected xy points
+        CAL .draw_line
+
+        LOD r0 r15 shape_table_pointer
+        LOD r0 r14 edges_remaining
         DEC r14
         BRH nz .edge_loop
 
-        LDI r12 0
-        LOD r12 r15 2
-
-
-//    // Push the r13 angle value to RAM since it gets modified inside the draw_line function
-//    LDI r15 register_stack_pointer
-//    STR r15 r13
-//    CAL .draw_line
-//    // And pop it back off when finished
-//    LDI r15 register_stack_pointer
-//    LOD r15 r13
-
-//    // Store the x1/y1 coordinates to RAM so they can be x2/y2 next iteration
-//    LDI r15 x2_coord
-//    STR r15 r3 x2_coord
-//    STR r15 r4 y2_coord
-    LOD r0 r1  0 // Load angle
+    // Load the rotation angle from RAM
+    LOD r0 r1  rotation_angle
 
     // Display current angle
     LDI r15 memory_mapped_io_addr
     STR r15 r1 show_number_offset
+
+    // Update the screen
     STR r15 r0 buffer_screen_offset
     STR r15 r0 clear_screen_buffer_offset
 
-//CAL .wait_for_user
-
     // Increment the angle and loop
-    ADI r1 5
-    LDI r14 201  // Ending angle
+    ADI r1  rotation_angle_increment
+    LDI r14 rotation_angle_max
     CMP r1 r14
     BRH lt .main_loop
     SUB r1 r14 r1
     JMP .main_loop
-HLT
 
 
 .wait_for_user

From 161608f689c77b50b4728d91194fc7dbed5f1656 Mon Sep 17 00:00:00 2001
From: Dave <83719612+DaveJWalker@users.noreply.github.com>
Date: Mon, 16 Sep 2024 01:55:16 -0700
Subject: [PATCH 08/11] Moved CORDIC function call outside of vertice loop.

Moving the CORDIC improves performance but not nearly as much as I expected.  Most of the time is spent in the multiply and divide functions I suppose.
---
 programs/wireframe.as | 97 ++++++++++++++++++++++++++-----------------
 1 file changed, 58 insertions(+), 39 deletions(-)

diff --git a/programs/wireframe.as b/programs/wireframe.as
index 454eecd..20b78a0 100644
--- a/programs/wireframe.as
+++ b/programs/wireframe.as
@@ -60,6 +60,8 @@ define number_of_vertices          1
 define shape_table_pointer         2
 define projected_xy_pointer        3
 define edges_remaining             4
+define cosine                      5
+define sine                        6
 
 // Various RAM addresses
 define register_stack_pointer      50
@@ -72,7 +74,7 @@ define y_axis                      0
 define x_axis                      1
 define z_axis                      2
 define focal_length                127
-define rotation_angle_increment    5
+define rotation_angle_increment    1
 define rotation_angle_max          201
 
 
@@ -115,9 +117,20 @@ STR r15 r14 write_char_offset
 STR r15 r0 buffer_chars_offset
 
 
-// Initialze rotation angle
+// Initialize rotation angle and store it in RAM
 LDI r1 0
+
 .main_loop
+    // Save the rotation angle at the start of each loop (since the
+    // CORDIC function modifies it)
+    STR r0 r1 rotation_angle
+
+    // Call the CORDIC function to calculate sine and cosine for the
+    // rotation angle
+    CAL .cordic
+    LOD r0 r1 rotation_angle
+    STR r0 r2 cosine
+    STR r0 r3 sine
 
     // Point to 3D shape vertice/edge table
     LDI r15 shape_vertices_edges_addr
@@ -131,11 +144,11 @@ LDI r1 0
     // rotate and project them onto a 2D plane for display.
     .vertice_loop
         // Load the 3D x,y,z coordinates from RAM
-        LOD r15 r2 x_offset
-        LOD r15 r3 y_offset
-        LOD r15 r4 z_offset
+        LOD r15 r3 x_offset
+        LOD r15 r4 y_offset
+        LOD r15 r5 z_offset
         ADI r15 3           // Point to the next set of coordinates
-        LDI r5 y_axis       // Set the rotation axis
+        LDI r6 y_axis       // Set the rotation axis
 
         // Push variables into RAM
         STR r0 r1  rotation_angle
@@ -143,10 +156,9 @@ LDI r1 0
         STR r0 r15 shape_table_pointer
         STR r0 r13 projected_xy_pointer
 
-        // TODO: Change rotation function so it doesn't call CORDIC
-        // i.e. move CORDIC call outside of vertice loop since I don't
-        // need to recalculate it for each vertice.  Doing so should
-        // significantly improve performance.
+        LOD r0 r1 cosine
+        LOD r0 r2 sine
+
         CAL .rotation
         LDI r7 focal_length
         CAL .pixel_projection
@@ -211,7 +223,7 @@ LDI r1 0
                 LOD r12 r4 y_offset
                 DEC r11
             .next_projected_xy
-            ADI r12 2   // Point to the next project xy point
+            ADI r12 2   // Point to the next projected xy point
             INC r10     // Increment the table index
             CMP r11 r0  // ... and check if done
             BRH nz .get_projected_xy
@@ -262,8 +274,8 @@ LDI r1 0
 //  r1  = angle in radians (fixed point in the form u2.5)
 //        (Values between 0 and 2*pi are supported.)
 // Outputs:
-//  r2  = sine  (r1)
-//  r3  = cosine(r1)
+//  r2  = cosine(r1)
+//  r3  = sine  (r1)
 // Register usage
 //  r1  - angle in radians (s1.6)
 //  r2  - x (s1.6)
@@ -881,12 +893,12 @@ LDI r1 0
 //    2 = [y,z]
 //
 // Inputs:
-//    r1  = rotation angle in radians (fixed point in the form u2.5)
-//          (Values between 0 and 2*pi are supported.)  // TODO... make input and outputs consistent
-//    r2 = x
-//    r3 = y
-//    r4 = z
-//    r5 = rotation axis
+//    r1 = cosine(A) (fixed point in the form s1.6)
+//    r2 = sine  (A) (fixed point in the form s1.6)
+//    r3 = x
+//    r4 = y
+//    r5 = z
+//    r6 = rotation axis
 //
 // Outputs:
 //    r2:r1 = rotated_x
@@ -938,35 +950,42 @@ LDI r1 0
     ADI r14 8
 
     // Organize input coordinates in RAM according to the
-    // rotation axis selection input (r5)
-    STR r15 r5 rot_axis_sel
-    LDI r6 x_axis
-    CMP r5 r6
+    // rotation axis selection input (r6)
+    STR r15 r6 rot_axis_sel
+    LDI r7 x_axis
+    CMP r6 r7
     BRH z .go_x_axis_sel
-    LDI r6 z_axis
-    CMP r5 r6
+    LDI r7 z_axis
+    CMP r6 r7
     BRH z .go_z_axis_sel
     .go_y_axis_sel  // Default
-        STR r15 r2 rot_coord1
-        STR r15 r4 rot_coord2
-        STR r15 r3 rot_coord3
-        JMP .calc_cordic
-    .go_z_axis_sel
-        STR r15 r2 rot_coord1
-        STR r15 r3 rot_coord2
+        STR r15 r3 rot_coord1
+        STR r15 r5 rot_coord2
         STR r15 r4 rot_coord3
         JMP .calc_cordic
-    .go_x_axis_sel
+    .go_z_axis_sel
         STR r15 r3 rot_coord1
         STR r15 r4 rot_coord2
-        STR r15 r2 rot_coord3
+        STR r15 r5 rot_coord3
+        JMP .calc_cordic
+    .go_x_axis_sel
+        STR r15 r4 rot_coord1
+        STR r15 r5 rot_coord2
+        STR r15 r3 rot_coord3
 
     .calc_cordic
-    // Call the CORDIC function to calculate sine and cosine for the
-    // rotation angle
-    CAL .cordic
-
-    // Convert them to 16-bit values for the multiplications below.
+    // Move the cosine/sine inputs into the registers that the cordic
+    // function (previously called here) returned them in, so the code
+    // below doesn't need its registers juggled around
+    MOV r2 r3
+    MOV r1 r2
+    // NOTE: This call was moved outside of the main vertice loop to
+    // improve performance.
+//    // Call the CORDIC function to calculate sine and cosine for the
+//    // rotation angle
+//    CAL .cordic
+
+    // Convert sine and cosine to 16-bit values for the multiplications below.
     LDI r12 128
     LDI r1 0
     AND r12 r2 r0     // Check sign bit

From eb36339813cf44311e07af352655a2cb9fcbc794 Mon Sep 17 00:00:00 2001
From: Dave <83719612+DaveJWalker@users.noreply.github.com>
Date: Mon, 16 Sep 2024 01:58:00 -0700
Subject: [PATCH 09/11] Set angle increment back to 5.

---
 programs/wireframe.as | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/programs/wireframe.as b/programs/wireframe.as
index 20b78a0..d2c5d62 100644
--- a/programs/wireframe.as
+++ b/programs/wireframe.as
@@ -74,7 +74,7 @@ define y_axis                      0
 define x_axis                      1
 define z_axis                      2
 define focal_length                127
-define rotation_angle_increment    1
+define rotation_angle_increment    5
 define rotation_angle_max          201
 
 
From 2df03b1635a60fa1d26effbbefefd195765b121e Mon Sep 17 00:00:00 2001
From: Dave <83719612+DaveJWalker@users.noreply.github.com>
Date: Mon, 16 Sep 2024 02:49:23 -0700
Subject: [PATCH 10/11] More comment changes.

---
 programs/wireframe.as | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/programs/wireframe.as b/programs/wireframe.as
index d2c5d62..fa1a3b4 100644
--- a/programs/wireframe.as
+++ b/programs/wireframe.as
@@ -15,15 +15,6 @@
 // Matt's hardware implementation shown in the video above, which is totally expected.  I mean...
 // dedicated hardware is always going to be much faster than software running on a general
 // purpose processor.
-//
-// Note: The code is still pretty messy. I need to do a clean-up pass on it and make a few
-// more performance improvements (like moving the CORDIC function call outside of the vertice
-// loop).  I'm also considering modifying the CORDIC to use 16-bit math to improve its
-// accuracy, which should make the cube animation a bit smoother.  Also, I'm not crazy with
-// how I pass inputs to functions in this code using registers.  I think I'd prefer to pass
-// them via memory (i.e. a stack).  I'd also possibly like to dedicate a register as a
-// stack pointer.  I haven't written assembly in years so I'm not accustomed to dealing with
-// this stuff directly.  Perhaps I'll change it later... perhaps not.)
 
 
 // Memory mapped IO port mapping offsets

From 80b42eaa1e74f9178323e133473dd72a7141b492 Mon Sep 17 00:00:00 2001
From: Dave <83719612+DaveJWalker@users.noreply.github.com>
Date: Mon, 16 Sep 2024 12:33:10 -0700
Subject: [PATCH 11/11] Fixed x_axis rotation bug.

---
 programs/wireframe.as | 115 ++++++++++++++++++++++++------------------
 1 file changed, 66 insertions(+), 49 deletions(-)

diff --git a/programs/wireframe.as b/programs/wireframe.as
index fa1a3b4..e35489f 100644
--- a/programs/wireframe.as
+++ b/programs/wireframe.as
@@ -139,7 +139,7 @@ LDI r1 0
         LOD r15 r4 y_offset
         LOD r15 r5 z_offset
         ADI r15 3           // Point to the next set of coordinates
-        LDI r6 y_axis       // Set the rotation axis
+        LDI r6 x_axis       // Set the rotation axis
 
         // Push variables into RAM
         STR r0 r1  rotation_angle
@@ -147,8 +147,8 @@ LDI r1 0
         STR r0 r15 shape_table_pointer
         STR r0 r13 projected_xy_pointer
 
-        LOD r0 r1 cosine
-        LOD r0 r2 sine
+        LOD r0 r1  cosine
+        LOD r0 r2  sine
 
         CAL .rotation
         LDI r7 focal_length
@@ -165,9 +165,9 @@ LDI r1 0
         STR r0 r13 projected_xy_pointer
 
         // Recall variables from RAM
-        LOD r0 r1   rotation_angle
-        LOD r0 r14  number_of_vertices
-        LOD r0 r15  shape_table_pointer
+        LOD r0 r1  rotation_angle
+        LOD r0 r14 number_of_vertices
+        LOD r0 r15 shape_table_pointer
 
         DEC r14   // Decrement the vertice counter and loop if more exist
         BRH nz .vertice_loop
@@ -778,6 +778,9 @@ LDI r1 0
 
 .pixel_projection
 // The following function is used to project a 3D point in space onto a 2D plane.
+// It utilizes "weak" perspective projection as described below:
+//
+//    https://en.wikipedia.org/wiki/3D_projection
 //
 // Given a 3D coordinate [x,y,z], it'll return x_projected and y_projected by
 // calculating the following:
@@ -862,26 +865,38 @@ LDI r1 0
 
     // Move x_projected and y_projected to output registers
     MOV r1 r2                   // y_projected -> r2
-    LOD r15 r1 pp_x_projected  // x_projected -> r1
+    LOD r15 r1 pp_x_projected   // x_projected -> r1
     RET
 
 
 .rotation
-// This function applies the following rotation matrix to
-// a set of two coordinates:
+// This function rotates a 3D [x,y,z] coordinate around a
+// selected axis of rotation.  It does so by multiplying the
+// two off-axis coordinates by one of the 2x2 matrices below.
+//
+//           +-                 -+
+//           |  cos(A)  -sin(A)  |
+//      Rx = |                   |
+//           |  sin(A)   cos(A)  |
+//           +-                 -+
+//
+//           +-                 -+
+//           |  cos(A)   sin(A)  |
+//      Ry = |                   |
+//           | -sin(A)   cos(A)  |
+//           +-                 -+
 //
-//           +-               -+
-//           | cos(A)  -sin(A) |
-//      Rz = |                 |
-//           | sin(A)   cos(A) |
-//           +-               -+
+//           +-                 -+
+//           |  cos(A)  -sin(A)  |
+//      Rz = |                   |
+//           |  sin(A)   cos(A)  |
+//           +-                 -+
 //
-// Given a 3D coordinate (composed of xyz values), the
-// coordinate pair utilized can be selected based on the
-// coordinate pair selection input as follows:
-//    0 = [x,y]
-//    1 = [x,z]
-//    2 = [y,z]
+// In all cases, the coordinate for the axis of rotation
+// does not change.  Refer to the following Wikipedia
+// article for details:
+//
+// https://en.wikipedia.org/wiki/Rotation_matrix
 //
 // Inputs:
 //    r1 = cosine(A) (fixed point in the form s1.6)
@@ -905,9 +920,9 @@ LDI r1 0
     define rot_coord2          1 // 2nd rotation coordinate
     define rot_coord3          2 // 3rd rotation coordinate (fixed)
     define rot_axis_sel        3 // rotation axis
-    define rot_cosine_high     4 // Cosine
+    define rot_cosine_high     4 // cosine
     define rot_cosine_low      5
-    define rot_sine_high       6 // Sine
+    define rot_sine_high       6 // sine
     define rot_sine_low        7
 
     define coord1_x_cosine_low  -1
@@ -933,7 +948,7 @@ LDI r1 0
     define rot_coord3_high   2
     define rot_coord3_low    3
 
-    // Set up pointer for temporary storage
+    // Set up pointers for temporary storage
     LDI r15 register_stack_pointer
     ADI r15 8
     MOV r15 r14
@@ -953,18 +968,18 @@ LDI r1 0
         STR r15 r3 rot_coord1
         STR r15 r5 rot_coord2
         STR r15 r4 rot_coord3
-        JMP .calc_cordic
+        JMP .adjst_trig
     .go_z_axis_sel
         STR r15 r3 rot_coord1
         STR r15 r4 rot_coord2
         STR r15 r5 rot_coord3
-        JMP .calc_cordic
+        JMP .adjst_trig
     .go_x_axis_sel
         STR r15 r4 rot_coord1
         STR r15 r5 rot_coord2
         STR r15 r3 rot_coord3
 
-    .calc_cordic
+    .adjst_trig
     // Move cosine/sine inputs to match output of cordic function
     // previous called below (so I don't have to juggle around a bunch
     // of registers)
@@ -1026,14 +1041,15 @@ LDI r1 0
     STR r15 r4 coord2_x_cosine_low
     STR r15 r5 coord2_x_cosine_high
 
+
     // Calculate coord1_rotation = coord1*cosine(A) +/- coord2*sine(A)
     LOD r15 r3 coord1_x_cosine_low
     LOD r15 r4 coord1_x_cosine_high
     LOD r15 r5 coord2_x_sine_low
     LOD r15 r6 coord2_x_sine_high
 
-    // Perform 16-bit addition/subtraction (emulating SBB)
-    // Addition/subtraction depends on the axis of rotation TODO better comments
+    // Perform 16-bit addition/subtraction, depending on the axis
+    // of rotation.
     LOD r15 r8 rot_axis_sel   // Restore axis selection
     LDI r9 y_axis
     CMP r8 r9
@@ -1071,12 +1087,13 @@ LDI r1 0
     LOD r15 r5 coord2_x_cosine_low
     LOD r15 r6 coord2_x_cosine_high
 
-    // Perform 16-bit addition/subtraction TODO better comments
+    // Perform 16-bit subtraction for rotation around the y axis,
+    // or 16-bit addition for rotation around the x or z axis.
     LOD r15 r8 rot_axis_sel   // Restore axis selection
     LDI r9 y_axis
     CMP r8 r9
     BRH z .coord2_sub
-    .coord2_add     // For rotations around axis x & z, adding is performed
+    .coord2_add     // For rotations around axis x & z, addition is performed
         LDI r7 0
         ADD r3 r5 r3      // Add low bytes
         BRH nc .roty_no_carry
@@ -1093,8 +1110,8 @@ LDI r1 0
         .roty_no_borrow
         SUB r6 r4 r4      // Subtract high bytes
         SUB r4 r7 r4      // Handle borrow
-    .coord2_round
 
+    .coord2_round
     // Move and round the results
     MOV r3 r1
     MOV r4 r2
@@ -1115,21 +1132,13 @@ LDI r1 0
 
     // Organize outputs coordinates based on the rotation axis selection.
     LOD r15 r5 rot_axis_sel   // Restore axis selection
-    LDI r6 y_axis
-    CMP r5 r6
-    BRH z .rstr_xz_sel
     LDI r6 x_axis
     CMP r5 r6
-    BRH z .rstr_yz_sel
-    .rstr_x_axis // Default selection
-        LOD r14 r2 rot_coord1_high
-        LOD r14 r1 rot_coord1_low
-        LOD r14 r4 rot_coord2_high
-        LOD r14 r3 rot_coord2_low
-        LOD r14 r6 rot_coord3_high
-        LOD r14 r5 rot_coord3_low
-        JMP .rot_exit
-    .rstr_xz_sel
+    BRH z .rstr_x_axis
+    LDI r6 z_axis
+    CMP r5 r6
+    BRH z .rstr_z_axis
+    .rstr_y_axis  // Default selection
         LOD r14 r2 rot_coord1_high
         LOD r14 r1 rot_coord1_low
         LOD r14 r4 rot_coord3_high
@@ -1137,13 +1146,21 @@ LDI r1 0
         LOD r14 r6 rot_coord2_high
         LOD r14 r5 rot_coord2_low
         JMP .rot_exit
-    .rstr_yz_sel
-        LOD r14 r2 rot_coord2_high
-        LOD r14 r1 rot_coord2_low
-        LOD r14 r6 rot_coord3_high
-        LOD r14 r5 rot_coord3_low
+    .rstr_x_axis
+        LOD r14 r2 rot_coord3_high
+        LOD r14 r1 rot_coord3_low
         LOD r14 r4 rot_coord1_high
         LOD r14 r3 rot_coord1_low
+        LOD r14 r6 rot_coord2_high
+        LOD r14 r5 rot_coord2_low
+        JMP .rot_exit
+    .rstr_z_axis
+        LOD r14 r2 rot_coord1_high
+        LOD r14 r1 rot_coord1_low
+        LOD r14 r4 rot_coord2_high
+        LOD r14 r3 rot_coord2_low
+        LOD r14 r6 rot_coord3_high
+        LOD r14 r5 rot_coord3_low
     .rot_exit
     RET