Multithread SDL renderer
diff --git a/CMakeLists.txt b/CMakeLists.txt
index fe0e4cd..8fc9b17 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3,6 +3,7 @@
 project(6502 VERSION 0.1.0 LANGUAGES C)
 
 option(GEN_INSTRUCTIONS_HEADER ON)
+option(NO_PTHREAD "Render the SDL screen inside the CPU loop instead of on a separate pthread" OFF)
 
 include_directories(nuklear)
 
@@ -14,5 +15,11 @@
 		WORKING_DIRECTORY ${CMAKE_SOURCE_DIR})
 endif()
 
+if (NO_PTHREAD) # bare variable name: if() dereferences it itself
+	add_definitions(-DNO_PTHREAD)
+else()
+	set(THREAD pthread)
+endif()
+
 add_executable(6502 main.c cpu.c cpu.h dbg.c dbg.h instructions.h gui.h gui.c screen.h screen.c)
-target_link_libraries(6502 readline SDL2 GL GLU GLEW m)
+target_link_libraries(6502 readline SDL2 GL GLU GLEW m ${THREAD})
diff --git a/README.md b/README.md
index eb48cbf..3f34714 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,6 @@
 # 6502 Toolchain
 
 <center>
-**Click for demonstration:**
-
 [![Screenshot](screenshot.png)](colors.webm)
 </center>
 
diff --git a/cpu.c b/cpu.c
index cfcd89d..6e56134 100644
--- a/cpu.c
+++ b/cpu.c
@@ -33,6 +33,7 @@
 	cpu.pc = 0x600; // arbitrary program counter start
 	cpu.running = true;
 	cpu.mem = malloc(0xFFFF);
+	cpu.screen_dirty = true;
 	memset(cpu.mem, 0, 0xFFFF);
 
 	if (!cpu.mem)
@@ -95,36 +96,36 @@
 }
 
 // rotate right
-inline uint8_t ror(uint8_t a, uint8_t n)
+uint8_t ror(uint8_t a, uint8_t n)
 {
 	return (a >> n) | (a << (8 - n));
 }
 
 // rotate left
-inline uint8_t rol(uint8_t a, uint8_t n)
+uint8_t rol(uint8_t a, uint8_t n)
 {
 	return (a << n) | (a >> (8 - n));
 }
 
-inline void stat_nz(cpu_t *cpu, int8_t v)
+void stat_nz(cpu_t *cpu, int8_t v)
 {
 	cpu->status.negative = v < 0;
 	cpu->status.zero = v == 0;
 }
 
 // Used to check for overflow, is c unique?
-inline bool last_unique(bool a, bool b, bool c)
+bool last_unique(bool a, bool b, bool c)
 {
 	return a == b && a != c;
 }
 
-inline void stat_cv(cpu_t *cpu, uint8_t a, uint8_t b, uint8_t c)
+void stat_cv(cpu_t *cpu, uint8_t a, uint8_t b, uint8_t c)
 {
 	cpu->status.overflow = last_unique(a >> 7, b >> 7, c >> 7);
 	cpu->status.carry = c < a || c < b;
 }
 
-inline void cmp(cpu_t *cpu, uint8_t reg, uint8_t mem)
+void cmp(cpu_t *cpu, uint8_t reg, uint8_t mem)
 {
 	cpu->status.negative = 0;
 	cpu->status.zero = 0;
@@ -144,6 +145,14 @@
 	}
 }
 
+uint16_t scr_dirty(cpu_t *cpu, uint16_t mem) // flags the screen dirty for in-window addresses; returns mem unchanged
+{
+	if (mem >= 0x200 && mem < 0x200 + 32 * 32) // window is 32 * 32 bytes from 0x200; end is exclusive
+		cpu->screen_dirty = true;
+
+	return mem;
+}
+
 void execute(cpu_t *cpu, const char *mnemonic, uint8_t op, arg_t a, uint8_t am)
 {
 	// used to save space
@@ -164,7 +173,7 @@
 
 		#define R(reg) \
 			case ST##reg: \
-				cpu->mem[a.ptr] = cpu->regs[reg]; \
+				cpu->mem[scr_dirty(cpu, a.ptr)] = cpu->regs[reg]; \
 				break; \
 
 			REGS
@@ -194,8 +203,8 @@
 		}
 
 		case INC:
-			cpu->mem[a.ptr]++;
-			stat_nz(cpu, cpu->mem[a.ptr]);
+			cpu->mem[scr_dirty(cpu, a.ptr)]++; // one scr_dirty call is enough: it returns its argument
+			stat_nz(cpu, cpu->mem[scr_dirty(cpu, a.ptr)]);
 			break;
 
 		case INX:
@@ -209,8 +218,8 @@
 			break;
 
 		case DEC:
-			cpu->mem[a.ptr]--;
-			stat_nz(cpu, cpu->mem[a.ptr]);
+			cpu->mem[scr_dirty(cpu, a.ptr)]--;
+			stat_nz(cpu, cpu->mem[scr_dirty(cpu, a.ptr)]);
 			break;
 
 		case DEX:
@@ -235,9 +244,9 @@
 			}
 			else
 			{
-				cpu->status.carry = cpu->mem[a.val] >> 7;
-				cpu->mem[a.ptr] <<= 1;
-				stat_nz(cpu, cpu->mem[a.ptr]);
+				cpu->status.carry = cpu->mem[scr_dirty(cpu, a.val)] >> 7;
+				cpu->mem[scr_dirty(cpu, a.ptr)] <<= 1;
+				stat_nz(cpu, cpu->mem[scr_dirty(cpu, a.ptr)]);
 			}
 			break;
 
@@ -250,9 +259,9 @@
 			}
 			else
 			{
-				cpu->status.carry = cpu->mem[a.val] & 7;
-				cpu->mem[a.ptr] >>= 1;
-				stat_nz(cpu, cpu->mem[a.ptr]);
+				cpu->status.carry = cpu->mem[scr_dirty(cpu, a.val)] & 1; // LSR moves bit 0 into carry
+				cpu->mem[scr_dirty(cpu, a.ptr)] >>= 1;
+				stat_nz(cpu, cpu->mem[scr_dirty(cpu, a.ptr)]);
 			}
 			break;
 
@@ -265,9 +274,9 @@
 			}
 			else
 			{
-				cpu->status.carry = cpu->mem[a.val] >> 7;
-				cpu->mem[a.ptr] = rol(a.val, 1);
-				stat_nz(cpu, cpu->mem[a.ptr]);
+				cpu->status.carry = cpu->mem[scr_dirty(cpu, a.val)] >> 7;
+				cpu->mem[scr_dirty(cpu, a.ptr)] = rol(a.val, 1);
+				stat_nz(cpu, cpu->mem[scr_dirty(cpu, a.ptr)]);
 			}
 			break;
 
@@ -280,9 +289,9 @@
 			}
 			else
 			{
-				cpu->status.carry = cpu->mem[a.val] & 1;
-				cpu->mem[a.ptr] = ror(a.val, 1);
-				stat_nz(cpu, cpu->mem[a.ptr]);
+				cpu->status.carry = cpu->mem[scr_dirty(cpu, a.val)] & 1;
+				cpu->mem[scr_dirty(cpu, a.ptr)] = ror(a.val, 1);
+				stat_nz(cpu, cpu->mem[scr_dirty(cpu, a.ptr)]);
 			}
 			break;
 
@@ -411,19 +420,19 @@
 	#undef REGS
 }
 
-inline uint16_t fetch_le(cpu_t *cpu)
+uint16_t fetch_le(cpu_t *cpu)
 {
-	uint8_t a = cpu->mem[cpu->pc++];
-	uint8_t b = cpu->mem[cpu->pc++];
+	uint8_t a = cpu->mem[scr_dirty(cpu, cpu->pc++)];
+	uint8_t b = cpu->mem[scr_dirty(cpu, cpu->pc++)];
 	return le_to_native(a, b);
 }
 
-inline arg_t arg_imm(uint16_t a)
+arg_t arg_imm(uint16_t a)
 {
 	return (arg_t){ a, a };
 }
 
-inline arg_t arg_ptr(cpu_t *c, uint flags, uint16_t p)
+arg_t arg_ptr(cpu_t *c, uint flags, uint16_t p)
 {
 	if (flags & FETCH_NO_INDIRECTION)
 		return arg_imm(p);
@@ -431,7 +440,7 @@
 	return (arg_t){ c->mem[p], p };
 }
 
-inline arg_t arg(uint16_t v, uint16_t a)
+arg_t arg(uint16_t v, uint16_t a)
 {
 	return (arg_t){ v, a };
 }
@@ -447,7 +456,7 @@
 		// In both cases return immediate 8 bit value
 		case AM_IMM:
 		case AM_ZP:
-			return arg_imm(cpu->mem[cpu->pc++]);
+			return arg_imm(cpu->mem[scr_dirty(cpu, cpu->pc++)]);
 
 		case AM_ABS:
 			return arg_ptr(cpu, f, fetch_le(cpu));
@@ -459,7 +468,7 @@
 			// I have discovered this through testing the output of other
 			// assemblers.
 			uint16_t pc = cpu->pc + 1;
-			return arg_ptr(cpu, f, (int8_t)cpu->mem[cpu->pc++] + pc);
+			return arg_ptr(cpu, f, (int8_t)cpu->mem[scr_dirty(cpu, cpu->pc++)] + pc);
 		}
 
 		case AM_IND:
@@ -469,8 +478,8 @@
 			if (f & FETCH_NO_INDIRECTION)
 				return arg_imm(addr);
 
-			uint8_t low = cpu->mem[addr],
-				high = cpu->mem[addr + 1];
+			uint8_t low = cpu->mem[scr_dirty(cpu, addr)],
+				high = cpu->mem[scr_dirty(cpu, addr + 1)];
 
 			return arg_ptr(cpu, f, le_to_native(low, high));
 		}
@@ -489,34 +498,34 @@
 
 		case AM_ZPX:
 			if (f & FETCH_NO_INDIRECTION)
-				return arg_ptr(cpu, f, cpu->mem[cpu->pc++]);
-			return arg_ptr(cpu, f, cpu->mem[cpu->pc++] + cpu->regs[X]);
+				return arg_ptr(cpu, f, cpu->mem[scr_dirty(cpu, cpu->pc++)]);
+			return arg_ptr(cpu, f, cpu->mem[scr_dirty(cpu, cpu->pc++)] + cpu->regs[X]);
 
 		case AM_ZPY:
 			if (f & FETCH_NO_INDIRECTION)
-				return arg_ptr(cpu, f, cpu->mem[cpu->pc++]);
-			return arg_ptr(cpu, f, cpu->mem[cpu->pc++] + cpu->regs[Y]);
+				return arg_ptr(cpu, f, cpu->mem[scr_dirty(cpu, cpu->pc++)]);
+			return arg_ptr(cpu, f, cpu->mem[scr_dirty(cpu, cpu->pc++)] + cpu->regs[Y]);
 
 		case AM_ZIX:
 		{
-			uint8_t zp = cpu->mem[cpu->pc++];
+			uint8_t zp = cpu->mem[scr_dirty(cpu, cpu->pc++)];
 
 			if (f & FETCH_NO_INDIRECTION)
 				return arg_imm(zp);
 
 			uint16_t addr = zp + cpu->regs[X];
-			uint16_t indirect = le_to_native(cpu->mem[addr], cpu->mem[addr + 1]);
+			uint16_t indirect = le_to_native(cpu->mem[scr_dirty(cpu, addr)], cpu->mem[scr_dirty(cpu, addr + 1)]);
 			return arg_ptr(cpu, f, indirect);
 		}
 
 		case AM_ZIY:
 		{
-			uint8_t zp = cpu->mem[cpu->pc++];
+			uint8_t zp = cpu->mem[scr_dirty(cpu, cpu->pc++)];
 
 			if (f & FETCH_NO_INDIRECTION)
 				return arg_imm(zp);
 
-			uint16_t base = le_to_native(cpu->mem[zp], cpu->mem[zp + 1]);
+			uint16_t base = le_to_native(cpu->mem[scr_dirty(cpu, zp)], cpu->mem[scr_dirty(cpu, zp + 1)]);
 			return arg_ptr(cpu, f, base + cpu->regs[Y]);
 		}
 
@@ -526,11 +535,12 @@
 	}
 }
 
-inline void step(cpu_t *cpu)
+void step(cpu_t *cpu)
 {
 	static int steps;
 	steps++;
-	switch (cpu->mem[cpu->pc++])
+	cpu->screen_dirty = false;
+	switch (cpu->mem[scr_dirty(cpu, cpu->pc++)])
 	{
 #define INST(mn, am, op) \
 		case op: \
@@ -548,10 +558,13 @@
 	if (steps % 100 == 0)
 		printf("%d\n", steps);
 
+// If the screen can't run in a separate thread, just run it here (bad)
+#ifdef NO_PTHREAD
 	if (g_scr)
 	{
-		sdl_screen(g_scr, cpu->mem + CPU_FB_ADDR);
+		sdl_screen(g_scr, cpu->mem + CPU_FB_ADDR, cpu->screen_dirty);
 	}
+#endif
 }
 
 int dump_inst(cpu_t *cpu, char *buf, const char *mn, uint16_t addr, uint8_t am)
@@ -601,7 +614,7 @@
 	char *end = buffer;
 
 	// end += sprintf(buffer, "$%x", cpu->pc);
-	uint8_t op = cpu->mem[cpu->pc++];
+	uint8_t op = cpu->mem[scr_dirty(cpu, cpu->pc++)];
 	switch (op)
 	{
 #define INST(mn, am, op) \
diff --git a/cpu.h b/cpu.h
index cee84c5..5b91ed4 100644
--- a/cpu.h
+++ b/cpu.h
@@ -128,6 +128,7 @@
 	status_t status;
 	uint8_t *mem;
 	bool running;
+	bool screen_dirty;
 } cpu_t;
 
 // Argument type, includes both pointer and its value
diff --git a/main.c b/main.c
index 84e9779..e91e699 100644
--- a/main.c
+++ b/main.c
@@ -92,8 +92,12 @@
 
 	if (scrflag)
 	{
+#ifndef NO_PTHREAD
+		start_screen_thread(cpu.mem + CPU_FB_ADDR);
+#else
 		sdl_screen_t scr = new_sdl_screen(8);
 		g_scr = &scr;
+#endif
 	}
 
 	if (guiflag)
@@ -113,6 +117,9 @@
 		debug(&cpu);
 	}
 	
+	if (g_scr) // only assigned in the NO_PTHREAD path; the screen thread frees its own screen
+		free_sdl_screen(g_scr);
+
 	if (should_read)
 		free_cpu(&cpu);
 }
diff --git a/screen.c b/screen.c
index fedc151..e2bf5e0 100644
--- a/screen.c
+++ b/screen.c
@@ -3,6 +3,10 @@
 
 #include <SDL2/SDL.h>
 
+#ifndef NO_PTHREAD
+#include <pthread.h>
+#endif
+
 struct nk_color byte_to_color(uint8_t b)
 {
 	struct nk_color c;
@@ -50,13 +54,27 @@
 		0);
 	scr.size = size;
 	scr.r = SDL_CreateRenderer(scr.win, -1, SDL_RENDERER_ACCELERATED | SDL_RENDERER_PRESENTVSYNC);
+	scr.tex = SDL_CreateTexture(scr.r, SDL_PIXELFORMAT_RGB332, SDL_TEXTUREACCESS_STREAMING,
+		CPU_FB_W, CPU_FB_H);
 
 	return scr;
 }
 
-void sdl_screen(sdl_screen_t *scr, uint8_t *mem)
+void free_sdl_screen(sdl_screen_t *scr)
 {
-	SDL_RenderClear(scr->r);
+	//free(scr->fb);
+	SDL_DestroyTexture(scr->tex);
+	SDL_DestroyRenderer(scr->r);
+	SDL_DestroyWindow(scr->win);
+}
+
+bool sdl_screen(sdl_screen_t *scr, uint8_t *mem, bool dirty)
+{
+	static bool texture_set = false;
+	if (!texture_set)
+	{
+		SDL_UpdateTexture(scr->tex, NULL, mem, CPU_FB_W);
+	}
 
 	SDL_Event e;
 
@@ -65,28 +83,42 @@
 		switch (e.type)
 		{
 			case SDL_QUIT:
-				exit(0);
+				return true;
 		}
 	}
 
-	for (int i = 0; i < CPU_FB_H; i++)
+	if (dirty)
 	{
-		for (int j = 0; j < CPU_FB_W; j++)
-		{
-			SDL_Rect r =
-			{
-				i * scr->size,
-				j * scr->size,
-				scr->size,
-				scr->size,
-			};
-
-			struct nk_color c = byte_to_color(mem[i + CPU_FB_H * j]);
-
-			SDL_SetRenderDrawColor(scr->r, c.r, c.g, c.b, c.a);
-			SDL_RenderFillRect(scr->r, &r);
-		}
+		SDL_RenderClear(scr->r);
+		SDL_RenderCopy(scr->r, scr->tex, NULL, NULL);
+		SDL_RenderPresent(scr->r);	
 	}
 
-	SDL_RenderPresent(scr->r);	
+	return false;
 }
+
+
+#ifndef NO_PTHREAD
+
+void *screen_thread(uint8_t *mem)
+{
+	sdl_screen_t scr = new_sdl_screen(8);
+	while (true)
+	{
+		if (sdl_screen(&scr, mem, true))
+			break;
+	}
+	free_sdl_screen(&scr);
+
+	exit(0);
+
+	return NULL;
+}
+
+void start_screen_thread(uint8_t *mem)
+{
+	pthread_t thread;
+	pthread_create(&thread, NULL, (void *(*)(void *))&screen_thread, mem);
+}
+
+#endif
diff --git a/screen.h b/screen.h
index 4128005..75dc8cf 100644
--- a/screen.h
+++ b/screen.h
@@ -1,6 +1,7 @@
 #pragma once
 
 #include <stdint.h>
+#include <stdbool.h>
 
 #ifndef SCREEN_ONLY_SDL
 
@@ -25,9 +26,16 @@
 {
 	SDL_Window *win;
 	SDL_Renderer *r;
+	SDL_Texture *tex;
 	uint8_t size;
 } sdl_screen_t;
 
 // draw the CPU screen
 sdl_screen_t new_sdl_screen(uint8_t size);
-void sdl_screen(sdl_screen_t *scr, uint8_t *mem);
+void free_sdl_screen(sdl_screen_t *scr);
+bool sdl_screen(sdl_screen_t *scr, uint8_t *mem, bool dirty);
+
+#ifndef NO_PTHREAD
+void *screen_thread(uint8_t *mem);
+void start_screen_thread(uint8_t *mem);
+#endif