From f7e804607fba7a762872ee3c848f003216fab97c Mon Sep 17 00:00:00 2001 From: Seth Hamilton Date: Wed, 10 Sep 2025 00:46:50 -0500 Subject: [PATCH] Debugged multithreaded version. Now investigating some performance issues (not every thread is being used). This is an interesting version. --- debug.rad | 8 +++ inc/genetic.h | 176 +++++++++++++++++++++++++++++++------------------- inc/sync.h | 93 ++++++++++++++++++++++++++ inc/util.h | 7 +- src/main.cpp | 11 ++-- 5 files changed, 221 insertions(+), 74 deletions(-) diff --git a/debug.rad b/debug.rad index a3ec9c9..71af2bd 100644 --- a/debug.rad +++ b/debug.rad @@ -2,7 +2,9 @@ recent_file: path: "inc/genetic.h" recent_file: path: "inc/sync.h" +recent_file: path: "d:/os/obj/amd64fre/minkernel/crts/ucrt/src/appcrt/startup/mt/objfre/amd64/minkernel/crts/ucrt/src/appcrt/startup/abort.cpp" recent_file: path: "src/main.cpp" +recent_file: path: "d:/os/obj/amd64fre/minkernel/crts/ucrt/src/appcrt/startup/mt/objfre/amd64/minkernel/crts/ucrt/src/appcrt/startup/assert.cpp" recent_file: path: "../../../../../Program Files/Microsoft Visual Studio/2022/Community/VC/Tools/MSVC/14.44.35207/include/vector" recent_file: path: "d:/os/obj/amd64fre/minkernel/crts/ucrt/src/appcrt/misc/mt/objfre/amd64/minkernel/crts/ucrt/src/appcrt/misc/invalid_parameter.cpp" recent_file: path: "../../../../../Program Files/Microsoft Visual Studio/2022/Community/VC/Tools/MSVC/14.44.35207/include/xmemory" @@ -14,4 +16,10 @@ target: working_directory: bin label: main enabled: 1 + arguments: 1 +} +breakpoint: +{ + source_location: "inc/genetic.h:292:1" + hit_count: 1 } diff --git a/inc/genetic.h b/inc/genetic.h index f97a741..e08b88c 100644 --- a/inc/genetic.h +++ b/inc/genetic.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include "util.h" @@ -30,6 +31,12 @@ template struct Strategy { // Number of times (epochs) to run the algorithm int num_generations; + // Each thread will integrate the best globally performing cell + bool share_breakthroughs; + + // How many generations to explore before resyncing with the global best + int share_breakthrough_gen_period; + bool test_all; // Sets whether or not every cell's fitness is evaluated every // generation float test_chance; // Chance to test any given cell's fitness. Relevant only @@ -66,16 +73,11 @@ template struct Stats { DynArray best_cell_fitness; int gen; bool done; - TimeSpan start, end; - TimeSpan total_crossover_time; - int total_crossovers; - TimeSpan total_mutate_time; - int total_mutates; - TimeSpan total_fitness_time; - int total_evaluations; - TimeSpan total_sorting_time; - int total_sorts; - + DynArray gen_time; + DynArray crossover_time; + DynArray mutate_time; + DynArray fitness_time; + DynArray sorting_time; Mutex m; }; @@ -90,6 +92,10 @@ struct WorkerThreadArgs { Array cells; Array trackers; Stats *stats; + + Mutex m; + float *best_global_score; + T* best_global_cell; }; template T* _cellp(Array cells, CellTracker tracker) { return &cells[tracker.cellid]; } @@ -101,6 +107,9 @@ template DWORD worker(LPVOID args) { Array cells = worker_args->cells; Array trackers = worker_args->trackers; Stats &stats = *worker_args->stats; + float* best_global_score = worker_args->best_global_score; + T* best_global_cell = worker_args->best_global_cell; + Mutex best_m = worker_args->m; // Prepare crossover operations as these will be the same every time except // for the exact cell pointers @@ -109,9 +118,29 @@ template DWORD worker(LPVOID args) { Array parents = make_array(npar); Array children = make_array(nchild); - TimeSpan start_algo = now(); - TimeSpan start; + bool gt = strat.higher_fitness_is_better; // Writing strat.higher... is annoying + + // printf("Core: %d\n", get_affinity()); + + TimeSpan start, diff, gen_start; while(stats.gen < strat.num_generations) { + gen_start = now(); + + // 0. Share/Integrate global breakthrough + if (strat.share_breakthroughs && (stats.gen + get_affinity()) % strat.share_breakthrough_gen_period) { + lock(best_m); + if (better(gt, front(trackers).score, *best_global_score) != *best_global_score) { + // Share + *best_global_cell = *_cellp(cells, trackers[0]); + *best_global_score = trackers[0].score; + + } else { + // Integrate + *_cellp(cells, trackers[0]) = *best_global_cell; + trackers[0].score = *best_global_score; + } + unlock(best_m); + } // 1. crossover start = now(); @@ -119,14 +148,19 @@ template DWORD worker(LPVOID args) { int parent_end = npar; int child_begin = trackers.len-nchild; while (parent_end <= child_begin) { + // Get pointers to all the parent cells for (int i = parent_end-npar; i < parent_end; i++) { - parents[i - (parent_end-npar)] = _cellp(cells, trackers[i]); + T* cell = _cellp(cells, trackers[i]); + assert(cell != NULL); + parents[i - (parent_end-npar)] = cell; } // Get pointers to all the child cells (these will be overwritten) for (int i = child_begin; i < child_begin+nchild; i++) { - children[i-child_begin] = _cellp(cells, trackers[i]); + T* cell = _cellp(cells, trackers[i]); + assert(cell != NULL); + children[i-child_begin] = cell; } strat.crossover(parents, children); parent_end += strat.crossover_parent_stride; @@ -134,8 +168,7 @@ template DWORD worker(LPVOID args) { } } lock(stats.m); - stats.total_crossover_time = stats.total_crossover_time + (now() - start); - stats.total_crossovers++; + append(stats.crossover_time, now() - start); unlock(stats.m); @@ -147,8 +180,7 @@ template DWORD worker(LPVOID args) { } } lock(stats.m); - stats.total_mutate_time = stats.total_mutate_time + (now() - start); - stats.total_mutates++; + append(stats.mutate_time, now() - start); unlock(stats.m); // 3. evaluate @@ -165,67 +197,63 @@ template DWORD worker(LPVOID args) { } } lock(stats.m); - stats.total_fitness_time = stats.total_fitness_time + (now() - start); - stats.total_evaluations++; + append(stats.fitness_time, now() - start); unlock(stats.m); // 4. sort start = now(); - std::sort(&trackers[0], &trackers[trackers.len-1], [strat](CellTracker &a, CellTracker &b){ return strat.higher_fitness_is_better ? a.score > b.score : a.score < b.score; }); + std::sort(&trackers[0], &trackers[trackers.len-1], [strat](CellTracker &a, CellTracker &b){ return better(strat.higher_fitness_is_better, a.score, b.score) == a.score; }); lock(stats.m); - stats.total_sorting_time = stats.total_sorting_time + (now() - start); - stats.total_sorts++; + append(stats.sorting_time, now() - start); append(stats.best_cells, cells[trackers[0].cellid]); append(stats.best_cell_fitness, trackers[0].score); + append(stats.gen_time, now() - gen_start); stats.gen++; unlock(stats.m); } stats.done = true; - stats.end = now(); return 0; } template T run(Strategy strat) { Array> stats = make_array>(strat.num_threads); Array threads = make_array(strat.num_threads); - Array cells = make_array(strat.num_threads*strat.num_cells_per_thread); - Array trackers = make_array(cells.len); - Array> args = make_array>(strat.num_threads); - for (int i = 0; i < cells.len; i++) { - cells[i] = strat.make_default_cell(); - trackers[i] = {0, i}; - } + float best_global_score = strat.higher_fitness_is_better ? FLT_MIN : FLT_MAX; + T best_global_cell; + + allow_all_processors(); + set_affinity(0); for (int i = 0; i < strat.num_threads; i++) { stats[i] = { .best_cells=make_dynarray(strat.num_generations), .best_cell_fitness=make_dynarray(strat.num_generations), - .gen=0, - .done=false, - .start=from_s(0), - .end=from_s(0), - .total_crossover_time=from_s(0), - .total_crossovers=0, - .total_mutate_time=from_s(0), - .total_mutates=0, - .total_fitness_time=from_s(0), - .total_evaluations=0, - .total_sorting_time=from_s(0), - .total_sorts=0, + .gen_time=make_dynarray(strat.num_generations), + .crossover_time=make_dynarray(strat.num_generations), + .mutate_time=make_dynarray(strat.num_generations), + .fitness_time=make_dynarray(strat.num_generations), + .sorting_time=make_dynarray(strat.num_generations), .m=make_mutex() }; - Array tcells = { &cells[i*strat.num_cells_per_thread], strat.num_cells_per_thread }; - Array ttrackers = { &trackers[i*strat.num_cells_per_thread], strat.num_cells_per_thread }; + Array cells = make_array(strat.num_threads*strat.num_cells_per_thread); + Array trackers = make_array(strat.num_cells_per_thread); + for (int i = 0; i < strat.num_cells_per_thread; i++) { + cells[i] = strat.make_default_cell(); + trackers[i] = {0, i}; + } args[i].strat=strat; - args[i].cells=tcells; - args[i].trackers=ttrackers; + args[i].cells=cells; + args[i].trackers=trackers; args[i].stats=&stats[i]; + args[i].best_global_score=&best_global_score; + args[i].best_global_cell=&best_global_cell; + args[i].m = make_mutex(); - threads[i] = make_thread(worker, &args[i]); + threads[i] = make_thread(worker, &args[i], i+1); } // We are the stats thread @@ -234,12 +262,14 @@ template T run(Strategy strat) { sleep(from_s(strat.stats_print_period_s)); printf("**********************\n"); + float g_avg_gen_time = 0; float g_avg_crossover_time = 0; float g_avg_mutate_time = 0; float g_avg_fitness_time = 0; float g_avg_sorting_time = 0; + float g_avg_overhead_time = 0; float g_progress_per = 0; - float g_best_fitness = strat.higher_fitness_is_better ? 0.0 : 999999999999999999.9; + float g_best_fitness = strat.higher_fitness_is_better ? FLT_MIN : FLT_MAX; complete = true; @@ -247,43 +277,57 @@ template T run(Strategy strat) { lock(stats[i].m); complete &= stats[i].done; - float avg_crossover_time = to_s(stats[i].total_crossover_time) / static_cast(stats[i].total_crossovers); - - float avg_mutate_time = to_s(stats[i].total_mutate_time) / static_cast(stats[i].total_mutates); - - float avg_fitness_time = to_s(stats[i].total_fitness_time) / static_cast(stats[i].total_evaluations); - - float avg_sorting_time = to_s(stats[i].total_sorting_time) / static_cast(stats[i].total_sorts); + int end = stats[i].gen_time.end-1; + float gen_time = to_s(stats[i].gen_time[end]); + float crossover_time = to_s(stats[i].crossover_time[end]); + float mutate_time = to_s(stats[i].mutate_time[end]); + float fitness_time = to_s(stats[i].fitness_time[end]); + float sorting_time = to_s(stats[i].sorting_time[end]); float progress_per = static_cast(stats[i].gen) / static_cast(strat.num_generations) * 100; - float best_score = back(stats[i].best_cell_fitness); - g_avg_crossover_time += avg_crossover_time; - g_avg_mutate_time += avg_mutate_time; - g_avg_fitness_time += avg_fitness_time; - g_avg_sorting_time += avg_sorting_time; - g_progress_per += progress_per; - g_best_fitness = strat.higher_fitness_is_better ? max(best_score, g_best_fitness) : min(best_score, g_best_fitness); + float overhead = max(0, gen_time - (crossover_time + mutate_time + fitness_time + sorting_time)); - printf("THREAD %d, Progress %.1f\%, Top Score %.5e, Cross %.5f (s), Mutate: %.5f (s), Fitness: %.5f (s), Sorting: %.5f (s)\n", i, progress_per, best_score, avg_crossover_time, avg_mutate_time, avg_fitness_time, avg_sorting_time); + float overhead_per = overhead / gen_time * 100; + + g_avg_gen_time += gen_time; + g_avg_crossover_time += crossover_time; + g_avg_mutate_time += mutate_time; + g_avg_fitness_time += fitness_time; + g_avg_sorting_time += sorting_time; + g_progress_per += progress_per; + g_best_fitness = better(strat.higher_fitness_is_better, best_score, g_best_fitness); + + g_avg_overhead_time += overhead; + + printf("%d, Progress %d/%d, Top: %.5e, Overhead Per: %.4f%%, Gen: %.4f, Overhead: %.4f, Cross: %.4f (s), Mutate: %.4f (s), Fitness: %.4f (s), Sorting: %.4f (s)\n", i, stats[i].gen, strat.num_generations, best_score, overhead_per, gen_time, overhead, crossover_time, mutate_time, fitness_time, sorting_time); unlock(stats[i].m); } + g_avg_gen_time /= stats.len; g_avg_crossover_time /= stats.len; g_avg_mutate_time /= stats.len; g_avg_fitness_time /= stats.len; g_avg_sorting_time /= stats.len; g_progress_per /= stats.len; - printf("OVERALL, Progress %.1f\%, Top Score: %.5e, Cross %.5f (s), Mutate: %.5f (s), Fitness: %.5f (s), Sorting: %.5f (s)\n", g_progress_per, g_best_fitness, g_avg_crossover_time, g_avg_mutate_time, g_avg_fitness_time, g_avg_sorting_time); + g_avg_overhead_time /= stats.len; + + float g_avg_overhead_per = g_avg_overhead_time / g_avg_gen_time * 100; + + printf("GLOBAL, Progress %.1f%%, Top: %.5e, Overhead Per: %.4f%%, Gen: %.4f, Overhead: %.4f, Cross: %.4f (s), Mutate: %.4f (s), Fitness: %.4f (s), Sorting: %.4f (s)\n", g_progress_per, g_best_fitness, g_avg_overhead_per, g_avg_gen_time, g_avg_overhead_time, g_avg_crossover_time, g_avg_mutate_time, g_avg_fitness_time, g_avg_sorting_time); if (complete) break; } + for (int i = 0; i < threads.len; i++) { + join(threads[i]); + } + T best_cell; // TODO: bad - float best_score = strat.higher_fitness_is_better ? 0.0 : 999999999999999999.9; + float best_score = strat.higher_fitness_is_better ? FLT_MIN : FLT_MAX; for (int i = 0; i < stats.len; i++) { float score = back(stats[i].best_cell_fitness); if (strat.higher_fitness_is_better ? score > best_score : score < best_score) { diff --git a/inc/sync.h b/inc/sync.h index 4f28a24..9da147f 100644 --- a/inc/sync.h +++ b/inc/sync.h @@ -1,5 +1,9 @@ #pragma once +#include +#include +#include + #ifdef _WIN32 #include #endif @@ -17,6 +21,14 @@ typedef LPVOID ThreadArg; const TimeSpan infinite_ts = { .QuadPart = LLONG_MAX }; +int get_num_cores() { + SYSTEM_INFO sysinfo; + GetSystemInfo(&sysinfo); + return sysinfo.dwNumberOfProcessors; +} + +const int num_cores = get_num_cores(); + LARGE_INTEGER _init_freq() { LARGE_INTEGER freq; QueryPerformanceFrequency(&freq); @@ -27,8 +39,13 @@ static LARGE_INTEGER freq = _init_freq(); #endif Thread make_thread(ThreadFunc t, ThreadArg a); +Thread make_thread(ThreadFunc t, ThreadArg a, int core_affinity); void join(Thread t); void sleep(TimeSpan ts); +void allow_all_processors(); +void set_affinity(Thread &t, int core); +void set_affinity(int core); +int get_affinity(); Mutex make_mutex(); void lock(Mutex &m); @@ -64,11 +81,60 @@ double to_hours(TimeSpan &ts); #ifdef _WIN32 +uint64_t bitmask (unsigned short n) { + if (n == 64) return -((uint64_t)1); + return (((uint64_t) 1) << n) - 1; +} + +const int tab64[64] = { + 63, 0, 58, 1, 59, 47, 53, 2, + 60, 39, 48, 27, 54, 33, 42, 3, + 61, 51, 37, 40, 49, 18, 28, 20, + 55, 30, 34, 11, 43, 14, 22, 4, + 62, 57, 46, 52, 38, 26, 32, 41, + 50, 36, 17, 19, 29, 10, 13, 21, + 56, 45, 25, 31, 35, 16, 9, 12, + 44, 24, 15, 8, 23, 7, 6, 5}; + +int log2_64 (uint64_t value) +{ + value |= value >> 1; + value |= value >> 2; + value |= value >> 4; + value |= value >> 8; + value |= value >> 16; + value |= value >> 32; + return tab64[((uint64_t)((value - (value >> 1))*0x07EDD5E59A4E28C2)) >> 58]; +} + Thread make_thread(ThreadFunc f, ThreadArg a) { DWORD tid; return CreateThread(NULL, 0, f, a, 0, &tid); } +struct DummyThreadArgs { + int core_affinity; + ThreadFunc f; + ThreadArg a; +}; + +DWORD _dummy_thread(LPVOID a) { + DummyThreadArgs *wrap = static_cast(a); + set_affinity(wrap->core_affinity); + return wrap->f(wrap->a); +} + +Thread make_thread(ThreadFunc f, ThreadArg a, int core_affinity) { + DWORD tid; + DummyThreadArgs *args = (DummyThreadArgs*)malloc(sizeof(DummyThreadArgs)); + *args = { + .core_affinity=core_affinity, + .f=f, + .a=a + }; + return CreateThread(NULL, 0, _dummy_thread, args, 0, &tid); +} + void join(Thread t) { WaitForSingleObject(t, INFINITE); } @@ -77,6 +143,33 @@ void sleep(TimeSpan ts) { Sleep(static_cast(to_ms(ts))); } +void allow_all_processors() { + Thread t = GetCurrentThread(); + DWORD affinity = bitmask(num_cores); + SetProcessAffinityMask(t, affinity); +} + +void set_affinity(Thread &t, int core) { + DWORD mask = 1 << (core % num_cores); + DWORD old = SetThreadAffinityMask(t, mask); + DWORD confirm = SetThreadAffinityMask(t, mask); + assert(old && GetLastError() != ERROR_INVALID_PARAMETER && mask == confirm); +} + +void set_affinity(int core) { + Thread cur = GetCurrentThread(); + set_affinity(cur, core); +} + +int get_affinity() { + Thread t = GetCurrentThread(); + DWORD mask = 1; + DWORD affinity = SetThreadAffinityMask(t, (DWORD_PTR)mask); + DWORD check = SetThreadAffinityMask(t, (DWORD_PTR)affinity); + assert(check == mask); + return log2_64(affinity); +} + Mutex make_mutex() { Mutex m; InitializeCriticalSection(&m); diff --git a/inc/util.h b/inc/util.h index ed8326a..fea5409 100644 --- a/inc/util.h +++ b/inc/util.h @@ -3,6 +3,7 @@ #include #define min(A, B) ((A < B) ? (A) : (B)) #define max(A, B) ((A > B) ? (A) : (B)) +#define better(GT, A, B) (GT ? max((A), (B)) : min((A), (B))) template struct Array { T *data; @@ -18,6 +19,8 @@ template Array make_array(int len) { .len=len }; } +template T back(Array &a) { return a.data[a.len-1]; } +template T front(Array &a) { return a.data[0]; } template struct DynArray { T* _data; @@ -48,6 +51,6 @@ template void append(DynArray &a, T el) { a[a.end++] = el; } -template T& back(DynArray &a) { return a._data[a.end-1]; } -template T& front(DynArray &a) { return a._data[0]; } +template T back(DynArray &a) { return a._data[a.end-1]; } +template T front(DynArray &a) { return a._data[0]; } diff --git a/src/main.cpp b/src/main.cpp index 77f1d91..cad03e7 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -41,9 +41,6 @@ void crossover(const Array*> parents, const Array *> o } } -// norm_rand can go negative. fix in genetic.cpp -// child stride doesn't make sense. Should always skip over child num - float fitness(const Array &cell) { float sum = 0; float product = 1; @@ -55,12 +52,14 @@ float fitness(const Array &cell) { } int main(int argc, char **argv) { - int num_gens = 1000; + int num_gens = 10000; Strategy> strat { - .num_threads = 1, + .num_threads = atoi(argv[1]), .stats_print_period_s = 2, - .num_cells_per_thread = 10000, + .num_cells_per_thread = 100000, .num_generations = num_gens, + .share_breakthroughs=true, + .share_breakthrough_gen_period=10, .test_all = true, .test_chance = 0.0, // doesn't matter .enable_crossover = true,