Merge branch 'alternate_multithreaded'

This commit is contained in:
2025-09-10 08:35:40 -05:00
5 changed files with 448 additions and 181 deletions

View File

@@ -1,9 +1,10 @@
#pragma once
#include <algorithm>
#include <cfloat>
#include <cstdlib>
#include <vector>
#include "util.h"
#include "sync.h"
#include "rand.h"
@@ -12,21 +13,31 @@ using namespace std;
namespace genetic {
template <class T> struct Array;
template <class T> struct Stats;
template <class T> struct Strategy;
struct CellTracker;
template <class T> Stats<T> run(Strategy<T>);
template <class T> T run(Strategy<T>);
template <class T> struct Strategy {
// Number of worker threads that will be evaluating cell fitness
int num_threads;
int batch_size; // Number of cells a worker thread tries to work on in a row
// before accessing/locking the work queue again.
int num_cells; // Size of the population pool
int num_generations; // Number of times (epochs) to run the algorithm
// Period of print statements (in seconds)
float stats_print_period_s;
// Size of the population pool per sim thread
int num_cells_per_thread;
// Number of times (epochs) to run the algorithm
int num_generations;
// Each thread will integrate the best globally performing cell
bool share_breakthroughs;
// How many generations to explore before resyncing with the global best
int share_breakthrough_gen_period;
bool test_all; // Sets whether or not every cell's fitness is evaluated every
// generation
float test_chance; // Chance to test any given cell's fitness. Relevant only
@@ -59,10 +70,16 @@ template <class T> struct Strategy {
};
template<class T> struct Stats {
std::vector<T> best_cell;
std::vector<float> best_cell_fitness;
TimeSpan setup_time;
TimeSpan run_time;
DynArray<T> best_cells;
DynArray<float> best_cell_fitness;
int gen;
bool done;
DynArray<TimeSpan> gen_time;
DynArray<TimeSpan> crossover_time;
DynArray<TimeSpan> mutate_time;
DynArray<TimeSpan> fitness_time;
DynArray<TimeSpan> sorting_time;
Mutex m;
};
struct CellTracker {
@@ -70,150 +87,81 @@ struct CellTracker {
int cellid;
};
template <class T> struct Array {
T *data;
int len;
T &operator[](int i) { return data[i]; }
};
template <class T> Array<T> make_array(int len) {
return {
.data = (T*)malloc(sizeof(T)*len),
.len = len
};
}
template<class T>
struct MutateJob {
T* cell;
};
template<class T>
struct CrossoverJob {
Array<T*> parents;
Array<T*> children;
};
template<class T>
struct FitnessJob {
T* cell;
CellTracker* track;
};
enum class JobType {
MUTATE,
CROSSOVER,
FITNESS
};
template<class T>
union Job {
MutateJob<T> m;
CrossoverJob<T> c;
FitnessJob<T> f;
};
// Yes. I am aware of variant
// For some reason I like this better
template<class T>
struct TaggedJob {
Job<T> data;
JobType type;
};
template<class T>
struct WorkQueue {
Array<TaggedJob<T>> jobs;
int read_i, write_i, batch_size;
bool done_writing, work_complete, stop; // These catch some edge conditions
Mutex m;
ConditionVar done;
ConditionVar jobs_ready;
};
template<class T>
struct WorkerThreadArgs {
WorkQueue<T> &q;
Strategy<T> &s;
Strategy<T> strat;
Array<T> cells;
Array<CellTracker> trackers;
Stats<T> *stats;
Mutex m;
float *best_global_score;
T* best_global_cell;
};
template<class T> WorkQueue<T> make_work_queue(int len, int batch_size);
template<class T> bool tryget_job_batch(WorkQueue<T> &q, int len, Array<TaggedJob<T>>* out_batch, bool* out_batch_is_end);
template<class T> T* _cellp(Array<T> cells, CellTracker tracker) { return &cells[tracker.cellid]; }
template<class T>
DWORD worker(LPVOID args);
template <class T> DWORD worker(LPVOID args) {
// Unpack everything...
WorkerThreadArgs<T>* worker_args = static_cast<WorkerThreadArgs<T>*>(args);
Strategy<T> strat = worker_args->strat;
Array<T> cells = worker_args->cells;
Array<CellTracker> trackers = worker_args->trackers;
Stats<T> &stats = *worker_args->stats;
float* best_global_score = worker_args->best_global_score;
T* best_global_cell = worker_args->best_global_cell;
Mutex best_m = worker_args->m;
template <class T> Stats<T> run(Strategy<T> strat) {
Stats<T> stats;
// Prepare crossover operations as these will be the same every time except
// for the exact cell pointers
int npar = strat.crossover_parent_num;
int nchild = strat.crossover_children_num;
Array<T*> parents = make_array<T*>(npar);
Array<T*> children = make_array<T*>(nchild);
// ************* SETUP **************
TimeSpan start_setup = now();
bool gt = strat.higher_fitness_is_better; // Writing strat.higher... is annoying
// Create cells
Array<T> cells = make_array<T>(strat.num_cells);
for (int i = 0; i < cells.len; i++) cells[i] = strat.make_default_cell();
// printf("Core: %d\n", get_affinity());
// Create cell trackers
Array<CellTracker> trackers = make_array<CellTracker>(strat.num_cells);
for (int i = 0; i < trackers.len; i++) trackers[i] = { .score=0, .cellid=i };
TimeSpan start, diff, gen_start;
while(stats.gen < strat.num_generations) {
gen_start = now();
// Create work queue
// Worst case size is every cell mutated, crossed, and evaluated...? Not quite, but 3x should be upper bound
WorkQueue<T> q = make_work_queue<T>(3*strat.num_cells, strat.batch_size);
WorkerThreadArgs<T> args = {q, strat};
// Create worker threads
Thread *threads = (Thread*)malloc(sizeof(Thread*)*strat.num_threads);
for (int i = 0; i < strat.num_threads; i++) {
threads[i] = make_thread(worker<T>, &args);
}
stats.setup_time = now() - start_setup;
// *********** ALGORITHM ************
TimeSpan start_algo = now();
for (int gen = 0; gen < strat.num_generations; gen++) {
// Reset work queue
lock(q.m);
q.read_i = 0;
q.write_i = 0;
q.work_complete = false;
q.done_writing = false;
unlock(q.m);
// 1. mutate
for (int i = 0; i < trackers.len; i++) {
if (abs(norm_rand(strat.rand_seed)) < strat.mutation_chance) {
MutateJob<T> mj = {&cells[trackers[i].cellid]};
TaggedJob<T> job;
job.data.m = mj;
job.type=JobType::MUTATE;
q.jobs[q.write_i++] = job;
// 0. Share/Integrate global breakthrough
if (strat.share_breakthroughs && (stats.gen + get_affinity()) % strat.share_breakthrough_gen_period) {
lock(best_m);
if (better(gt, front(trackers).score, *best_global_score) != *best_global_score) {
// Share
*best_global_cell = *_cellp(cells, trackers[0]);
*best_global_score = trackers[0].score;
} else {
// Integrate
*_cellp(cells, trackers[0]) = *best_global_cell;
trackers[0].score = *best_global_score;
}
unlock(best_m);
}
wake_all(q.jobs_ready); // There are available jobs for the worker threads!
// 2. crossover
// 1. crossover
start = now();
if (strat.enable_crossover) {
int npar = strat.crossover_parent_num;
int nchild = strat.crossover_children_num;
int parent_end = npar;
int child_begin = trackers.len-nchild;
while (parent_end <= child_begin) {
// TODO: Variable size arrays please. This is rediculous.
Array<T*> parents = make_array<T*>(npar);
Array<T*> children = make_array<T*>(nchild);
// Get pointers to all the parent cells
for (int i = parent_end-npar; i < parent_end; i++) {
parents[i - (parent_end-npar)] = &cells[trackers[i].cellid];
T* cell = _cellp(cells, trackers[i]);
assert(cell != NULL);
parents[i - (parent_end-npar)] = cell;
}
// Get pointers to all the child cells (these will be overwritten)
for (int i = child_begin; i < child_begin+nchild; i++) {
children[i-child_begin] = &cells[trackers[i].cellid];
T* cell = _cellp(cells, trackers[i]);
assert(cell != NULL);
children[i-child_begin] = cell;
}
CrossoverJob<T> cj = {parents, children};
TaggedJob<T> job;
@@ -223,10 +171,25 @@ template <class T> Stats<T> run(Strategy<T> strat) {
parent_end += strat.crossover_parent_stride;
child_begin -= nchild;
}
wake_all(q.jobs_ready); // There are available jobs for the worker threads!
}
lock(stats.m);
append(stats.crossover_time, now() - start);
unlock(stats.m);
// 2. mutate
start = now();
for (int i = 0; i < trackers.len; i++) {
if (abs(norm_rand(strat.rand_seed)) < strat.mutation_chance) {
strat.mutate(cells[trackers[i].cellid]);
}
}
lock(stats.m);
append(stats.mutate_time, now() - start);
unlock(stats.m);
// 3. evaluate
start = now();
if (strat.test_all) {
for (int i = 0; i < trackers.len; i++) {
FitnessJob<T> fj = {&cells[trackers[i].cellid], &trackers[i]};
@@ -251,29 +214,147 @@ template <class T> Stats<T> run(Strategy<T> strat) {
q.done_writing = true;
unlock(q.m);
}
wake_all(q.jobs_ready);
// Wait until the work is finished
lock(q.m);
if (!q.work_complete)
wait(q.done, q.m, infinite_ts);
unlock(q.m);
lock(stats.m);
append(stats.fitness_time, now() - start);
unlock(stats.m);
// 4. sort
std::sort(&trackers[0], &trackers[trackers.len-1], [strat](CellTracker &a, CellTracker &b){ return strat.higher_fitness_is_better ? a.score > b.score : a.score < b.score; });
start = now();
std::sort(&trackers[0], &trackers[trackers.len-1], [strat](CellTracker &a, CellTracker &b){ return better(strat.higher_fitness_is_better, a.score, b.score) == a.score; });
lock(stats.m);
append(stats.sorting_time, now() - start);
printf("Gen: %d, Best Score: %f\n", gen, trackers[0].score);
stats.best_cell.push_back(cells[trackers[0].cellid]);
stats.best_cell_fitness.push_back(trackers[0].score);
append(stats.best_cells, cells[trackers[0].cellid]);
append(stats.best_cell_fitness, trackers[0].score);
append(stats.gen_time, now() - gen_start);
stats.gen++;
unlock(stats.m);
}
stats.done = true;
return 0;
}
template <class T> T run(Strategy<T> strat) {
Array<Stats<T>> stats = make_array<Stats<T>>(strat.num_threads);
Array<Thread> threads = make_array<Thread>(strat.num_threads);
Array<WorkerThreadArgs<T>> args = make_array<WorkerThreadArgs<T>>(strat.num_threads);
float best_global_score = strat.higher_fitness_is_better ? FLT_MIN : FLT_MAX;
T best_global_cell;
allow_all_processors();
set_affinity(0);
for (int i = 0; i < strat.num_threads; i++) {
stats[i] = {
.best_cells=make_dynarray<T>(strat.num_generations),
.best_cell_fitness=make_dynarray<float>(strat.num_generations),
.gen_time=make_dynarray<TimeSpan>(strat.num_generations),
.crossover_time=make_dynarray<TimeSpan>(strat.num_generations),
.mutate_time=make_dynarray<TimeSpan>(strat.num_generations),
.fitness_time=make_dynarray<TimeSpan>(strat.num_generations),
.sorting_time=make_dynarray<TimeSpan>(strat.num_generations),
.m=make_mutex()
};
Array<T> cells = make_array<T>(strat.num_threads*strat.num_cells_per_thread);
Array<CellTracker> trackers = make_array<CellTracker>(strat.num_cells_per_thread);
for (int i = 0; i < strat.num_cells_per_thread; i++) {
cells[i] = strat.make_default_cell();
trackers[i] = {0, i};
}
args[i].strat=strat;
args[i].cells=cells;
args[i].trackers=trackers;
args[i].stats=&stats[i];
args[i].best_global_score=&best_global_score;
args[i].best_global_cell=&best_global_cell;
args[i].m = make_mutex();
threads[i] = make_thread(worker<T>, &args[i], i+1);
}
q.stop = true;
wake_all(q.jobs_ready);
// TODO: join all threads
// We are the stats thread
bool complete = false;
while (!complete) {
sleep(from_s(strat.stats_print_period_s));
// TODO: There's some data freeing that should really be done here
stats.run_time = now() - start_algo;
return stats;
printf("**********************\n");
float g_avg_gen_time = 0;
float g_avg_crossover_time = 0;
float g_avg_mutate_time = 0;
float g_avg_fitness_time = 0;
float g_avg_sorting_time = 0;
float g_avg_overhead_time = 0;
float g_progress_per = 0;
float g_best_fitness = strat.higher_fitness_is_better ? FLT_MIN : FLT_MAX;
complete = true;
for (int i = 0; i < stats.len; i++) {
lock(stats[i].m);
complete &= stats[i].done;
int end = stats[i].gen_time.end-1;
float gen_time = to_s(stats[i].gen_time[end]);
float crossover_time = to_s(stats[i].crossover_time[end]);
float mutate_time = to_s(stats[i].mutate_time[end]);
float fitness_time = to_s(stats[i].fitness_time[end]);
float sorting_time = to_s(stats[i].sorting_time[end]);
float progress_per = static_cast<float>(stats[i].gen) / static_cast<float>(strat.num_generations) * 100;
float best_score = back(stats[i].best_cell_fitness);
float overhead = max(0, gen_time - (crossover_time + mutate_time + fitness_time + sorting_time));
float overhead_per = overhead / gen_time * 100;
g_avg_gen_time += gen_time;
g_avg_crossover_time += crossover_time;
g_avg_mutate_time += mutate_time;
g_avg_fitness_time += fitness_time;
g_avg_sorting_time += sorting_time;
g_progress_per += progress_per;
g_best_fitness = better(strat.higher_fitness_is_better, best_score, g_best_fitness);
g_avg_overhead_time += overhead;
printf("%d, Progress %d/%d, Top: %.5e, Overhead Per: %.4f%%, Gen: %.4f, Overhead: %.4f, Cross: %.4f (s), Mutate: %.4f (s), Fitness: %.4f (s), Sorting: %.4f (s)\n", i, stats[i].gen, strat.num_generations, best_score, overhead_per, gen_time, overhead, crossover_time, mutate_time, fitness_time, sorting_time);
unlock(stats[i].m);
}
g_avg_gen_time /= stats.len;
g_avg_crossover_time /= stats.len;
g_avg_mutate_time /= stats.len;
g_avg_fitness_time /= stats.len;
g_avg_sorting_time /= stats.len;
g_progress_per /= stats.len;
g_avg_overhead_time /= stats.len;
float g_avg_overhead_per = g_avg_overhead_time / g_avg_gen_time * 100;
printf("GLOBAL, Progress %.1f%%, Top: %.5e, Overhead Per: %.4f%%, Gen: %.4f, Overhead: %.4f, Cross: %.4f (s), Mutate: %.4f (s), Fitness: %.4f (s), Sorting: %.4f (s)\n", g_progress_per, g_best_fitness, g_avg_overhead_per, g_avg_gen_time, g_avg_overhead_time, g_avg_crossover_time, g_avg_mutate_time, g_avg_fitness_time, g_avg_sorting_time);
if (complete) break;
}
for (int i = 0; i < threads.len; i++) {
join(threads[i]);
}
T best_cell;
// TODO: bad
float best_score = strat.higher_fitness_is_better ? FLT_MIN : FLT_MAX;
for (int i = 0; i < stats.len; i++) {
float score = back(stats[i].best_cell_fitness);
if (strat.higher_fitness_is_better ? score > best_score : score < best_score) {
best_cell = back(stats[i].best_cells);
best_score = score;
}
}
return best_cell;
}
template<class T> WorkQueue<T> make_work_queue(int len, int batch_size) {