more progress on drafting the worker thread model, get job batch func, etc...

2025-08-15 16:09:33 -05:00
parent 65c7ea743b
commit edda3761d1
2 changed files with 166 additions and 112 deletions
--- a/inc/genetic.h
+++ b/inc/genetic.h
@@ -1,15 +1,19 @@
 #include <vector>
 #include <span>
 namespace genetic {
 template <class T> struct ReadonlySpan;
 template <class T> struct Span;
 template <class T> struct Stats;
 template <class T> struct Strategy;
 template <class T> Stats<T> run(Strategy<T>);
 template <class T> struct Strategy {
  int num_threads; // Number of worker threads that will be evaluating cell
                   // fitness.
-  int num_retries; // Number of times worker threads will try to grab work pool
+  int batch_size;  // Number of cells a worker thread tries to work on in a row
-                   // lock before sleeping
+                   // before accessing/locking the work queue again.
  int batch_size;  // Number of cells a worker thread tries to evaluate in a row
                   // before locking the pool again.
  int num_cells;   // Size of the population pool
  int num_generations; // Number of times (epochs) to run the algorithm
  bool test_all; // Sets whether or not every cell is tested every generation
@@ -21,14 +25,17 @@ template <class T> struct Strategy {
  float crossover_mutation_chance; // Chance to mutate a child cell
  int crossover_parent_num;        // Number of unique high-scoring parents in a
                                   // crossover call.
-  int crossover_children_num;      // Number of children produced in a crossover
+  int crossover_children_num;      // Number of children to expect the user to
-  bool enable_mutation;  // Cells may be mutated before fitness evaluation
+                                   // produce in the crossover function.
  bool enable_mutation;            // Cells may be mutated
                                   // before fitness evaluation
  float mutation_chance; // Chance to mutate cells before fitness evaluation
  // User defined functions
  T (*make_default_cell)();
-  void (*mutate)(T &cell);
+  void (*mutate)(T &cell_to_modify);
-  void (*crossover)(const std::span<T> &parents, std::span<T> &out_children);
+  void (*crossover)(const ReadonlySpan<T> &parents,
                    const Span<T> &out_children);
  float (*fitness)(const T &cell);
 };
@@ -37,6 +44,18 @@ template <class T> struct Stats {
  std::vector<float> average_fitness;
 };
-template <class T> Stats<T> run(Strategy<T>);
+template <class T> struct ReadonlySpan {
  T *_data;
  int len;
  const T &operator[](int i);
 };
 template <class T> struct Span {
  T *_data;
  int len;
  T &operator[](int i);
 };
 } // namespace genetic
--- a/src/genetic.cpp
+++ b/src/genetic.cpp
@@ -1,125 +1,139 @@
 #include "genetic.h"
 #include "pthread.h"
-#include <queue>
+#include <optional>
 #include <variant>
 #include <vector>
 #define NUM_QUEUE_RETRIES 10
 using namespace std;
 // std::visit/std::variant overload pattern
 // See:
 // https://www.modernescpp.com/index.php/visiting-a-std-variant-with-the-overload-pattern/
 // You don't have to understand this, just use it :)
 template <typename... Ts> struct overload : Ts... {
  using Ts::operator()...;
 };
 template <class... Ts> overload(Ts...) -> overload<Ts...>;
 namespace genetic {
 template <class T> struct CellEntry {
  float score;
  T cell;
  bool stale;
 };
-template <class T> struct WorkEntry {
+template <class T> struct CrossoverJob {
-  const CellEntry<T> &cur;
+  const ReadonlySpan<T> &parents;
-  float &score;
+  const Span<T> &children_out;
 };
 template <class T> struct FitnessJob {
  const T &cell;
  float &result_out;
 };
 template <class T> struct WorkQueue {
-  std::vector<WorkEntry<T>> jobs;
+  variant<CrossoverJob<T>, FitnessJob<T>> *jobs;
-  int i;
+  int len;
  int read_i;
  int write_i;
  bool done_writing;
  pthread_mutex_t data_mutex;
  pthread_mutex_t gen_complete_mutex;
  pthread_mutex_t jobs_available_mutex;
  pthread_cond_t gen_complete_cond;
  pthread_cond_t jobs_available_cond;
 };
-static pthread_mutex_t data_mutex = PTHREAD_MUTEX_INITIALIZER;
+template <class T> WorkQueue<T> make_work_queue(int len) {
-
+  return {.jobs = (variant<FitnessJob<T>, CrossoverJob<T>> *)malloc(
-static pthread_mutex_t ready_mutex = PTHREAD_MUTEX_INITIALIZER;
+              sizeof(variant<FitnessJob<T>, CrossoverJob<T>>) * len),
-static pthread_cond_t ready_cond = PTHREAD_COND_INITIALIZER;
+          .len = len,
-
+          .read_i = 0,
-static pthread_mutex_t gen_complete_mutex = PTHREAD_MUTEX_INITIALIZER;
+          .write_i = 0,
-static pthread_cond_t gen_complete_cond = PTHREAD_COND_INITIALIZER;
+          .done_writing = false,
-
+          .data_mutex = PTHREAD_MUTEX_INITIALIZER,
-static pthread_mutex_t run_complete_mutex = PTHREAD_MUTEX_INITIALIZER;
+          .gen_complete_mutex = PTHREAD_MUTEX_INITIALIZER,
-static pthread_cond_t run_complete_cond = PTHREAD_COND_INITIALIZER;
+          .jobs_available_mutex = PTHREAD_MUTEX_INITIALIZER,
-
+          .gen_complete_cond = PTHREAD_COND_INITIALIZER,
-/* Thoughts on this approach
+          .jobs_available_cond = PTHREAD_COND_INITIALIZER};
- * The ideal implementation of a worker thread has them operating at maximum
+}
- * load with as little synchronization overhead as possible. i.e. The ideal
+
- * worker thread
+template <class T> struct JobBatch {
- *     1. Never waits for new work
+  ReadonlySpan<variant<CrossoverJob<T>, FitnessJob<T>>> jobs;
- *     2. Never spends time synchronizing with other worker threads
+  bool gen_complete;
- *
+};
- * Never is impossible, but we want to get as close as we can.
+
- *
+template <class T>
- * There are two extreme situations to consider
+optional<JobBatch<T>> get_job_batch(WorkQueue<T> &queue, int batch_size,
- *     1. Fitness functions with highly variable computation times
+                                    bool *stop_flag) {
- *     2. Fitness functions with identical computation times.
+  while (true) {
- *
+    for (int i = 0; i < NUM_QUEUE_RETRIES; i++) {
- * Most applications that use this library will fall into the second
+      if (queue.read_i < queue.write_i &&
- * category.
+          pthread_mutex_trylock(&queue.data_mutex)) {
- *
+        JobBatch<T> res;
- * In the highly-variable computation time case, it's useful for worker threads
+        res.jobs._data = &queue._jobs[queue.read_i];
- * to operate on 1 work entry at a time. Imagine a scenario with 2 threads, each
+        int span_size = min(batch_size, queue.write_i - queue.read_i);
- * of which claims half the work to do. If thread A completes all of its work
+        res.jobs.len = span_size;
- * quickly, it goes to sleep while thread B slogs away on its harder-to-compute
+
- * fitness jobs. However, if both threads only claim 1 work entry at a time,
+        queue.read_i += span_size;
- * thread A can immediately claim new jobs after it completes its current one.
+        res.gen_complete = queue.done_writing && queue.read_i == queue.write_i;
- * Thread B can toil away, but little time is lost since thread A remains
+
- * productive.
+        pthread_mutex_unlock(&queue.data_mutex);
- *
+        return res;
- * In the highly consistent computation time case, it's ideal for each
+      }
- * thread to claim an equal share of the jobs (as this minimizes time spent
+    }
- * synchronizing access to the job pool). Give each thread its set of work once
+    pthread_mutex_lock(&queue.jobs_available_mutex);
- * and let them have at it instead of each thread constantly locking/waiting
+    pthread_cond_wait(queue.jobs_available_cond, &queue.jobs_available_mutex);
- * on the job queue.
+    if (stop_flag)
- *
+      return {};
- * I take a hybrid approach. Users can specify a "batch size". Worker threads
+  }
- * will bite off jobs in chunks and complete them before locking
+}
- * the job pool to grab another chunk. The user should choose a batch size close
+
- * to 1 if their fitness function compute time is highly variable and closer to
+template <class T> struct WorkerThreadArgs {
- * num_cells / num_threads if computation time is consistent. Users should
+  Strategy<T> &strat;
- * experiment with a batch size that works well for their problem.
+  WorkQueue<T> &queue;
- *
+  bool *stop_flag;
- * Worth mentioning this avoiding synchronization is irrelevant once computation
+};
- * time >>> synchronization time.
+
- *
+template <class T> void *worker(void *args) {
- * There might be room for dynamic batch size modification, but I don't expect
+  WorkerThreadArgs<T> *work_args = (WorkerThreadArgs<T> *)args;
- * to pursue this feature until the library is more mature (and I've run out of
+  Strategy<T> &strat = work_args->strat;
- * cooler things to do).
+  WorkQueue<T> &queue = work_args->queue;
- *
+  bool *stop_flag = work_args->stop_flag;
- */
+
-template <class T>
+  auto JobDispatcher = overload{
-void worker(std::queue<WorkEntry<T>> &fitness_queue, int batch_size,
+      [strat](FitnessJob<T> fj) { fj.result_out = strat.fitness(fj.cell); },
-            int num_retries) {
+      [strat](CrossoverJob<T> cj) {
-  int retries = 0;
+        strat.crossover(cj.parents, cj.children_out);
-  std::vector<WorkEntry<T>> batch;
+      },
-  bool gen_is_finished;
+  };
-  while (true) {
+
-    gen_is_finished = false;
+  while (true) {
-    if (pthread_mutex_trylock(&data_mutex)) {
+    auto batch = get_job_batch(queue, strat.batch_size, stop_flag);
-      retries = 0;
+    if (!batch || *stop_flag)
-      for (int i = 0; i < batch_size; i++) {
+      return NULL;
-        if (fitness_queue.empty()) {
+
-          gen_is_finished = true;
+    // Do the actual work
-          break;
+    for (int i = 0; i < batch->jobs.len; i++) {
-        }
+      visit(JobDispatcher, batch->jobs[i]);
-        batch.push_back(fitness_queue.front());
+    }
-        fitness_queue.pop();
+
-      }
+    if (batch->gen_complete) {
-      pthread_mutex_unlock(&data_mutex);
+      pthread_cond_signal(&queue.gen_complete_cond, &queue.gen_complete_mutex);
-    } else {
+    }
-      retries++;
+  }
    }
    if (gen_is_finished) {
      pthread_cond_signal(&gen_complete_cond, &gen_complete_mutex);
    }
    if (retries > num_retries) {
      pthread_mutex_lock(&ready_mutex);
      pthread_cond_wait(&ready_cond, &ready_mutex);
      retries = 0;
    }
  }
  pthread_mutex_lock(&data_mutex);
 }
 // Definitions
 template <class T> Stats<T> run(Strategy<T> strat) {
  Stats<T> stats;
  WorkQueue<T> queue = make_work_queue<T>(strat.num_cells);
-  std::queue<WorkEntry<T>> fitness_queue;
+  vector<CellEntry<T>> cells_a, cells_b;
  std::vector<CellEntry<T>> cells_a, cells_b;
  for (int i = 0; i < strat.num_cells; i++) {
    T cell = strat.make_default_cell();
    cells_a.push_back({0, cell, true});
@@ -129,11 +143,32 @@ template <class T> Stats<T> run(Strategy<T> strat) {
  std::vector<CellEntry<T>> &cur_cells = cells_a;
  std::vector<CellEntry<T>> &next_cells = cells_b;
-  for (int i = 0; i < strat.num_generations; i++) {
+  bool stop_flag = false;
  WorkerThreadArgs<T> args = {
      .strat = strat, .queue = queue, .stop_flag = &stop_flag};
  // spawn worker threads
  pthread_t threads[strat.num_threads];
  for (int i = 0; i < strat.num_threads; i++) {
    pthread_create(&threads[i], NULL, worker<T>, (void *)args);
  }
  for (int i = 0; i < strat.num_generations; i++) {
    // generate fitness jobs
    // wait for fitness jobs to complete
    // sort cells on performance
    // generate crossover jobs
    cur_cells = cur_cells == cells_a ? cells_b : cells_a;
    next_cells = cur_cells == cells_a ? cells_b : cells_a;
  }
  // stop worker threads
  stop_flag = true;
  pthread_cond_broadcast(queue.jobs_available_cond);
  for (int i = 0; i < strat.num_threads; i++) {
    pthread_join(threads[i], NULL);
  }
 }
 } // namespace genetic