Some English notes on the purpose of batch sizes, plus the beginning of a worker-thread implementation.
This commit is contained in:
@@ -3,13 +3,17 @@
|
|||||||
namespace genetic {
|
namespace genetic {
|
||||||
|
|
||||||
template <class T> struct Strategy {
|
template <class T> struct Strategy {
|
||||||
// The recommended number of threads is <= number of cores on your pc.
|
int num_threads; // Number of worker threads that will be evaluating cell
|
||||||
// Set this to -1 to use the default value (number of cores - 1)
|
// fitness.
|
||||||
int num_threads; // Number of worker threads that will be evaluating cell fitness
|
int num_retries; // Number of times worker threads will try to grab work pool
|
||||||
int num_cells; // Size of the population pool
|
// lock before sleeping
|
||||||
|
int batch_size; // Number of cells a worker thread tries to evaluate in a row
|
||||||
|
// before locking the pool again. 1 tends to be fine
|
||||||
|
int num_cells; // Size of the population pool
|
||||||
int num_generations; // Number of times (epochs) to run the algorithm
|
int num_generations; // Number of times (epochs) to run the algorithm
|
||||||
bool test_all; // Sets whether or not every cell is tested every generation
|
bool test_all; // Sets whether or not every cell is tested every generation
|
||||||
float test_chance; // Chance to test any given cell's fitness. Relevant only if test_all is false.
|
float test_chance; // Chance to test any given cell's fitness. Relevant only
|
||||||
|
// if test_all is false.
|
||||||
|
|
||||||
// User defined functions
|
// User defined functions
|
||||||
T (*make_default_cell)();
|
T (*make_default_cell)();
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
#include "genetic.h"
|
#include "genetic.h"
|
||||||
#include "pthread.h"
|
#include "pthread.h"
|
||||||
|
#include <algorithm>
|
||||||
#include <queue>
|
#include <queue>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
@@ -12,11 +13,103 @@ template <class T> struct CellEntry {
|
|||||||
};
|
};
|
||||||
|
|
||||||
template <class T> struct WorkEntry {
|
template <class T> struct WorkEntry {
|
||||||
const std::vector<CellEntry<T>> &cur;
|
const CellEntry<T> &cur;
|
||||||
std::vector<CellEntry<T>> &next;
|
float &score;
|
||||||
int cur_i;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
static pthread_mutex_t data_mutex = PTHREAD_MUTEX_INITIALIZER;
|
||||||
|
|
||||||
|
static pthread_mutex_t ready_mutex = PTHREAD_MUTEX_INITIALIZER;
|
||||||
|
static pthread_cond_t ready_cond = PTHREAD_COND_INITIALIZER;
|
||||||
|
|
||||||
|
static pthread_mutex_t gen_complete_mutex = PTHREAD_MUTEX_INITIALIZER;
|
||||||
|
static pthread_cond_t gen_complete_cond = PTHREAD_COND_INITIALIZER;
|
||||||
|
|
||||||
|
static pthread_mutex_t run_complete_mutex = PTHREAD_MUTEX_INITIALIZER;
|
||||||
|
static pthread_cond_t run_complete_cond = PTHREAD_COND_INITIALIZER;
|
||||||
|
|
||||||
|
/* Thoughts on this approach
|
||||||
|
* The ideal implementation of a worker thread has them operating at maximum
|
||||||
|
* load with as little synchronization overhead as possible. i.e. The ideal
|
||||||
|
* worker thread
|
||||||
|
* 1. Never waits for new work
|
||||||
|
* 2. Never spends time synchronizing with other worker threads
|
||||||
|
*
|
||||||
|
* Never is impossible, but we want to get as close as we can.
|
||||||
|
*
|
||||||
|
* There are two extreme situations to consider
|
||||||
|
* 1. Fitness functions with highly variable computation times
|
||||||
|
* 2. Fitness functions with identical computation times.
|
||||||
|
*
|
||||||
|
* Most applications that use this library will fall into the second
|
||||||
|
* category.
|
||||||
|
*
|
||||||
|
* In the highly-variable computation time case, it's useful for worker threads
|
||||||
|
* to operate on 1 work entry at a time. Imagine a scenario with 2 threads, each
|
||||||
|
* of which claims half the work to do. If thread A completes all of its work
|
||||||
|
* quickly, it goes to sleep while thread B slogs away on its harder-to-compute
|
||||||
|
* fitness jobs. However, if both threads only claim 1 work entry at a time,
|
||||||
|
* thread A can immediately claim new jobs after it completes its current one.
|
||||||
|
* Thread B can toil away, but little time is lost since thread A remains
|
||||||
|
* productive.
|
||||||
|
*
|
||||||
|
* In the highly consistent computation time case, it's ideal for each
|
||||||
|
* thread to claim an equal share of the jobs (as this minimizes time spent
|
||||||
|
* synchronizing access to the job pool). Give each thread its set of work once
|
||||||
|
* and let them have at it instead of each thread constantly locking/waiting
|
||||||
|
* on the job queue.
|
||||||
|
*
|
||||||
|
* I take a hybrid approach. Users can specify a "batch size". Worker threads
|
||||||
|
* will bite off jobs in chunks and complete them before locking
|
||||||
|
* the job pool again. The user can choose a batch size close to 1 if
|
||||||
|
* their fitness function compute time is highly variable, and closer to
|
||||||
|
* num_cells / num_threads if computation time is consistent. Users should
|
||||||
|
* experiment with a batch size that works well for their problem.
|
||||||
|
*
|
||||||
|
* Worth mentioning this optimization work is irrelevant once computation time
|
||||||
|
* >>> synchronization time.
|
||||||
|
*
|
||||||
|
* There might be room for dynamic batch size modification, but I don't expect
|
||||||
|
* to pursue this feature until the library is more mature (and I've run out of
|
||||||
|
* cooler things to do).
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
template <class T>
|
||||||
|
void worker(std::queue<WorkEntry<T>> &fitness_queue, int batch_size,
|
||||||
|
int num_retries) {
|
||||||
|
int retries = 0;
|
||||||
|
std::vector<WorkEntry<T>> batch;
|
||||||
|
bool gen_is_finished;
|
||||||
|
while (true) {
|
||||||
|
gen_is_finished = false;
|
||||||
|
if (pthread_mutex_trylock(&data_mutex)) {
|
||||||
|
retries = 0;
|
||||||
|
for (int i = 0; i < batch_size; i++) {
|
||||||
|
if (fitness_queue.empty()) {
|
||||||
|
gen_is_finished = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
batch.push_back(fitness_queue.front());
|
||||||
|
fitness_queue.pop();
|
||||||
|
}
|
||||||
|
pthread_mutex_unlock(&data_mutex);
|
||||||
|
} else {
|
||||||
|
retries++;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (gen_is_finished) {
|
||||||
|
pthread_cond_signal(&gen_complete_cond, &gen_complete_mutex);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (retries > num_retries) {
|
||||||
|
pthread_mutex_lock(&ready_mutex);
|
||||||
|
pthread_cond_wait(&ready_cond, &ready_mutex);
|
||||||
|
retries = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
pthread_mutex_lock(&data_mutex);
|
||||||
|
}
|
||||||
|
|
||||||
// Definitions
|
// Definitions
|
||||||
template <class T> Stats<T> run(Strategy<T> strat) {
|
template <class T> Stats<T> run(Strategy<T> strat) {
|
||||||
Stats<T> stats;
|
Stats<T> stats;
|
||||||
|
|||||||
Reference in New Issue
Block a user