Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
380 changes: 380 additions & 0 deletions opencog/opencl/atomspace/gpu-cosine.cl
Original file line number Diff line number Diff line change
@@ -0,0 +1,380 @@
/*
* gpu-cosine.cl -- Cosine similarity and candidate generation on GPU
*
* Computes pairwise cosine similarity between words based on their
* section (disjunct) vectors, entirely on GPU-resident data.
*
* Pipeline:
* 1. compute_word_norms — ||v||² for each word
* 2. build_disjunct_chains — reverse index: djh → linked list of sections
* 3. accumulate_dot_products — walk chains, accumulate dot(w1,w2)
* 4. compute_cosines — cosine = dot / (||v1|| × ||v2||)
* 5. filter_candidates — compact output above threshold
*
* No CPU↔GPU data transfer between stages.
*
* Requires capacity macros (set via -D flags):
* DJH_HT_CAPACITY — disjunct reverse index hash table size (power of 2)
* CANDIDATE_CAPACITY — max candidate pairs
* CANDIDATE_HT_CAPACITY — candidate pair hash table size (power of 2)
*
* Appended after gpu-hashtable.cl, gpu-atomspace.cl at load time.
* (gpu-sections.cl optional — only needed for full pipeline tests)
*/

/* ═══════════════════════════════════════════════════════════════
* STEP 1: COMPUTE WORD NORMS
*
* One thread per section. Atomically adds count² to the word's
* norm accumulator. After all sections processed:
* word_norm_sq[w] = Σ_d count(w,d)²
*
* where d ranges over all disjuncts for word w.
* ═══════════════════════════════════════════════════════════════ */

__kernel void compute_word_norms(
__global const uint* sec_word,
__global const double* sec_count,
__global volatile double* word_norm_sq,
const uint num_sections)
{
uint tid = get_global_id(0);
if (tid >= num_sections) return;

double count = sec_count[tid];
if (count < 0.5) return;

uint word = sec_word[tid];
atomic_add_double(&word_norm_sq[word], count * count);
}

/* ═══════════════════════════════════════════════════════════════
* STEP 2: BUILD DISJUNCT REVERSE INDEX (linked list chains)
*
* For each unique disjunct_hash, build a linked list of all
* sections that share that disjunct. Uses lock-free atomic
* prepend to a per-disjunct chain.
*
* Data structures:
* djh_ht_keys[DJH_HT_CAPACITY] — disjunct hash values
* djh_ht_values[DJH_HT_CAPACITY] — chain head (section index)
* sec_chain_next[SECTION_CAPACITY] — per-section next pointer
*
* Chain traversal: start at djh_ht_values[slot], follow
* sec_chain_next[] until HT_EMPTY_VALUE (end of chain).
* ═══════════════════════════════════════════════════════════════ */

__kernel void build_disjunct_chains(
__global const ulong* sec_disjunct_hash,
__global const double* sec_count,
__global volatile ulong* djh_ht_keys,
__global volatile uint* djh_ht_values,
__global uint* sec_chain_next,
const uint num_sections)
{
uint tid = get_global_id(0);
if (tid >= num_sections) return;

/* Skip empty sections */
if (sec_count[tid] < 0.5) {
sec_chain_next[tid] = HT_EMPTY_VALUE;
return;
}

ulong djh = sec_disjunct_hash[tid];
if (djh == HT_EMPTY_KEY) {
sec_chain_next[tid] = HT_EMPTY_VALUE;
return;
}

ulong cap = DJH_HT_CAPACITY;
ulong mask = cap - 1;
ulong slot = ht_hash(djh) & mask;

for (uint probe = 0; probe < HT_MAX_PROBES; probe++)
{
ulong prev = atom_cmpxchg(&djh_ht_keys[slot], HT_EMPTY_KEY, djh);

if (prev == HT_EMPTY_KEY || prev == djh)
{
/* Atomically prepend this section to the chain.
* atomic_xchg returns the old head (initially HT_EMPTY_VALUE
* for a fresh slot, which serves as the end-of-chain sentinel).
* Lock-free: concurrent prepends produce a valid chain. */
uint old_head = atomic_xchg(&djh_ht_values[slot], tid);
sec_chain_next[tid] = old_head;
return;
}

slot = (slot + 1) & mask;
}

/* Hash table full — orphan this section */
sec_chain_next[tid] = HT_EMPTY_VALUE;
}

/* ═══════════════════════════════════════════════════════════════
* STEP 3: ACCUMULATE DOT PRODUCTS
*
* One thread per section. For each section (word_i, djh_i, count_i):
* - Look up djh_i in the disjunct reverse index
* - Pre-count chain length; skip if > MAX_CHAIN_LEN
* (very common disjuncts are uninformative, like stopwords)
* - Walk the chain of sections sharing this disjunct
* - For each other section (word_j, djh_i, count_j) where word_j != word_i:
* - Find-or-create candidate pair (word_i, word_j)
* - Atomically add count_i × count_j to the candidate's dot product
*
* Uses canonical ordering (word_i < word_j) to avoid double-counting:
* each shared disjunct contributes exactly once to each pair's dot product.
*
* Candidate pairs are stored in a hash table + pool (same pattern
* as atom pools from gpu-atomspace.cl).
* ═══════════════════════════════════════════════════════════════ */

#ifndef MAX_CHAIN_LEN
#define MAX_CHAIN_LEN 200
#endif

__kernel void accumulate_dot_products(
/* Section data */
__global const uint* sec_word,
__global const ulong* sec_disjunct_hash,
__global const double* sec_count,
/* Disjunct reverse index */
__global const ulong* djh_ht_keys,
__global const uint* djh_ht_values,
__global const uint* sec_chain_next,
/* Candidate hash table */
__global volatile ulong* cand_ht_keys,
__global volatile uint* cand_ht_values,
/* Candidate pool SoA */
__global uint* cand_word_a,
__global uint* cand_word_b,
__global volatile double* cand_dot,
__global volatile uint* cand_next_free,
/* Size */
const uint num_sections)
{
uint tid = get_global_id(0);
if (tid >= num_sections) return;

uint my_word = sec_word[tid];
double my_count = sec_count[tid];
if (my_count < 0.5) return;

ulong my_djh = sec_disjunct_hash[tid];
if (my_djh == HT_EMPTY_KEY) return;

/* ── Look up disjunct in reverse index ── */

ulong cap = DJH_HT_CAPACITY;
ulong mask = cap - 1;
ulong slot = ht_hash(my_djh) & mask;
uint chain_head = HT_EMPTY_VALUE;

for (uint probe = 0; probe < HT_MAX_PROBES; probe++)
{
ulong k = djh_ht_keys[slot];
if (k == my_djh) {
chain_head = djh_ht_values[slot];
break;
}
if (k == HT_EMPTY_KEY) break;
slot = (slot + 1) & mask;
}

if (chain_head == HT_EMPTY_VALUE) return;

/* ── Pre-count chain length — skip overly common disjuncts ── */
/* Disjuncts appearing in > MAX_CHAIN_LEN sections are too common
* to be discriminative (like stopwords in information retrieval).
* Skipping them entirely avoids O(K²) explosion for popular chains
* and matches the CPU reverse-index filter. */

uint chain_len = 0;
uint pre = chain_head;
while (pre != HT_EMPTY_VALUE && chain_len <= MAX_CHAIN_LEN)
{
chain_len++;
pre = sec_chain_next[pre];
}
if (chain_len > MAX_CHAIN_LEN) return;

/* ── Walk chain, accumulate dot products ── */

uint cur = chain_head;

while (cur != HT_EMPTY_VALUE)
{
uint other_word = sec_word[cur];
double other_count = sec_count[cur];

/* Only accumulate once per pair (canonical: my_word < other_word) */
if (other_word != my_word && my_word < other_word && other_count >= 0.5)
{
/* ── Find or create candidate pair ── */

uint lo = my_word; /* already < other_word */
uint hi = other_word;
ulong cand_key = ((ulong)lo << 32) | (ulong)hi;

ulong c_cap = CANDIDATE_HT_CAPACITY;
ulong c_mask = c_cap - 1;
ulong c_slot = ht_hash(cand_key) & c_mask;
uint cand_idx = HT_EMPTY_VALUE;

for (uint p = 0; p < HT_MAX_PROBES; p++)
{
ulong prev = atom_cmpxchg(
&cand_ht_keys[c_slot], HT_EMPTY_KEY, cand_key);

if (prev == HT_EMPTY_KEY)
{
/* New candidate — allocate from pool */
uint idx = atomic_add(cand_next_free, 1U);
if (idx < CANDIDATE_CAPACITY) {
cand_word_a[idx] = lo;
cand_word_b[idx] = hi;
cand_dot[idx] = 0.0;
mem_fence(CLK_GLOBAL_MEM_FENCE);
cand_ht_values[c_slot] = idx;
cand_idx = idx;
}
break;
}
if (prev == cand_key)
{
/* Existing candidate — spin for value */
uint val = cand_ht_values[c_slot];
while (val == HT_EMPTY_VALUE) {
val = cand_ht_values[c_slot];
}
cand_idx = val;
break;
}

c_slot = (c_slot + 1) & c_mask;
}

if (cand_idx != HT_EMPTY_VALUE)
{
atomic_add_double(&cand_dot[cand_idx],
my_count * other_count);
}
}

cur = sec_chain_next[cur];
}
}

/* ═══════════════════════════════════════════════════════════════
* STEP 4: COMPUTE COSINES
*
* One thread per candidate. Converts accumulated dot products
* to cosine similarity using pre-computed word norms.
*
* cosine(w1, w2) = dot(w1, w2) / (||w1|| × ||w2||)
* ═══════════════════════════════════════════════════════════════ */

/* MIN_NORM_SQ filters rare words: a word with N sections each count=1
* has norm_sq = N. Setting MIN_NORM_SQ = 5.0 requires >= 5 section types.
* This prevents hapax legomena from getting cosine=1.0 by chance. */
#ifndef MIN_NORM_SQ
#define MIN_NORM_SQ 50.0
#endif

__kernel void compute_cosines(
__global const uint* cand_word_a,
__global const uint* cand_word_b,
__global const double* cand_dot,
__global double* cand_cosine,
__global const double* word_norm_sq,
const uint num_candidates)
{
uint tid = get_global_id(0);
if (tid >= num_candidates) return;

uint wa = cand_word_a[tid];
uint wb = cand_word_b[tid];
double dot = cand_dot[tid];

double na = word_norm_sq[wa];
double nb = word_norm_sq[wb];

/* Skip rare words — not enough data for meaningful cosine */
if (na < MIN_NORM_SQ || nb < MIN_NORM_SQ) {
cand_cosine[tid] = 0.0;
return;
}

double denom = sqrt(na) * sqrt(nb);
double cos_val = (denom > 1e-10) ? (dot / denom) : 0.0;

/* Clamp to valid range — prevents transient GPU state issues */
if (cos_val > 1.0) cos_val = 1.0;
if (cos_val < -1.0) cos_val = -1.0;

cand_cosine[tid] = cos_val;
}

/* ═══════════════════════════════════════════════════════════════
* STEP 5: FILTER CANDIDATES
*
* Compact candidates above a cosine threshold into contiguous
* output arrays. Returns (word_a, word_b, cosine) for each
* passing pair.
*
* out_count must be initialized to 0 before kernel launch.
* ═══════════════════════════════════════════════════════════════ */

__kernel void filter_candidates(
__global const uint* cand_word_a,
__global const uint* cand_word_b,
__global const double* cand_cosine,
const uint num_candidates,
const double threshold,
__global uint* out_word_a,
__global uint* out_word_b,
__global double* out_cosine,
__global volatile uint* out_count,
const uint max_output)
{
uint tid = get_global_id(0);
if (tid >= num_candidates) return;

double cos_val = cand_cosine[tid];
if (cos_val > threshold)
{
uint idx = atomic_add(out_count, 1U);
if (idx < max_output) {
out_word_a[idx] = cand_word_a[tid];
out_word_b[idx] = cand_word_b[tid];
out_cosine[idx] = cos_val;
}
}
}

/* ═══════════════════════════════════════════════════════════════
* READBACK: Dump candidate data for verification
* ═══════════════════════════════════════════════════════════════ */

__kernel void read_candidates(
__global const uint* cand_word_a,
__global const uint* cand_word_b,
__global const double* cand_dot,
__global const double* cand_cosine,
__global uint* out_word_a,
__global uint* out_word_b,
__global double* out_dot,
__global double* out_cosine,
const uint num_candidates)
{
uint tid = get_global_id(0);
if (tid >= num_candidates) return;

out_word_a[tid] = cand_word_a[tid];
out_word_b[tid] = cand_word_b[tid];
out_dot[tid] = cand_dot[tid];
out_cosine[tid] = cand_cosine[tid];
}
Loading