/* SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2017,2018 HXT-semitech Corporation. * Copyright (c) 2007-2009 Kip Macy kmacy@freebsd.org * All rights reserved. * Derived from FreeBSD's bufring.h * Used as BSD-3 Licensed with permission from Kip Macy. */ #ifndef SRC_EXT_JRING_ELEM_PRIVATE_H_ #define SRC_EXT_JRING_ELEM_PRIVATE_H_ #ifdef __cplusplus extern "C" { #endif #ifndef likely #define likely(x) __builtin_expect((x), 1) #endif #ifndef unlikely #define unlikely(x) __builtin_expect((x), 0) #endif typedef struct { union { uint64_t val[2]; }; } __attribute__((aligned(16))) j_int128_t; static inline __attribute__((always_inline)) void __jring_enqueue_elems_32( struct jring *r, const uint32_t size, uint32_t idx, const void *obj_table, uint32_t n) { unsigned int i; uint32_t *ring = (uint32_t *)&r[1]; const uint32_t *obj = (const uint32_t *)obj_table; if (likely(idx + n < size)) { for (i = 0; i < (n & ~0x7); i += 8, idx += 8) { ring[idx] = obj[i]; ring[idx + 1] = obj[i + 1]; ring[idx + 2] = obj[i + 2]; ring[idx + 3] = obj[i + 3]; ring[idx + 4] = obj[i + 4]; ring[idx + 5] = obj[i + 5]; ring[idx + 6] = obj[i + 6]; ring[idx + 7] = obj[i + 7]; } switch (n & 0x7) { case 7: ring[idx++] = obj[i++]; /* fallthrough */ case 6: ring[idx++] = obj[i++]; /* fallthrough */ case 5: ring[idx++] = obj[i++]; /* fallthrough */ case 4: ring[idx++] = obj[i++]; /* fallthrough */ case 3: ring[idx++] = obj[i++]; /* fallthrough */ case 2: ring[idx++] = obj[i++]; /* fallthrough */ case 1: ring[idx++] = obj[i++]; /* fallthrough */ } } else { for (i = 0; idx < size; i++, idx++) ring[idx] = obj[i]; /* Start at the beginning */ for (idx = 0; i < n; i++, idx++) ring[idx] = obj[i]; } } static __attribute__((always_inline)) inline void __jring_enqueue_elems_64( struct jring *r, uint32_t prod_head, const void *obj_table, uint32_t n) { unsigned int i; const uint32_t size = r->size; uint32_t idx = prod_head & r->mask; uint64_t *ring = (uint64_t *)&r[1]; const uint64_t *obj = (const uint64_t *)obj_table; if (likely(idx + n < size)) { for (i = 0; i < (n & ~0x3); i += 4, idx += 4) { ring[idx] = obj[i]; ring[idx + 1] = obj[i + 1]; ring[idx + 2] = obj[i + 2]; ring[idx + 3] = obj[i + 3]; } switch (n & 0x3) { case 3: ring[idx++] = obj[i++]; /* fallthrough */ case 2: ring[idx++] = obj[i++]; /* fallthrough */ case 1: ring[idx++] = obj[i++]; } } else { for (i = 0; idx < size; i++, idx++) ring[idx] = obj[i]; /* Start at the beginning */ for (idx = 0; i < n; i++, idx++) ring[idx] = obj[i]; } } static inline __attribute__((always_inline)) void __jring_enqueue_elems_128( struct jring *r, uint32_t prod_head, const void *obj_table, uint32_t n) { unsigned int i; const uint32_t size = r->size; uint32_t idx = prod_head & r->mask; j_int128_t *ring = (j_int128_t *)&r[1]; const j_int128_t *obj = (const j_int128_t *)obj_table; if (likely(idx + n < size)) { for (i = 0; i < (n & ~0x1); i += 2, idx += 2) memcpy((void *)(ring + idx), (const void *)(obj + i), 32); switch (n & 0x1) { case 1: memcpy((void *)(ring + idx), (const void *)(obj + i), 16); } } else { for (i = 0; idx < size; i++, idx++) memcpy((void *)(ring + idx), (const void *)(obj + i), 16); /* Start at the beginning */ for (idx = 0; i < n; i++, idx++) memcpy((void *)(ring + idx), (const void *)(obj + i), 16); } } /* the actual enqueue of elements on the ring. * Placed here since identical code needed in both * single and multi producer enqueue functions. */ static inline __attribute__((always_inline)) void __jring_enqueue_elems( struct jring *r, uint32_t prod_head, const void *obj_table, uint32_t esize, uint32_t num) { /* 8B and 16B copies implemented individually to retain * the current performance. */ if (esize == 8) { __jring_enqueue_elems_64(r, prod_head, obj_table, num); } else if (esize == 16) { __jring_enqueue_elems_128(r, prod_head, obj_table, num); } else { uint32_t idx, scale, nr_idx, nr_num, nr_size; /* Normalize to uint32_t */ scale = esize / sizeof(uint32_t); nr_num = num * scale; idx = prod_head & r->mask; nr_idx = idx * scale; nr_size = r->size * scale; __jring_enqueue_elems_32(r, nr_size, nr_idx, obj_table, nr_num); } } static inline __attribute__((always_inline)) void __jring_dequeue_elems_32( struct jring *r, const uint32_t size, uint32_t idx, void *obj_table, uint32_t n) { unsigned int i; uint32_t *ring = (uint32_t *)&r[1]; uint32_t *obj = (uint32_t *)obj_table; if (likely(idx + n < size)) { for (i = 0; i < (n & ~0x7); i += 8, idx += 8) { obj[i] = ring[idx]; obj[i + 1] = ring[idx + 1]; obj[i + 2] = ring[idx + 2]; obj[i + 3] = ring[idx + 3]; obj[i + 4] = ring[idx + 4]; obj[i + 5] = ring[idx + 5]; obj[i + 6] = ring[idx + 6]; obj[i + 7] = ring[idx + 7]; } switch (n & 0x7) { case 7: obj[i++] = ring[idx++]; /* fallthrough */ case 6: obj[i++] = ring[idx++]; /* fallthrough */ case 5: obj[i++] = ring[idx++]; /* fallthrough */ case 4: obj[i++] = ring[idx++]; /* fallthrough */ case 3: obj[i++] = ring[idx++]; /* fallthrough */ case 2: obj[i++] = ring[idx++]; /* fallthrough */ case 1: obj[i++] = ring[idx++]; /* fallthrough */ } } else { for (i = 0; idx < size; i++, idx++) obj[i] = ring[idx]; /* Start at the beginning */ for (idx = 0; i < n; i++, idx++) obj[i] = ring[idx]; } } static inline __attribute__((always_inline)) void __jring_dequeue_elems_64( struct jring *r, uint32_t prod_head, void *obj_table, uint32_t n) { unsigned int i; const uint32_t size = r->size; uint32_t idx = prod_head & r->mask; uint64_t *ring = (uint64_t *)&r[1]; uint64_t *obj = (uint64_t *)obj_table; if (likely(idx + n < size)) { for (i = 0; i < (n & ~0x3); i += 4, idx += 4) { obj[i] = ring[idx]; obj[i + 1] = ring[idx + 1]; obj[i + 2] = ring[idx + 2]; obj[i + 3] = ring[idx + 3]; } switch (n & 0x3) { case 3: obj[i++] = ring[idx++]; /* fallthrough */ case 2: obj[i++] = ring[idx++]; /* fallthrough */ case 1: obj[i++] = ring[idx++]; /* fallthrough */ } } else { for (i = 0; idx < size; i++, idx++) obj[i] = ring[idx]; /* Start at the beginning */ for (idx = 0; i < n; i++, idx++) obj[i] = ring[idx]; } } static inline __attribute__((always_inline)) void __jring_dequeue_elems_128( struct jring *r, uint32_t prod_head, void *obj_table, uint32_t n) { unsigned int i; const uint32_t size = r->size; uint32_t idx = prod_head & r->mask; j_int128_t *ring = (j_int128_t *)&r[1]; j_int128_t *obj = (j_int128_t *)obj_table; if (likely(idx + n < size)) { for (i = 0; i < (n & ~0x1); i += 2, idx += 2) memcpy((void *)(obj + i), (void *)(ring + idx), 32); switch (n & 0x1) { case 1: memcpy((void *)(obj + i), (void *)(ring + idx), 16); } } else { for (i = 0; idx < size; i++, idx++) memcpy((void *)(obj + i), (void *)(ring + idx), 16); /* Start at the beginning */ for (idx = 0; i < n; i++, idx++) memcpy((void *)(obj + i), (void *)(ring + idx), 16); } } /* the actual dequeue of elements from the ring. * Placed here since identical code needed in both * single and multi producer enqueue functions. */ static inline __attribute__((always_inline)) void __jring_dequeue_elems( struct jring *r, uint32_t cons_head, void *obj_table, uint32_t esize, uint32_t num) { /* 8B and 16B copies implemented individually to retain * the current performance. */ if (esize == 8) { __jring_dequeue_elems_64(r, cons_head, obj_table, num); } else if (esize == 16) { __jring_dequeue_elems_128(r, cons_head, obj_table, num); } else { uint32_t idx, scale, nr_idx, nr_num, nr_size; /* Normalize to uint32_t */ scale = esize / sizeof(uint32_t); nr_num = num * scale; idx = cons_head & r->mask; nr_idx = idx * scale; nr_size = r->size * scale; __jring_dequeue_elems_32(r, nr_size, nr_idx, obj_table, nr_num); } } static inline __attribute__((always_inline)) void __jring_wait_until_equal_32( volatile uint32_t *addr, uint32_t expected, int memorder) { // assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED); while (__atomic_load_n(addr, memorder) != expected) __asm__("pause;"); } static inline __attribute__((always_inline)) void __jring_update_tail( struct jring_headtail *ht, uint32_t old_val, uint32_t new_val, uint32_t single, __attribute__((unused)) uint32_t enqueue) { /* * If there are other enqueues/dequeues in progress that preceded us, * we need to wait for them to complete */ if (!single) __jring_wait_until_equal_32(&ht->tail, old_val, __ATOMIC_RELAXED); __atomic_store_n(&ht->tail, new_val, __ATOMIC_RELEASE); } /** * @internal This function updates the producer head for enqueue * * @param r * A pointer to the ring structure * @param is_sp * Indicates whether multi-producer path is needed or not * @param n * The number of elements we will want to enqueue, i.e. how far should the * head be moved * @param behavior * JRING_QUEUE_FIXED: Enqueue a fixed number of items from a ring * JRING_QUEUE_VARIABLE: Enqueue as many items as possible from ring * @param old_head * Returns head value as it was before the move, i.e. where enqueue starts * @param new_head * Returns the current/new head value i.e. where enqueue finishes * @param free_entries * Returns the amount of free space in the ring BEFORE head was moved * @return * Actual number of objects enqueued. * If behavior == JRING_QUEUE_FIXED, this will be 0 or n only. */ static inline __attribute__((always_inline)) unsigned int __jring_move_prod_head(struct jring *r, unsigned int is_sp, unsigned int n, enum jring_queue_behavior behavior, uint32_t *old_head, uint32_t *new_head, uint32_t *free_entries) { const uint32_t capacity = r->capacity; uint32_t cons_tail; unsigned int max = n; int success; *old_head = __atomic_load_n(&r->prod.head, __ATOMIC_RELAXED); do { /* Reset n to the initial burst count */ n = max; /* Ensure the head is read before tail */ __atomic_thread_fence(__ATOMIC_ACQUIRE); /* load-acquire synchronize with store-release of ht->tail * in update_tail. */ cons_tail = __atomic_load_n(&r->cons.tail, __ATOMIC_ACQUIRE); /* The subtraction is done between two unsigned 32bits value * (the result is always modulo 32 bits even if we have * *old_head > cons_tail). So 'free_entries' is always between 0 * and capacity (which is < size). */ *free_entries = (capacity + cons_tail - *old_head); /* check that we have enough room in ring */ if (unlikely(n > *free_entries)) n = (behavior == JRING_QUEUE_FIXED) ? 0 : *free_entries; if (n == 0) return 0; *new_head = *old_head + n; if (is_sp) r->prod.head = *new_head, success = 1; else /* on failure, *old_head is updated */ success = __atomic_compare_exchange_n(&r->prod.head, old_head, *new_head, 0, __ATOMIC_RELAXED, __ATOMIC_RELAXED); } while (unlikely(success == 0)); return n; } /** * @internal This function updates the consumer head for dequeue * * @param r * A pointer to the ring structure * @param is_sc * Indicates whether multi-consumer path is needed or not * @param n * The number of elements we will want to dequeue, i.e. how far should the * head be moved * @param behavior * JRING_QUEUE_FIXED: Dequeue a fixed number of items from a ring * JRING_QUEUE_VARIABLE: Dequeue as many items as possible from ring * @param old_head * Returns head value as it was before the move, i.e. where dequeue starts * @param new_head * Returns the current/new head value i.e. where dequeue finishes * @param entries * Returns the number of entries in the ring BEFORE head was moved * @return * - Actual number of objects dequeued. * If behavior == JRING_QUEUE_FIXED, this will be 0 or n only. */ static inline __attribute__((always_inline)) unsigned int __jring_move_cons_head(struct jring *r, int is_sc, unsigned int n, enum jring_queue_behavior behavior, uint32_t *old_head, uint32_t *new_head, uint32_t *entries) { unsigned int max = n; uint32_t prod_tail; int success; /* move cons.head atomically */ *old_head = __atomic_load_n(&r->cons.head, __ATOMIC_RELAXED); do { /* Restore n as it may change every loop */ n = max; /* Ensure the head is read before tail */ __atomic_thread_fence(__ATOMIC_ACQUIRE); /* this load-acquire synchronize with store-release of ht->tail * in update_tail. */ prod_tail = __atomic_load_n(&r->prod.tail, __ATOMIC_ACQUIRE); /* The subtraction is done between two unsigned 32bits value * (the result is always modulo 32 bits even if we have * cons_head > prod_tail). So 'entries' is always between 0 * and size(ring)-1. */ *entries = (prod_tail - *old_head); /* Set the actual entries for dequeue */ if (n > *entries) n = (behavior == JRING_QUEUE_FIXED) ? 0 : *entries; if (unlikely(n == 0)) return 0; *new_head = *old_head + n; if (is_sc) r->cons.head = *new_head, success = 1; else /* on failure, *old_head will be updated */ success = __atomic_compare_exchange_n(&r->cons.head, old_head, *new_head, 0, __ATOMIC_RELAXED, __ATOMIC_RELAXED); } while (unlikely(success == 0)); return n; } /** * @internal Enqueue several objects on the ring * * @param r * A pointer to the ring structure. * @param obj_table * A pointer to a table of objects. * @param esize * The size of ring element, in bytes. It must be a multiple of 4. * This must be the same value used while creating the ring. Otherwise * the results are undefined. * @param n * The number of objects to add in the ring from the obj_table. * @param behavior * jRING_QUEUE_FIXED: Enqueue a fixed number of items from a ring * jRING_QUEUE_VARIABLE: Enqueue as many items as possible from ring * @param is_sp * Indicates whether to use single producer or multi-producer head update * @param free_space * returns the amount of space after the enqueue operation has finished * @return * Actual number of objects enqueued. * If behavior == jRING_QUEUE_FIXED, this will be 0 or n only. */ static inline __attribute__((always_inline)) unsigned int __jring_do_enqueue_elem(struct jring *r, const void *obj_table, unsigned int esize, unsigned int n, enum jring_queue_behavior behavior, unsigned int is_sp, unsigned int *free_space) { uint32_t prod_head, prod_next; uint32_t free_entries; n = __jring_move_prod_head(r, is_sp, n, behavior, &prod_head, &prod_next, &free_entries); if (n == 0) goto end; __jring_enqueue_elems(r, prod_head, obj_table, esize, n); __jring_update_tail(&r->prod, prod_head, prod_next, is_sp, 1); end: if (free_space != NULL) *free_space = free_entries - n; return n; } /** * @internal Dequeue several objects from the ring * * @param r * A pointer to the ring structure. * @param obj_table * A pointer to a table of objects. * @param esize * The size of ring element, in bytes. It must be a multiple of 4. * This must be the same value used while creating the ring. Otherwise * the results are undefined. * @param n * The number of objects to pull from the ring. * @param behavior * jRING_QUEUE_FIXED: Dequeue a fixed number of items from a ring * jRING_QUEUE_VARIABLE: Dequeue as many items as possible from ring * @param is_sc * Indicates whether to use single consumer or multi-consumer head update * @param available * returns the number of remaining ring entries after the dequeue has finished * @return * - Actual number of objects dequeued. * If behavior == jRING_QUEUE_FIXED, this will be 0 or n only. */ static inline __attribute__((always_inline)) unsigned int __jring_do_dequeue_elem(struct jring *r, void *obj_table, unsigned int esize, unsigned int n, enum jring_queue_behavior behavior, unsigned int is_sc, unsigned int *available) { uint32_t cons_head, cons_next; uint32_t entries; n = __jring_move_cons_head(r, (int)is_sc, n, behavior, &cons_head, &cons_next, &entries); if (n == 0) goto end; __jring_dequeue_elems(r, cons_head, obj_table, esize, n); __jring_update_tail(&r->cons, cons_head, cons_next, is_sc, 0); end: if (available != NULL) *available = entries - n; return n; } #ifdef __cplusplus } #endif #endif // SRC_EXT_JRING_ELEM_PRIVATE_H_