# The location where we store information about the dynamic heap
#
# The word stored here is written by `heap_init` as, from the top of the
# operand stack down, [MAGIC, heap_base, heap_size, heap_top]
const.HEAP_INFO_ADDR=0x80000000 # (address in words)
# The address beyond which the dynamic heap cannot be allowed to grow
#
# `memory_grow` refuses any request whose resulting heap_top would exceed this value
const.HEAP_END=0x10000000 # 2^30 / 4 (i.e. byte address, not word)
# The assertion error code used when intrinsics are used without calling heap_init
const.HEAP_ERR=0x68656170 # b"heap"
# The value returned by `memory_grow` when the heap cannot be grown (i.e. -1 as a u32)
const.NEG1=4294967295 # u32::MAX
# The magic bytes used to verify that the heap was properly initialized
const.MAGIC=0xDEADBEEF
# The multiplier applied to the heap size (in pages) when `memory_grow`
# recomputes heap_top as heap_base + heap_size * PAGE_SIZE
const.PAGE_SIZE=65536

# Checks the HEAP_INFO magic to ensure heap initialization has taken place
#
# This consumes the input element.
proc.verify_heap_magic # [input]
    # the candidate value must at least be a valid u32, otherwise fail with HEAP_ERR
    u32assert.err=HEAP_ERR
    # compare the candidate against the expected magic value; the assertion
    # fails with HEAP_ERR on mismatch, and removes both operands on success
    push.MAGIC
    assert_eq.err=HEAP_ERR
end

# Intrinsic used to initialize the heap globals manipulated by memory intrinsics
#
# This must be called before any other heap intrinsics are called. This is checked
# by each intrinsic
export.heap_init # [heap_base]
    # Assemble the heap info word on the operand stack. The heap starts out
    # empty, so its size is zero and its top coincides with its base.
    push.0      # [heap_size, heap_top]            (heap_top is the heap_base operand)
    dup.1       # [heap_base, heap_size, heap_top]
    push.MAGIC  # [MAGIC, heap_base, heap_size, heap_top]
    # Persist the word, then discard the copy left on the operand stack
    mem_storew.HEAP_INFO_ADDR
    dropw
end

# Get the (byte) address where the base of the heap starts
export.heap_base
    # fetch the heap info word: [MAGIC, heap_base, heap_size, heap_top]
    padw
    mem_loadw.HEAP_INFO_ADDR
    # fail unless the heap was initialized; this consumes MAGIC
    exec.verify_heap_magic  # [heap_base, heap_size, heap_top]
    # trade places between heap_base and heap_top, then discard everything above heap_base
    swap.2                  # [heap_top, heap_size, heap_base]
    drop drop               # [heap_base]
end

# Get the (byte) address of the top of the heap, without checking that the heap was initialized
export.heap_top_unchecked
    # load a single element of the heap info word rather than the whole word;
    # heap_top is the deepest element of the word written by `heap_init`.
    # The magic value is never inspected on this path.
    mem_load.HEAP_INFO_ADDR
end

# Get the (byte) address of the top of the heap
export.heap_top
    # load the heap info word: [MAGIC, heap_base, heap_size, heap_top]
    padw mem_loadw.HEAP_INFO_ADDR
    # check (and consume) the magic value, then discard heap_base and heap_size,
    # leaving only heap_top on the operand stack
    exec.verify_heap_magic drop drop
end

# Intrinsic corresponding to the `memory_size` instruction
export.memory_size
    # fetch the heap info word: [MAGIC, heap_base, heap_size, heap_top]
    padw
    mem_loadw.HEAP_INFO_ADDR
    # fail unless the heap was initialized; this consumes MAGIC
    exec.verify_heap_magic  # [heap_base, heap_size, heap_top]
    # lift heap_top above the other two, then discard it along with heap_base
    movup.2                 # [heap_top, heap_base, heap_size]
    drop drop               # [heap_size]
end

# Intrinsic corresponding to the `memory_grow` instruction
export.memory_grow # [num_pages]
    # Grow the heap by `num_pages` pages.
    #
    # Inputs:  [num_pages, ..]
    # Outputs: [result, ..], where `result` is the previous heap size (in pages)
    #          on success, or NEG1 if the heap cannot be grown. The stored heap
    #          information is only rewritten on success.
    #
    # Fails the HEAP_ERR assertion if the heap magic is not present.
    padw mem_loadw.HEAP_INFO_ADDR # [MAGIC, heap_base, heap_size, heap_top, num_pages]
    dup.0 exec.verify_heap_magic  # [MAGIC, heap_base, heap_size, heap_top, num_pages]
    # the old heap_top is recomputed below, so it is not needed
    swap.3 drop   # [heap_base, heap_size, MAGIC, num_pages]
    # keep a copy of the old heap_size at the bottom; it is the success result
    dup.1 movdn.4 # [heap_base, heap_size, MAGIC, num_pages, heap_size]
    swap.1        # [heap_size, heap_base, MAGIC, num_pages, heap_size]
    movup.3       # [num_pages, heap_size, heap_base, MAGIC, heap_size]
    u32overflowing_add # [overflowed, heap_size + num_pages, heap_base, MAGIC, heap_size]
    if.true # [new_heap_size, heap_base, MAGIC, heap_size]
        # Cannot grow the memory, return -1
        dropw # []
        push.NEG1
    else
        # Success, recompute the heap_top, and make sure it doesn't exceed HEAP_END
        dup.0          # [new_heap_size, new_heap_size, heap_base, MAGIC, heap_size]
        push.PAGE_SIZE # [PAGE_SIZE, new_heap_size, new_heap_size, heap_base, MAGIC, heap_size]
        dup.3          # [heap_base, PAGE_SIZE, new_heap_size, new_heap_size, heap_base, MAGIC, heap_size]
        movdn.2        # [PAGE_SIZE, new_heap_size, heap_base, ..]
        u32overflowing_madd # [hi, lo, ..] where hi:lo = PAGE_SIZE * new_heap_size + heap_base
        # `hi` is the upper 32 bits of the result, not a 0/1 flag, and `if.true`
        # only accepts a binary condition. Reduce it to a boolean first, otherwise
        # a sufficiently large request (hi > 1) would trap rather than return -1.
        neq.0          # [overflowed, new_heap_top, new_heap_size, heap_base, MAGIC, heap_size]
        if.true        # [new_heap_top, new_heap_size, heap_base, MAGIC, heap_size]
            # Overflow, drop the changes and return -1
            dropw drop
            push.NEG1
        else
            # Ensure the new heap_top is <= HEAP_END
            dup.0 u32lte.HEAP_END
            if.true
                # Write updated heap information, and return the old heap size (in pages)
                swap.2  # [heap_base, new_heap_size, new_heap_top, MAGIC, heap_size]
                movup.3 # [MAGIC, heap_base, new_heap_size, new_heap_top, heap_size]
                mem_storew.HEAP_INFO_ADDR
                dropw   # [heap_size]
            else
                # Out of bounds, drop the changes and return -1
                dropw drop
                push.NEG1
            end
        end
    end
end

# Given an element index, and a word, in that order, drop the elements of the
# word other than the one at the specified index.
#
# The element index must be in the range 0..=3.
export.extract_element # [element_index, w3, w2, w1, w0]
    # reject any index outside of 0..=3
    dup.0 push.3 lte
    assert

    # Selection is performed by three chained `cdrop`s, each steered by a boolean
    # derived from the index. Two of those booleans are computed up front and
    # parked beneath the word; the third one consumes the index itself.
    dup.0 push.2 gte  # [element_index >= 2, element_index, w3, ..w0]
    movdn.5           # [element_index, w3, ..w0, element_index >= 2]
    dup.0 push.1 gte  # [element_index >= 1, element_index, w3, ..w0, element_index >= 2]
    movdn.6           # [element_index, w3, ..w0, element_index >= 2, element_index >= 1]
    push.3 eq         # [element_index == 3, w3, ..w0, element_index >= 2, element_index >= 1]

    # keep w3 when the index is 3, otherwise keep w2
    cdrop
    # keep the previous pick when the index is at least 2, otherwise keep w1
    movup.3
    cdrop
    # keep the previous pick when the index is at least 1, otherwise keep w0
    #
    # this leaves just the requested element on the operand stack; the index,
    # the flags, and the rest of the word have all been consumed
    movup.2
    cdrop
end

# See `load_felt` for safe usage
proc.load_felt_unchecked # [waddr, index]
    # Stack effect: [waddr, index, ..] -> [element, ..]
    #
    # No alignment check is performed here; `index` is validated by `extract_element`.

    # prepare the stack to receive the loaded word
    # [waddr, 0, 0, 0, 0, index]
    padw movup.4
    # load the word which contains the desired element
    mem_loadw  # [w3, w2, w1, w0, index]

    # select the desired element
    movup.4    # [index, w3, w2, w1, w0]
    exec.extract_element
end

# Load a field element from the given native pointer triplet.
#
# A native pointer triplet consists of a word address which contains the
# start of the data; an element index, which indicates which element of
# the word the data starts in; and a byte offset, which indicates which
# byte is the start of the data.
#
# A field element must be naturally aligned, i.e. its byte offset must be zero.
export.load_felt # [waddr, index, offset]
    # Stack effect: [waddr, index, offset, ..] -> [element, ..]
    #
    # assert the pointer is felt-aligned (offset == 0, which also consumes the
    # offset), then load using the remaining [waddr, index]
    movup.2 assertz exec.load_felt_unchecked
end

# Load a single 32-bit machine word from the given native pointer triplet.
#
# A native pointer triplet consists of a word address which contains the
# start of the data; an element index, which indicates which element of
# the word the data starts in; and a byte offset, which indicates which
# byte is the start of the data.
export.load_sw # [waddr, index, offset]
    # Stack effect: [waddr, index, offset, ..] -> [value, ..]
    #
    # When `offset` is zero the value is exactly the element at `index`. Otherwise
    # the value straddles the element at `index` and the one following it (which
    # lives in the next word when `index` is 3), and is reassembled as
    # (first << offset) | (second >> (32 - offset)).
    #
    # NOTE(review): `offset` is used directly as a shift amount here, whereas
    # `load_dw` first multiplies it by 8 (bytes to bits) - confirm which unit
    # callers are expected to pass.

    # check for alignment and offset validity
    dup.2 eq.0
    dup.3 push.8 u32lt assert # offset must be < 8
    # if the pointer is naturally aligned..
    if.true
        # drop the byte offset
        movup.2 drop
        # load the element containing the data we want
        exec.load_felt_unchecked
    else
        # check if the load starts in the first element
        dup.1 eq.0
        if.true
            # the load is across both the first and second elements
            # drop the element index
            swap.1 drop
            # load
            padw movup.4 mem_loadw # [w3, w2, w1, w0, offset]
            # drop the unused elements
            drop drop
            # shift low bits
            push.32 dup.3 # [offset, 32, w1, w0, offset]
            u32overflowing_sub assertz # [32 - offset, w1, w0, offset]
            u32shr        # [lo, w0, offset]
            # shift high bits left by the offset
            swap.2        # [offset, w0, lo]
            u32shl        # [hi, lo]
            # combine the two halves
            u32or         # [result]
        else
            # check if the load starts in the second element
            dup.1 eq.1
            if.true
                # the load is across both the second and third elements
                # drop the element index
                swap.1 drop
                # load
                padw movup.4 mem_loadw # [w3, w2, w1, w0, offset]
                # drop the unused elements
                drop movup.2 drop      # [w2, w1, offset]
                # shift the low bits
                push.32 dup.3          # [offset, 32, w2, w1, offset]
                u32overflowing_sub assertz # [32 - offset, w2, w1, offset]
                u32shr                 # [lo, w1, offset]
                # shift high bits left by the offset
                swap.2                 # [offset, w1, lo]
                u32shl                 # [hi, lo]
                # combine the two halves
                u32or                  # [result]
            else
                # check if the load starts in the third element
                # (the index is consumed by the comparison, so no drop is needed)
                swap.1 eq.2
                if.true
                    # the load is across both the third and fourth elements
                    padw movup.4 mem_loadw    # [w3, w2, w1, w0, offset]
                    # drop the unused elements
                    movup.3 movup.3 drop drop # [w3, w2, offset]
                    # shift the low bits
                    push.32 dup.3             # [offset, 32, w3, w2, offset]
                    u32overflowing_sub assertz # [32 - offset, w3, w2, offset]
                    u32shr                    # [lo, w2, offset]
                    # shift the high bits left by the offset
                    swap.2                    # [offset, w2, lo]
                    u32shl                    # [hi, lo]
                    # combine the two halves
                    u32or  # [result]
                else
                    # the load crosses a word boundary
                    # start with the word containing the low bits
                    dup.0                        # [waddr, waddr, offset]
                    u32overflowing_add.1 assertz # [waddr + 1, waddr, offset]
                    # load the low bits (the first element of the next word)
                    mem_load                   # [w0, waddr, offset]
                    # shift the low bits
                    push.32 dup.3              # [offset, 32, w0, waddr, offset]
                    u32overflowing_sub assertz # [32 - offset, w0, waddr, offset]
                    u32shr                     # [lo, waddr, offset]
                    # load the word with the high bits, drop unused elements
                    swap.1 padw movup.4 mem_loadw movdn.3 drop drop drop # [w3, lo, offset]
                    # shift high bits
                    movup.2 u32shl             # [hi, lo]
                    # combine the two halves
                    u32or                      # [result]
                end
            end
        end
    end
end

# This handles emitting code that handles aligning an unaligned double
# machine-word value which is split across three machine words (field elements).
#
# To recap:
#
# * A machine word is a 32-bit chunk stored in a single field element
# * A double word is a pair of 32-bit chunks
# * A quad word is a quartet of 32-bit chunks (i.e. a Miden "word")
# * An unaligned double-word requires three 32-bit chunks to represent,
# since the first chunk does not contain a full 32-bits, so an extra is
# needed to hold those bits.
#
# As an example, assume the pointer we are dereferencing is a u64 value,
# which has 8-byte alignment, and the value is stored 40 bytes from the
# nearest quad-word-aligned boundary. To load the value, we must fetch
# the full quad-word from the aligned address, drop the first word, as
# it is unused, and then recombine the 64 bits we need spread across
# the remaining three words to obtain the double-word value we actually want.
#
# The data, on the stack, is shown below:
#
# If we visualize which bytes are contained in each 32-bit chunk on the stack,
# when loaded by `mem_loadw`, we get:
#
#     [<unused>, 8..=11, 4..=7, 0..=3]
#
# These byte indices are relative to the nearest word-aligned address, in the
# same order as they would occur in a byte-addressable address space. The
# significance of each byte depends on the value being dereferenced, but Miden
# is a little-endian machine, so typically the most significant bytes come first
# (i.e. also commonly referred to as "high" vs "low" bits).
#
# If we visualize the layout of the bits of our u64 value spread across the
# three chunks, we get:
#
#     [<unused>, 00000000111111111111111111111111, 11111111111111111111111111111111, 11111111111111111111111100000000]
#
# As illustrated above, what should be a double-word value is occupying three words.
# To "realign" the value, i.e. ensure that it is naturally aligned and fits in two
# words, we have to perform a sequence of shifts and masks to get the bits where
# they belong. This function performs those steps, with the assumption that the caller
# has three values on the operand stack representing any unaligned double-word value
export.realign_dw # [chunk_hi, chunk_mid, chunk_lo, offset]
    # Stack effect: [chunk_hi, chunk_mid, chunk_lo, offset, ..] -> [x_hi, x_lo, ..]
    #
    # where x_hi = (chunk_hi << offset)  | (chunk_mid >> (32 - offset))
    #       x_lo = (chunk_mid << offset) | (chunk_lo >> (32 - offset))
    #
    # NOTE(review): an offset of zero would make the right-shift amount 32; the
    # callers in this file only reach this procedure on their unaligned
    # (offset != 0) paths - confirm for any other callers.

    # We will refer to the parts of our desired double-word value
    # as two parts, `x_hi` and `x_lo`.
    # Re-align the high bits by shifting out the offset
    #
    # This gives us the first half of the first word.
    dup.3 u32shl # [x_hi_hi, chunk_mid, chunk_lo, offset]

    # Move the value below the other chunks temporarily
    movdn.3 # [chunk_mid, chunk_lo, offset, x_hi_hi]

    # We must split the middle chunk into two parts,
    # one containing the bits to be combined with the
    # first machine word; the other to be combined with
    # the second machine word.
    #
    # First, we duplicate the chunk, since we need two
    # copies of it:
    #
    dup.0 # [chunk_mid, chunk_mid, chunk_lo, offset, x_hi_hi]

    # Then, we shift the chunk right by 32 - offset bits,
    # re-aligning the low bits of the first word, and
    # isolating them.
    push.32 dup.4 u32wrapping_sub u32shr  # [x_hi_lo, chunk_mid, chunk_lo, offset, x_hi_hi]

    # Move the high bits back to the top
    movup.4 # [x_hi_hi, x_hi_lo, chunk_mid, chunk_lo, offset]

    # OR the two parts of the `x_hi` chunk together
    u32or # [x_hi, chunk_mid, chunk_lo, offset]

    # Move `x_hi` to the bottom for later
    movdn.2 # [chunk_mid, chunk_lo, x_hi, offset]

    # Now, we need to re-align the high bits of the second word
    # by shifting the remaining copy of the middle chunk, similar
    # to what we did at the very beginning.
    #
    # This gives us the first half of the second word.
    dup.3 u32shl # [x_lo_hi, chunk_lo, x_hi, offset]

    # Next, swap the low bit chunk to the top temporarily
    swap.1 # [chunk_lo, x_lo_hi, x_hi, offset]

    # Shift the value right, as done previously for the middle chunk
    # (this is the last use of the offset, so it is moved rather than copied)
    push.32 movup.4 u32wrapping_sub u32shr # [x_lo_lo, x_lo_hi, x_hi]

    # OR the two halves together, giving us our second word, `x_lo`
    u32or # [x_lo, x_hi]

    # Swap the words so they are in the correct order
    swap.1 # [x_hi, x_lo]
end

# Shift a double-word (64-bit, in two 32-bit chunks) value by the given offset
# Returns three 32-bit chunks [chunk_lo, chunk_mid, chunk_hi]
export.offset_dw # [value_hi, value_lo, offset]
    # Stack effect: [value_hi, value_lo, offset, ..] -> [chunk_lo, chunk_mid, chunk_hi, ..]
    #
    # where chunk_hi  = value_hi >> offset
    #       chunk_mid = (value_hi << (32 - offset)) | (value_lo >> offset)
    #       chunk_lo  = value_lo << (32 - offset)
    #
    # NOTE(review): an offset of zero would make the left-shift amount 32 -
    # presumably callers only use this for non-zero offsets; confirm.
    dup.0
    dup.3 u32shr # [chunk_hi, value_hi, value_lo, offset]
    movdn.3      # [value_hi, value_lo, offset, chunk_hi]
    push.32 dup.3 u32wrapping_sub # [32 - offset, value_hi, value_lo, offset, chunk_hi]
    u32shl       # [ chunk_mid_hi, value_lo, offset, chunk_hi]
    dup.1        # [ value_lo, chunk_mid_hi, value_lo, offset, chunk_hi]
    dup.3        # [ offset, value_lo, chunk_mid_hi, value_lo, offset, chunk_hi]
    u32shr       # [ chunk_mid_lo, chunk_mid_hi, value_lo, offset, chunk_hi]
    u32or        # [ chunk_mid, value_lo, offset, chunk_hi]
    movdn.2      # [ value_lo, offset, chunk_mid, chunk_hi]
    # the offset is consumed here, as this is its last use
    push.32 movup.2 u32wrapping_sub # [32 - offset, value_lo, chunk_mid, chunk_hi]
    u32shl       # [ chunk_lo, chunk_mid, chunk_hi]
end

# Load a pair of machine words (32-bit elements) to the operand stack
export.load_dw # [waddr, index, offset]
    # Stack effect: [waddr, index, offset, ..] -> [value_hi, value_lo, ..]
    #
    # The element at `index` supplies the high bits, and the following element(s)
    # the lower bits, continuing into the word at `waddr + 1` when necessary.
    # Unaligned values span three elements, and are recombined by `realign_dw`.

    # check for alignment and offset validity
    dup.2 eq.0
    dup.3 push.8 u32lt assert # offset must be < 8
    # convert offset from bytes to bits
    movup.3 push.8 u32wrapping_mul movdn.3 # [offset == 0, waddr, index, offset (in bits)]
    # if the pointer is naturally aligned..
    if.true
        # drop byte offset
        movup.2 drop # [waddr, index]
        # check which element to start at
        dup.1 eq.0
        if.true
            # drop index
            swap.1 drop # [waddr]
            # load first two elements
            padw movup.4 mem_loadw # [w3, w2, w1, w0]
            # drop last two elements, and we're done
            drop drop swap.1       # [w0, w1]
        else
          dup.1 eq.1
          if.true
              # drop index
              swap.1 drop # [waddr]
              # load second and third elements
              padw movup.4 mem_loadw   # [w3, w2, w1, w0]
              # drop unused elements, and we're done
              movup.3 drop drop swap.1 # [w1, w2]
          else
              # the index is consumed by the comparison, so no drop is needed
              swap.1 eq.2
              if.true
                # load third and fourth elements, drop unused, and we're done
                padw movup.4 mem_loadw           # [w3, w2, w1, w0]
                movup.3 movup.3 drop drop swap.1 # [w2, w3]
              else
                # load first element of next word
                dup.0 u32overflowing_add.1 assertz # [waddr + 1, waddr]
                mem_load                           # [w0, waddr]
                # load fourth element, and we're done
                swap.1 padw movup.4 mem_loadw      # [w3, w2, w1, w0, lo]
                movdn.3 drop drop drop             # [hi, lo]
              end
          end
        end
    else # unaligned; an unaligned double-word spans three elements
        # check if we start in the first element
        dup.1 eq.0
        if.true
            # memory layout: [<unused>, lo, mid, hi]
            # drop the index
            swap.1 drop  # [waddr, offset]
            # load three elements containing the double-word on the stack
            padw movup.4 mem_loadw # [w3, w2, w1, w0, offset]
            drop                   # [w2, w1, w0, offset]
            # move into stack order (hi bytes first)
            swap.2                 # [w0, w1, w2, offset]
            # re-align it, and we're done; realign_dw gets [w0, w1, w2, offset]
            exec.realign_dw
        else
            # check if we start in the second element
            dup.1 eq.1
            if.true
                # memory layout: [lo, mid, hi, <unused>]
                # drop the index
                swap.1 drop
                # load three elements containing the double-word on the stack
                padw movup.4 mem_loadw # [w3, w2, w1, w0, offset]
                movup.3 drop           # [w3, w2, w1, offset]
                # move into stack order
                swap.2                 # [w1, w2, w3, offset]
                # re-align it, and we're done; realign_dw gets [w1, w2, w3, offset]
                exec.realign_dw
            else
                # check if we start in the third element
                swap.1 eq.2 # [index == 2, waddr, offset]
                if.true
                    # memory layout: [mid, hi, ..] [<unused>, <unused>, <unused>, lo]
                    # load one element from the next word
                    dup.0 u32overflowing_add.1 assertz # [waddr + 1, waddr, offset]
                    mem_load            # [chunk_lo, waddr, offset]
                    # load two elements from the first word
                    padw movup.5        # [waddr, 0, 0, 0, 0, chunk_lo, offset]
                    mem_loadw           # [chunk_mid, chunk_hi, ?, ?, chunk_lo, offset]
                    swap.3 drop         # [chunk_hi, ?, chunk_mid, chunk_lo, offset]
                    swap.1 drop         # [chunk_hi, chunk_mid, chunk_lo, offset]
                    # re-align it, and we're done
                    exec.realign_dw
                else
                    # memory layout: [hi, ..], [<unused>, <unused>, lo, mid]
                    # load the two least-significant elements from the next word first
                    dup.0 u32overflowing_add.1 assertz # [waddr + 1, waddr, offset]
                    padw movup.4         # [waddr + 1, 0, 0, 0, 0, waddr, offset]
                    mem_loadw drop drop  # [lo, mid, waddr, offset]
                    swap.1 # [mid, lo, waddr, offset]
                    # load the most significant element from the first word
                    padw movup.6 # [waddr, 0, 0, 0, 0, mid, lo, offset]
                    mem_loadw movdn.3 drop drop drop # [hi, mid, lo, offset]
                    # re-align it, and we're done
                    exec.realign_dw
                end
            end
        end
    end
end

# Given an element index, a new element, and a word, in that order, replace the element
# at the specified index, leaving the modified word on top of the stack
#
# The element index must be in the range 0..=3.
export.replace_element # [element_index, value, w3, w2, w1, w0]
    # Stack effect: [element_index, value, w3, w2, w1, w0, ..] -> [w3', w2', w1', w0', ..]
    #
    # where wN' is `value` if element_index == N, and wN otherwise.

    # assert the index given is valid
    dup.0 push.3 lte assert
    # visit each element of the word in turn (w3 first), using cdrop to choose
    # between the replacement value and the original element, then rotate the
    # chosen element to the bottom so the next element can be processed
    movup.2 dup.2         # [value, w3, element_index, value, w2, ..w0]
    dup.2 push.3 eq cdrop # [w3', element_index, value, w2, ..w0]
    movdn.5               # [element_index, value, w2, ..w0, w3']
    movup.2 dup.2         # [value, w2, element_index, value, w1, w0, w3']
    dup.2 push.2 eq cdrop # [w2', element_index, value, w1, w0, w3']
    movdn.5               # [element_index, value, w1, w0, w3', w2']
    movup.2 dup.2         # [value, w1, element_index, value, w0, w3', w2']
    dup.2 push.1 eq cdrop # [w1', element_index, value, w0, w3', w2']
    movdn.5               # [element_index, value, w0, w3', w2', w1']
    # on the last element, consume the element index and replacement value
    push.0 eq cdrop       # [w0', w3', w2', w1']
    movdn.3               # [w3', w2', w1', w0']
end

# See `store_felt` for safe usage
proc.store_felt_unchecked # [waddr, index, value]
    # Stack effect: [waddr, index, value, ..] -> [..]
    #
    # This is a read-modify-write of the whole word at `waddr`: only the element
    # selected by `index` changes. No alignment check is performed here; `index`
    # is validated by `replace_element`.

    # prepare the stack to receive the loaded word
    # [waddr, 0, 0, 0, 0, waddr, index, value]
    padw dup.4
    # load the original word
    mem_loadw  # [w3, w2, w1, w0, waddr, index, value]

    # rewrite the desired element
    movup.6 # [value, w3, w2, w1, w0, waddr, index]
    movup.6 # [index, value, w3, w2, w1, w0, waddr]
    exec.replace_element # [w3', w2', w1', w0', waddr]

    # store the updated word, then discard the copy left on the operand stack
    movup.4 mem_storew
    dropw
end

# Store a field element to the given native pointer triplet.
#
# A native pointer triplet consists of a word address which contains the
# start of the data; an element index, which indicates which element of
# the word the data starts in; and a byte offset, which indicates which
# byte is the start of the data.
#
# A field element must be naturally aligned, i.e. its byte offset must be zero.
export.store_felt # [waddr, index, offset, value]
    # Stack effect: [waddr, index, offset, value, ..] -> [..]
    #
    # assert the pointer is felt-aligned (offset == 0, which also consumes the
    # offset), then store using the remaining [waddr, index, value]
    movup.2 assertz exec.store_felt_unchecked
end

# Store a single 32-bit machine word to the given native pointer triplet.
#
# A native pointer triplet consists of a word address which contains the
# start of the data; an element index, which indicates which element of
# the word the data starts in; and a byte offset, which indicates which
# byte is the start of the data.
export.store_sw # [waddr, index, offset, value]
    # Stack effect: [waddr, index, offset, value, ..] -> [..]
    #
    # When `offset` is zero, the element at `index` is simply replaced. Otherwise
    # the value is split across the element at `index` and the one following it
    # (which lives in the next word when `index` is 3): the first receives
    # `value >> offset` in its low bits, the second `value << (32 - offset)` in
    # its high bits. All other bits of both elements are preserved.
    #
    # NOTE(review): `offset` is used directly as a shift amount here, without any
    # bytes-to-bits conversion - confirm which unit callers are expected to pass.

    # check for alignment and offset validity
    dup.2 eq.0
    dup.3 push.8 u32lt assert # offset must be < 8
    # if the pointer is naturally aligned..
    if.true
        # drop the byte offset
        movup.2 drop
        # overwrite the element the pointer refers to
        exec.store_felt_unchecked
    else
        # check if the store starts in the first element
        dup.1 eq.0
        if.true
            # the store is across both the first and second elements
            # drop the element index
            swap.1 drop
            # load current value
            padw dup.4 mem_loadw # [w3, w2, w1, w0, waddr, offset, value]

            # compute the bit shift
            push.32 dup.6 sub    # [rshift, w3..w0, waddr, offset, value]

            # compute the masks
            push.4294967295 dup.1 u32shl  # [mask_hi, rshift, w3..w0, waddr, offset, value]
            dup.0 u32not                  # [mask_lo, mask_hi, rshift, w3, w2, w1, w0, waddr, offset, value]

            # manipulate the bits of the two target elements, such that the 32-bit word
            # we're storing is placed at the correct offset from the start of the memory
            # cell when viewing the cell as a set of 4 32-bit chunks
            movup.5 u32and         # [w1_masked, mask_hi, rshift, w3, w2, w0, waddr, offset, value]
            movup.5 movup.2 u32and # [w0_masked, w1_masked, rshift, w3, w2, waddr, offset, value]

            # now, we need to shift/mask/split the 32-bit value into two elements, then
            # combine them with the preserved bits of the original contents of the cell
            #
            # first, the contents of w0
            dup.7 movup.7 u32shr u32or   # [w0', w1_masked, rshift, w3..w2, waddr, value]
            # then the contents of w1
            swap.1
            movup.6 movup.3 u32shl u32or # [w1', w0', w3, w2, waddr]

            # ensure word is in order
            movup.3 movup.3              # [w3, w2, w1', w0', waddr]

            # finally, write back the updated word, and clean up the operand stack
            movup.4 mem_storew dropw
        else
            # check if the store starts in the second element
            dup.1 eq.1
            if.true
                # the store is across both the second and third elements
                # drop the element index
                swap.1 drop

                # load current value
                padw dup.4 mem_loadw # [w3, w2, w1, w0, waddr, offset, value]

                # compute the bit shift
                push.32 dup.6 sub    # [rshift, w3..w0, waddr, offset, value]

                # compute the masks
                push.4294967295 dup.1 u32shl  # [mask_hi, rshift, w3..w0, waddr, offset, value]
                dup.0 u32not                  # [mask_lo, mask_hi, rshift, w3, w2, w1, w0, waddr, offset, value]

                # manipulate the bits of the two target elements, such that the 32-bit word
                # we're storing is placed at the correct offset from the start of the memory
                # cell when viewing the cell as a set of 4 32-bit chunks
                movup.4 u32and         # [w2_masked, mask_hi, rshift, w3, w1, w0, waddr, offset, value]
                movup.4 movup.2 u32and # [w1_masked, w2_masked, rshift, w3, w0, waddr, offset, value]

                # now, we need to shift/mask/split the 32-bit value into two elements, then
                # combine them with the preserved bits of the original contents of the cell
                #
                # first, the contents of w1
                dup.7 movup.7 u32shr u32or   # [w1', w2_masked, rshift, w3, w0, waddr, value]
                # then the contents of w2
                swap.1
                movup.6 movup.3 u32shl u32or # [w2', w1', w3, w0, waddr]

                # ensure the elements are in order
                movup.3 swap.3  # [w3, w2', w1', w0, waddr]

                # finally, write back the updated word, and clean up the operand stack
                movup.4 mem_storew dropw
            else
                # check if the store starts in the third element
                # (the index is consumed by the comparison, so no drop is needed)
                swap.1 eq.2
                if.true
                    # the store is across both the third and fourth elements
                    # load current value
                    padw dup.4 mem_loadw # [w3, w2, w1, w0, waddr, offset, value]

                    # compute the bit shift
                    push.32 dup.6 sub    # [rshift, w3..w0, waddr, offset, value]

                    # compute the masks
                    push.4294967295 dup.1 u32shl  # [mask_hi, rshift, w3..w0, waddr, offset, value]
                    dup.0 u32not                  # [mask_lo, mask_hi, rshift, w3, w2, w1, w0, waddr, offset, value]

                    # manipulate the bits of the two target elements, such that the 32-bit word
                    # we're storing is placed at the correct offset from the start of the memory
                    # cell when viewing the cell as a set of 4 32-bit chunks
                    movup.3 u32and         # [w3_masked, mask_hi, rshift, w2, w1, w0, waddr, offset, value]
                    movup.3 movup.2 u32and # [w2_masked, w3_masked, rshift, w1, w0, waddr, offset, value]

                    # now, we need to shift/mask/split the 32-bit value into two elements, then
                    # combine them with the preserved bits of the original contents of the cell
                    #
                    # first, the contents of w2
                    dup.7 movup.7 u32shr u32or   # [w2', w3_masked, rshift, w1, w0, waddr, value]
                    # then the contents of w3
                    swap.1
                    movup.6 movup.3 u32shl u32or # [w3', w2', w1, w0, waddr]

                    # finally, write back the updated word, and clean up the operand stack
                    movup.4 mem_storew dropw
                else
                    # the store crosses a word boundary, start with the word containing the highest-addressed bits

                    # compute the address for the second word
                    dup.0  # [waddr, waddr, offset, value]
                    u32overflowing_add.1 assertz # [waddr + 1, waddr, offset, value]

                    # load the element we need to mix bits with
                    mem_load  # [w0, waddr, offset, value]

                    # compute the bit shift
                    push.32 dup.3 sub    # [rshift, w0, waddr, offset, value]

                    # compute the masks
                    push.4294967295 dup.1 u32shl  # [mask_hi, rshift, w0, waddr, offset, value]
                    dup.0 u32not                  # [mask_lo, mask_hi, rshift, w0, waddr, offset, value]

                    # mask out the bits of the value that are being overwritten
                    movup.3 u32and # [w0', mask_hi, rshift, waddr, offset, value]

                    # extract the bits to be stored in this word
                    dup.5 movup.3 u32shl u32or # [w0'', mask_hi, waddr, offset, value]

                    # store the updated element
                    dup.2 add.1 # [waddr + 1, w0'', mask_hi, waddr, offset, value]
                    mem_store # [mask_hi, waddr, offset, value]

                    # next, update the last element of the lowest addressed word
                    padw dup.5 mem_loadw # [w3, w2, w1, w0, mask_hi, waddr, offset, value]

                    # mask out the bits of the value that are being overwritten
                    movup.4 u32and # [w3_masked, w2, w1, w0, waddr, offset, value]

                    # extract the bits to be stored in this word and combine them
                    movup.6 movup.6 u32shr u32or # [w3', w2, w1, w0, waddr]

                    # write updated word
                    movup.4 mem_storew

                    # clean up operand stack
                    dropw
                end
            end
        end
    end
end

# Store a double-word (a pair of 32-bit machine words) to the given native pointer triplet.
#
# A native pointer triplet consists of a word address which contains the
# start of the data; an element index, which indicates which element of
# the word the data starts in; and a byte offset, which indicates which
# byte is the start of the data.
export.store_dw # [waddr, index, offset, value_hi, value_lo]
    # check for alignment and offset validity
    dup.2 eq.0
    dup.3 push.8 u32lt assert # offset must be < 8
    # convert offset from bytes to bits
    movup.3 push.8 u32wrapping_mul movdn.3 # [offset == 0, waddr, index, offset, value_hi, value_lo]
    # if the pointer is naturally aligned..
    if.true
        # drop byte offset
        movup.2 drop # [waddr, index, value_hi, value_lo]
        # check which element to start at
        dup.1 eq.0
        if.true
            # drop index
            swap.1 drop    # [waddr, value_hi, value_lo]
            swap.2         # [value_lo, value_hi, waddr]
            padw dup.6 mem_loadw # [w3, w2, w1, w0, value_lo, value_hi, waddr]
            swap.2 drop    # [w2, w3, w0, value_lo, value_hi, waddr]
            swap.2 drop    # [w3, w2, value_lo, value_hi, waddr]
            movup.4        # [waddr, w3, w2, value_lo, value_hi]
            mem_storew
            # cleanup the operand stack
            dropw
        else
            dup.1 eq.1
            if.true
                # drop index
                swap.1 drop      # [waddr, value_hi, value_lo]
                # store as the second and third elements of the word
                swap.2           # [value_lo, value_hi, waddr]
                padw dup.6 mem_loadw # [w3, w2, w1, w0, value_lo, value_hi, waddr]
                movup.4 swap.2 drop  # [w3, value_lo, w1, w0, value_hi, waddr]
                movup.4 swap.3 drop  # [w3, value_lo, value_hi, w0, waddr]
                movup.4 mem_storew
                # cleanup the operand stack
                dropw
            else
                swap.1 eq.2
                if.true
                    # store as the third and fourth elements of the word
                    swap.2                   # [value_lo, value_hi, waddr]
                    padw dup.6 mem_loadw     # [w3, w2, w1, w0, value_lo, value_hi, waddr]
                    movup.5 swap.2 drop      # [w3, value_hi, w1, w0, value_lo, waddr]
                    drop movup.3             # [value_lo, value_hi, w1, w0, waddr]
                    movup.4 mem_storew
                    # cleanup the operand stack
                    dropw
                else
                    # store the first element of the next word
                    swap.2                             # [value_lo, value_hi, waddr]
                    dup.2 u32overflowing_add.1 assertz # [waddr + 1, value_lo, value_hi, waddr]
                    mem_store  # [value_hi, waddr]
                    # store the fourth element
                    padw dup.5 mem_loadw  # [w3, w2, w1, w0, value_hi, waddr]
                    drop movup.3 movup.4  # [waddr, value_hi, w2, w1, w0]
                    mem_storew dropw
                end
            end
        end
    else # unaligned; an unaligned double-word spans three elements
        # [waddr, index, offset, value_hi, value_lo]
        movup.2 # [offset, waddr, index, value_hi, value_lo]
        movup.4 # [value_lo, offset, waddr, index, value_hi]
        movup.4 # [value_hi, value_lo, offset, waddr, index]
        exec.offset_dw # [chunk_lo, chunk_mid, chunk_hi, waddr, index]
        movup.4 # [index, chunk_lo, chunk_mid, chunk_hi, waddr]
        # check if we start in the first element
        dup.0 eq.0
        if.true
            # target memory layout: [0, lo, mid, hi]
            # drop the index
            drop                 # [lo, mid, hi, waddr]
            padw dup.7 mem_loadw # [w3, w2, w1, w0, lo, mid, hi, waddr]
            movdn.3        # [w2, w1, w0, w3, lo, mid, hi, waddr]
            drop drop drop # [w3, lo, mid, hi, waddr]
            movup.4 mem_storew
            dropw
        else
            # check if we start in the second element
            dup.0 eq.1
            if.true
                # target memory layout: [lo, mid, hi, 0]
                # drop the index
                drop           # [lo, mid, hi, waddr]
                padw dup.7 mem_loadw # [w3, w2, w1, w0, lo, mid, hi, waddr]
                drop drop drop # [w0, lo, mid, hi, waddr]
                movdn.3        # [lo, mid, hi, w0, waddr]
                movup.4 mem_storew
                dropw
            else
                # check if we start in the third element
                eq.2 # [lo, mid, hi, waddr]
                if.true
                    # target memory layout: [mid, hi, ..], [..lo]
                    padw dup.7 mem_loadw # [w3, w2, w1, w0, lo, mid, hi, waddr]
                    drop drop movup.4 movup.4 # [mid, hi, w1, w0, lo, waddr]
                    dup.5 mem_storew dropw # [lo, waddr]
                    swap.1 u32overflowing_add.1 assertz # [waddr + 1, lo]
                    mem_store
                else
                    # target memory layout: [hi, ..], [..lo, mid]
                    padw dup.7 mem_loadw # [w3, w2, w1, w0, lo, mid, hi, waddr]
                    drop movup.5 # [hi, w2, w1, w0, lo, mid, waddr]
                    dup.6 mem_storew dropw # [lo, mid, waddr]
                    movup.2 u32overflowing_add.1 assertz # [waddr + 1, lo, mid]
                    dup.0 movdn.3 # [waddr + 1, lo, mid, waddr + 1]
                    padw movup.4 mem_loadw # [w3, w2, w1, w0, lo, mid, waddr + 1]
                    movup.5 swap.4 drop # [w3, w2, w1, mid, lo, waddr + 1]
                    movup.4 swap.3 drop # [w3, w2, lo, mid, waddr + 1]
                    movup.4 mem_storew dropw
                end
            end
        end
    end
end