\ This is an adaption of the matrix multiplication benchmark for using
\ run-time code generation (inspired by lee&leone96)

\ @InProceedings{lee&leone96,
\   author = 	 {Peter Lee and Mark Leone},
\   title = 	 {Optimizing ML with Run-Time Code Generation},
\   crossref =	 {sigplan96},
\   pages =	 {137--148}
\ }
\ @Proceedings{sigplan96,
\   booktitle = 	 "SIGPLAN '96 Conference on Programming Language
\ 		  Design and Implementation",
\   title = 	 "SIGPLAN '96 Conference on Programming Language
\ 		  Design and Implementation",
\   year = 	 "1996",
\   key = 	 "PLDI '96"
\ }

\ The original version is in comments.
\ The results with Gforth on a Nekotech Mach2 (300MHz 21064a) are very nice:
\ original program:		 6.2s user time
\ with run-time code generation: 3.9s user time
\ NOTE: This version needs 160,000+ cells data space
\	and a lot of code space, too.

\ A classical benchmark of an O(n**3) algorithm; Matrix Multiplication
\
\ Part of the programs gathered by John Hennessy for the MIPS
\ RISC project at Stanford. Translated to forth by  Marty Fraeman,
\ Johns Hopkins University/Applied Physics Laboratory.

\ MM forth2c doesn't have it !
: mybounds  over + swap ;
: under+ ( a x b -- a+b x )
   rot + swap ;

1 cells constant cell

variable seed

: initiate-seed ( -- )  74755 seed ! ;
: random  ( -- n )  seed @ 1309 * 13849 + 65535 and dup seed ! ;

200 constant row-size
row-size cells constant row-byte-size

row-size row-size * constant mat-size
mat-size cells constant mat-byte-size

align create ima mat-byte-size allot
align create imb mat-byte-size allot
align create imr mat-byte-size allot

: initiate-matrix ( m[row-size][row-size] -- )
  mat-byte-size mybounds do
    random dup 120 / 120 * - 60 - i !
  cell +loop
;

: gen-innerproduct ( a[row][*] -- xt )
\ xt is of type ( b[*][column] -- n )
\ this would be a candidate for using ]] ... [[
 >r :noname r>
 0 POSTPONE literal POSTPONE SWAP
 row-size 0 do
   POSTPONE dup POSTPONE @
   dup @ POSTPONE literal POSTPONE * POSTPONE under+
   POSTPONE cell+ row-byte-size +
  loop
  drop
 POSTPONE drop POSTPONE ;
;

\ : innerproduct ( a[row][*] b[*][column] -- int)
\   0 row-size 0 do ( a b int )
\     >r over @ over @ * r> + >r
\     cell+ swap row-byte-size + swap
\     r>
\   loop
\   >r 2drop r>
\ ;

: main  ( -- )
  initiate-seed
  ima initiate-matrix
  imb initiate-matrix 
  imr ima mat-byte-size mybounds do
   i gen-innerproduct swap
    imb row-byte-size mybounds do ( r xt )
      i 2 pick execute over ! cell+
    cell +loop
    nip \ !! forget the xt
  row-size cells +loop
  drop
;

\ : main  ( -- )
\   initiate-seed
\   ima initiate-matrix
\   imb initiate-matrix 
\   imr ima mat-byte-size mybounds do
\     imb row-byte-size mybounds do
\       j i innerproduct over ! cell+ 
\     cell +loop
\   row-size cells +loop
\   drop
\ ;