4,887,235
	293	294
(definst numberp no-operand
  (if (data-type? top-of-stack-a dtp-fix dtp-float dtp-extended-number)
      (goto true1)
      (goto false1)))

(definst symbolp no-operand
  (if (data-type? top-of-stack-a dtp-symbol dtp-nil)
      (goto true1)
      (goto false1)))

(definst arrayp no-operand
  (if (data-type? top-of-stack-a dtp-array)
      (goto true1)
    (goto false1)))


F:>LMach>Ucode>NET.LISP.71

;;; -*- Mode:Lisp; Package:Micro; Base:8; Lowercase:yes -*-
;;; (c)	Copyright 1982, Symbolics, Inc.

(reserve-scratchpad-memory 2520 2531 314 324)

(associate-dispatch-cues net-micro-status *net-micro-status-codes*)
(define-enumerated-value-constants *net-micro-status-codes*)
(defatomic-byte-field net-micro-status (4 0) %net-micro-status)

(defareg %net-block-pointer)			;Pointer to next block
(defareg %net-memory-address)			;Address in this block
(defareg %net-word-count)			;Word count of this block

;; Packet we are receiving into or -1
(defareg %net-packet-being-received (set-type -1 dtp-fix))

;; Packet we are transmitting or -1
(defareg %net-packet-being-transmitted (set-type -1 dtp-fix))

(defareg %net-control-address)			;Address of the control register
(defareg net-dma-temp)

;;; A network unit is 512 bit times, but the board times 128 bit times, so that
;;; we must multiply by 4
(defareg %net-backoff-count)			;l2us units to back off
(defareg %net-next-backoff)			;Mask of units to back off
						;betwecn 2^n-1 where n is
						;the nth retransmission + 2

(defbreg %net-address-1)			;Our net address
(defbreg %net-address-2)
(defbreg net-b-temp)

(defmicro set-net-status (net-status-code)
  'assign %net-micro-status (set-type ,net-status-code dtp-fix)))

;Wakeup the net service task
;This is called in the DMA task usually, but can also be called by the emulator
(defmicro wakeup-net-service ()
  '(parallel (assign service-task-requests
		     (logior service-task-requests
			     (b-constant (byte-mask %%service-net))))
	     (wakeup-task %device-service-task)
	     ))


(defmicro terminate-net-dma (net-status-code &optional (end-p t))
  '(sequential
    (set-net-status ,net-status-code)
    (net-control nil ,end-p)
    (parallel (wakeup-net-service)
	      (jump net-dma-dead))))

(defmicro start-net-dma (location)
  '(write-task-state %net-dma-task
		     (a-constant '(build-task-state cpc ,location
						    npc (npc-successor ,location)
						    csp 17))))

(defmicro io-board-bug-delay ()
  '(parallel (disable-tasking)
	     (declare-memory-timing (next active-cycle))))

(eval-when (compile load eval)
(defun net-buffer-address (dma-p dismiss-p end-p)
  (logior (if dma-p 1 0)
	  (if dismiss-p 2 0)
	  (if end-p 4 0)
	  10))
);eval-when compile load eval

4,887,235
	295	296
(defmicro read-net-buffer (&optional (dismiss-p nil) (end-p nil))
  (let ((dev-addr (net-buffer-address nil dismiss-p end-p)))
    '(parallel (extra-time-to-drive-lbus)
	       (read-lbus-dev iob ,dev-addr)
	       ,(if dismiss-p '(dismiss)))))

(defmicro service-read-net-buffer (&optional (dismiss-p nil) (end-p nil))
  (let ((dev-addr (net-buffer-address nil dismiss-p end-p)))
    '(parallel (extra-time-to-drive-lbus)
	       (read-lbus-dev iob ,dev-addr))))

(defmicro service-net-control (&optional (dismiss-p nil) (end-p nil))
  (let ((dev-addr (net-buffer-address nil dismiss-p end-p)))
    '(parallel (write-lbus-dev iob ,dev-addr nil))))

(defmicro transmit-dma (addr &optional (dismiss-p t) (end-p nil))
  (let ((dev-addr (net-buffer-address t dismiss-p end-p)))
    '(parallel (start-memory read physical addr dma lob ,dev-addr)
	       ,(if dismiss-p '(dismiss)))))

(defmicro receive-dma (addr &optional (dismiss-p t) (end-p nil))
  (let ((dev-addr (net-buffer-address t dismiss-p end-p)))
    '(parallel (start-memory write physical ,addr dma iob ,dev-addr)
	       (assign ,addr (1+ ,addr))
	       ,(if dismiss-p '(dismiss)))))

(defmicro net-control (&optional (input-p nil) (dismiss-p t) (end-p nil))
  (let ((dev-addr (net-buffer-address nil dismiss-p end-p)))
    '(parallel ,(if input-p
		    '(for-effect (read-lbus-dev lob ,dev-addr))
		    '(write-lbus-dev iob dev-addr nil))
	       ,(if dismiss-p '(dismiss)))))

(defmicro increment (location &optional (fixnum-p t))
  (if fixnum-p
      '(assign ,location (set-type (1+ ,location) dtp-fix))
      '(assign ,location (1+ ,location))))


F:>lmach>ucode>nBITBLT.LISP.22

;;; -*- Mode:Lisp; Package:Micro; Base:8; Lowercase:yes -*-
;;; (c) Copyright 1982, Symbolics, Inc.

;;;; BITBLT microcode for 3600












;;	The pclsring theory:
;;
;; Reads can be repeated with no harmful effects, writes cannot be (in most cases).
;; State is not permanently updated until a write is consummated.
;; After every write, state should be updated so that if the next memory operation
;; faults and pclsrs, that write will not be repeated (the bitblt row will be shorter).
;; To avoid the overhead of doing this for every write, we have block mode
;; operations that only update the state after writing a block of words.
;;
;; For the block mode things, we use a buffer that can be saved. See next+1 page.
;;
;; For the short-row things, when the destination is split across two words,
;; we check write access to both words before modifyin9 either of them.
;: No pclsring problems if the operation depends on neither operand.
;;
:; When there is a partial word at the front, do it and then advance the arguments
;; so the bitblt is word aligned in the destination, When there is a partial word
;; at the end, when we get there the arguments have been advanced.

(reserve-scratchpad-memory 2460 2470 320 330)

(defmicro waiting-for-memory ()		;documentation Only
  '(nop))

(defmicro abus-array-data (&body body)
  '(parallel
    (transport data)
    (check-data-type memory-data dtp-fix)
    ,@body))

4,887,235
	297	298
(defmicro assign-vma-offset (which &rest stuff)
  (selectq which
    (S '(assign vma (+ bb-s-row-addr bb-s-offset ,@stuff)))
    (0 '(assign vma (+ bb-d-row-addr bb-d-offset ,@stuff)))
    (S-ahead '(assign vma (+ bb-s-row-addr bb-s-offset-ahead ,@stuff)))
    (otherwise
     (ferror "assign-vma-offset knows about only S and 0, not ~S which"))))

(defmicro parallel-with-s-access (offset &body body)
  (make-memory-access 'bb-s-row-addr 'bb-s-offset offset body '(read)))

(defmicro parallel-with-d-access (offset &body body)
  (make-memory-access 'bb-d-row-addr 'bb-d-offset offset body '(read)))

(defmicro parallel-with-d-access-check-write (offset &body body)
  (make-memory-access 'bb-d-row-addr 'bb-d-offset offset body (read write)))

(eval-when (eval compile load)
(defun make-memory-access (baseaddr offset-sym offset body memory-modes)
  (or (eq offset offset-sym)
      (equal offset '(1+ ,offset-sym))
      (and (eq offset-sym 'bb-s-offset) (eq offset 'bb-s-offset-ahead))
      (ferror "~S is not a recognized offset for ~S" offset offset-sym))
  (let* ((body (reverse body))
	 (finally '(abus-array-data ,(car body))))
    (do ((ll (reverse
	      '((assign vma ,(if (atom offset)
				 '(+ ,baseaddr ,offset)
			         '(+ ,baseaddr ,(second offset) 1)))
		(start-memory ,@memory-modes)
		(waiting-for-memory)))
	     (cdr ll))
	 (body (cdr body) (cdr body))
	 (l))
	((and (null ll) (null body))
	 '(sequential ,@l ,finally))
	(cond ((null ll) (push (car body) l))
	      ((null body) (push (car ll) l))
	      (T (push '(parallel ,(car ll) ,(car body)) l))))))
);eval-when

(defmicro 31- (operand)
	 '(- (b-constant 31.) ,operand))

(defmicro incr-d-offset ()
  '(assign bb-d-offset (1+ bb-d-offset)))

(defmicro decr-d-offset ()
  '(assign bb-d-offset (1- bb-d-offset)))

(defmicro incr-wrap-s-offset ()
  '(sequential
    (parallel
     (assign bb-s-offset (1+ bb-s-offset))
     (assign b-temp-3 obus))
    (if	(greater-or-equal-fixnum b-temp-3 bb-s-row-length)
	(parallel
	 (lisp (format	T "~&>>>Wrapping around on bb-s-offset from ~d."
			(low32 (tr 'bb-s-offset))))
	 (assign bb-s-offset (b-constant 0)))
      (drop-through))))

(defmicro decr-wrap-s-offset ()
  '(parallel
    (assign bb-s-offset (1- bb-s-offset))
	(if (minus-fixnum obus)
	    (parallel
	     (lisp (format t "~&>>>Decr wrapping around on bb-s-offset"))
	     (assign bb-s-offset (1- bb-s-row-length)))
	  (drop-through))))

(defmicro incr-wrap-s-offset-ahead ()
  (sequential
   (parallel
    (assign bb-s-offset-ahead (1+ bb-s-offset))
    (assign b-temp-3 obus))
   (if (greater-or-equal-fixnum b-temp-3 bb-s-row-length)
       (parallel
	(lisp (format T "~&>>>Wrapping around on bb-s-offset from ~d"
		      (low32 (tr bb-s-offset-ahead))))
	(assign bb-s-offset-ahead (b-constant 0)))
     (drop-through))))

(defmicro decr-wrap-s-offset-ahead ()
  '(parallel
    (assign bb-s-offset-ahead (1- bb-s-offset))
    (if (minus-fixnum obus)
	(parallel
	 (lisp (format t "~&~>>>Decr wrapping around on bb-s-offset"))
	 (assign bb-s-offset-ahead (1- bb-s-row-length)))
      (drop-through))))

4,887,235
	299	300
(defmicro store-word (datum &rest options)
  '(store-contents (set-type ,datum dtp-fix) not-pointer . ,options))

;;---the goddamn simulator compiles
;;    (parallel (assign ...) (return))
;; into
;;    (prog ... (return nil) (setq ...))
(defmicro parallel-with-return (&body stm)
  '(,(if (eq *machine-version* 'sim) 'sequential 'parallel)
    ,@stm
    (return)))

(defmicro via-xbus (source)
  (make-microdata 'xbus (get-to-xbus source)))

(defvar *fp-offset-names* ())

(defmacro def-fp-offsets (&rest names)
  (loop for i upfrom 0
	for name in names
	append '((defatomicro ,name (amem (frame-pointer ,i)))
		 (defprop ,name ,i fp-offset)
		 (or (memq ',name *fp-offset-names*)
		     (push ',name *fp-offset-names*)))
	into foo
	finally (return `(progn 'compile ,@foo))))

;; decode fp offset numbers into symbols. Debugging only.
(defun dfp (&rest numbers)
  (loop for number in numbers
	collect (loop for name in *fp-offset-names*
		      when (equal (get name 'fp-offset) number)
		      return name
		      finally (return number))))

;; Define arguments/state for BITBLT instructions. Note that these must be
;: relative to FP, not to the top of the stack, since there might be a
;; saved bitblt-buffer on the stack if the instruction was interrupted.
(def-fp-offsets
  bb-arg-alu bb-arg-width bb-arg-height		;lisp arg
  bb-arg-from-array bb-arg-from-x bb-arg-from-y ; lisp arg
  bb-arq-to-array bb-arg-to-x bb-arg-to-y	;lisp arg
  bb-width					;ucode arg
  bb-s-data-addr				;ucode arg
  bb-s-row-offset				;ucooe arg
  bb-s-offset					;ucode arg
  bb-s-bitpos					;ucode arg
  bb-s-row-length				;ucode arg
  bb-d-data-addr				;ucode arg
  bb-d-offset					;ucode arg
  bb-d-bitpos					;ucode arg
  bb-event-count				;ucode arg
  bb-alu-operation				;ucode arg
  )

;;; Some temporaries.

(define-b-temps bb-constant		;Value to store or to XOR in
  		bb-s-word		;temp (source word)
		bb-s-row-addr		;start of current source row
		bb-d-row-addr		;start of current destination row
		bb-width-b		;copy of width on B side (sometimes)
		b-block-size)		;number of words in block

(defareg bb-constant-a)			;A-side copy of bb-constant
(defareg bb-identity)			;Background to dpb into when doing part word
(defareg bb-s-word2)			;temp (other source word)
(defareg bb-a-temp)
(defareg bb-s-offset-ahead)		;a-offset not finalized yet (if pclsr)
(defareg a-block-size)			;number of words in block


;;; Bitblt-buffer hair

(eval-when (compile load eval)
(defconst n-bitblt-buffers 8))

;XXXbrad backquote?
#.'(progn 'compile			;B-memory buffer for block-mode operations
     . ,(loop for i from 0 below n-bitblt-buffers
	      collect '(deltreg ,(fintern "BITBLT-BUFFER-~D" i))))

(defmicro bitblt-buffer (i)
  (fintern "BITBLT-BUFFER-~D" i))

;--- this defareg goes in some other file ---
;If this register is non-zero and we pclsr, save-bitblt-buffer must be
;called after restoring the stack pointer.
(defareg bitblt-buffer-active 0)
4,887,235
	301	302
;We first compute the result n words at a time into the bitblt-buffer,
;and then store it into the destination (in one case the whole buffer
;is rotated by 1 to 31 bits as it is being stored).
;The bitblt-buffer is "active" while we are storing it into the destination.
;The bitbit buffer must be active while we are modifying the destination,
;since the words copied into the buffer might overlapped with parts of
;the destination we have already modified.
;
;A pclsr while the bitblt-buffer is active will copy it into
;the stack, set first-part-done, and clear bitblt-buffer-active.
;A restart with first-part-done set will proceed normally until it comes time
;to store the bitblt-buffer. At that time, first-part-done is seen, the
;bitblt-buffer is restored from the stack (replacing the possibly-erroneous
;contents that were just computed), and execution then proceeds normally.
;
;The contents of the bitblt-buffer are assumed to have valid data type tags.
;For now, they could be forced to fixnum, but in the future we may have
;other instructions using this buffer and its save/restore mechanism.
;--- Still need to fix microcompiler to default cdr source from Bbus correctly ---

;Call here if we pclsr with the bitblt-buffer active
(defucode save-bitblt-buffer
  #.'(sequential .
       ,(loop for i from 0 below n-bitblt-buffers
	      collect '(pushval-with-cdr (bitblt-buffer ,i))))
  (assign first-part-done (b-constant 1))
  (parallel
   (assign bitblt-buffer-active (b-constant 0))
   (return)))

;Call here when about to start storing the bitblt-buffer
;This is actually a micro so that the first instruction of the routine
;gets open-coded into the caller
;This is hairiiy bummed to make the normal case go in only one cycle
;(if the trap is not taken then the obus has -1 on it)
(defmicro activate-bitblt-buffer ()
  '(parallel
    (assign bitblt-buffer-active obus)
    (trap-if (bit-test frame-misc-data (b-constant (byte-mask first-part-done)))
	     activate-saved-bitblt-buffer)))

;We also need this closed-subroutine version
(defucode activate-bitblt-buffer
  (parallel
   (activate-bitblt-buffer)
   (return)))

(defucode activate-saved-bitblt-buffer
  (parallel
   (trap-save)
   #.'(sequential				;Retry the assign,trap-if upon return
       ,@(loop for i from (1- n-bitblt-buffers) downto 0
	       collect '(parallel
			 (assign (bitblt-buffer ,i) top-of-stack-a)
			 (decrement-stack-pointer)))))
  (parallel
   (assign first-part-done (b-constant 0))
   (return)))

;Call here when done storing the bitblt-buffer
(defucode deactivate-bitblt-buffer
  (parallel
   (assign bitblt-buffer-active (b-constant 0))
   (assign top-of-stack top-of-stack-a)		;Could hove been bashed by activate...
   (return)))

(defmicro read-bb-s-word ()
  '(parallel
    (assign a-temp (+ bb-width-b bb-s-bitpos))
    (call read-bb-s-word1)))

;a-temp has the number of in bits needed relative to bit 0 of the first word
(defucode read-bb-s-word1
;XXXbrad s?
  (assign-vma-offset s)
  (parallel
   (assign byte-r (32- bb-s-bitpos))
   (start-memory read))
  (parallel
   (waiting-for-memory)
   (if (lesser-or-equal-fixnum a-temp (b-constant 32.))
       ;;source is entrely within one word
       (parallel-with-return
	(abus-array-data
	 (assign bb-s-word (logxor bb-constant (rotate memory-data byte-r)))))
     ;;source is split across two words
     (sequential
      (abus-array-data
       (assign bb-s-word (rotate memory-data byte-r)))
      (incr-wrap-s-offset-ahead)
      (assign-vma-offset s-ahead)
4,887,235
	303	304
	(parallel
	 (start-memory read)	;byte-r is already Ok
	 )
	(parallel
	 (waiting-for-memory)
	 (assign byte-s (1- a-temp)))
	(abus-array-data
	 (assign bb-s-word (dpb memory-data byte-s byte-r bb-s-word)))
	(parallel-with-return
	 (assign bb-s-word (logxor bb-s-word bb-constant-a)))))))

;;Assumptions about setup:
;;bb-constant has:
;;  >> for constant cperations (0,-1): the constant;
;;  >> for operations dependent only on source or destination (x, ~x, y, ~y)
;;     a 0 for x,y or -1 for ~x,~y;
;;  >> for operations dependent on both a and d: 0 for those using source directly,
;;     and -1 for those that want the source complementod.

(defucode bb-copy-stuff-to-b-side
  (assign bb-s-row-addr (+ bb-s-data-addr b-temp))
  (parallel-with-return
   (assign bb-d-row-addr bb-d-data-addr)))

(defmacro definst-bitblt (name source destination neither both)
  '(definst ,name no-operand
     (parallel (assign b-temp bb-s-row-offset)
	       (call bb-copy-stuff-to-b-side))
     (dispatch-after-this (parallel (ldb bb-alu-operation 4 0)
				    ;; Set up constant needed for the most common case
				    (assign bb-constant (via-xbus (b-constant 0)))
				    (assign bb-constant-a (via-xbus (b-constant 0))))
			  (assign bb-width-b bb-width)
       ((0)			;0
	(goto ,neither))
       ((1)			;x*y
	(parallel (assign bb-identity (a-constant -1))
		  (jump ,both)))
       ((2)			;~x*y
	(assign bb-identity (a-constant -1))
	(parallel (assign bb-constant (a-constant -1)) (assign bb-constant-a (a-constant -1))
		  (jump ,both)))
       ((3) (return))		;y
       ((4)			;x*~y
	(parallel (assign bb-identity (a-constant -1))
		  (jump ,both)))
       ((5) (goto ,source))
       ((6)			;x xor y
	(parallel (assign bb-identity (a-constant 0))
		  (jump ,both)))
       ((7)			;x+y
	(parallel (assign bb-identity (a-constant 0))
		  (jump ,both)))
       ((8.)			;~x*~y
	(assign bb-identity (a-constant -1))
	(parallel (assign bb-constant (a-constant -1)) (assign bt-constant-a (a-constant -1))
		  (jump ,both)))
       ((9.)			;~x xor y
	(assign bb-identity (a-constant 0))
	(parallel (assign bb-constant (a-constant -1)) (assign bb-constant-a (a-constant -1))
		  (jump ,both)))
       ((10.)			;~x
	(parallel (assign bb-constant (a-constant -1)) (assign bb-constant-a (a-constant -1))
		  (jump ,source)))
       ((11.)			;~x+y
	(assign bb-identity (a-constant 0))
	(parallel (assign bb-constant (a-constant -1)) (assign bb-constant-a (a-constant -1))
		  (jump ,both)))
       ((12.)			;~y
	(parallel (assign bb-constant (a-constant -1)) (assign bb-constant-a (a-constant -1))
		  (jump ,destination)))
       ((13.)			;x+~y actually ~(~x*y)
	(assign bb-identity (a-constant -1))
	(parallel (assign bb-constant (a-constant -1)) (assign bb-constant-a (a-constant -1))
		  (jump ,both)))
       ((14.)			;~x+~y actually ~(x*y)
	(parallel (assign bb-identity (a-constant -1))
		  (jump ,both)))
       ((15.)			;-1
	(parallel (assign bb-constant (a-constant -1)) (assign bb-constant-a (a-constant -1))
		  (jump ,neither))))))

(definst-bitblt %bitblt-short-row
  ubitblt-short-row-source
  ubitblt-short-row-destination
  ubitblt-short-row-neither
  ubitblt-short-row-both)
4,887,235
	305	306

(definst-bitblt %bitblt-long-row
  ubitblt-long-row-source
  ubitblt-long-row-destination
  ubitblt-long-row-neither
  ubitblt-long-row-both)

(definst-bitblt %bitblt-long-row-backwards
  ubitblt-long-row-source-backwards
  ubitblt-long-row-destination			;direction immaterial
  ubitblt-long-row-neither
  ubitblt-long-row-both-backwards)

(defucode ubitblt-short-row-source
  (read-bb-s-word)
  (assign a-temp (+ bb-width-b bb-d-bitpos))
  (parallel
   (assign byte-s (- a-temp (b-constant 32.) 1))
   (if (lesser-or-equal-fixnum-unsigned a-temp (b-constant 32.))
       ;; destination is entirely within one word
       (parallel-with-d-access bb-d-offset
	 (assign byte-s (1- bb-width))
	 (assign byte-r bb-d-bitpos)
	 (parallel-with-return
	   (store-word (dpb bb-s-word byte-s byte-r memory-data))))
     :; destination is split across two words
     ;; must access-check them both before modifying either
     (sequential
      ;; compute the high byte
      (parallel-with-d-access-check-write (1+ bb-d-offset)
	(assign byte-r bb-d-bitpos)
	(assign a-temp (ldb bb-s-word byte-s byte-r memory-data)))
      ;; compute and store the low byte
      (parallel-with-d-access bb-d-offset
	(assign byte-s (31- bb-d-bitpos))
	(store-word (dpb bb-s-word byte-s byte-r memory-data) block))
      ;; now store the high byte. This cannot fault
      (parallel-with-return
       (store-word a-temp block))))))

(defucode ubitblt-short-row-destination
  (assign a-temp (+ bb-width-b bb-d-bitpos))
  (parallel
   (assign byte-s (- a-temp (b-constant 32.) 1))
   (if (lesser-or-equal-fixnum-unsigned a-temp (b-constant 32.))
       ;; destination is entirely within one word
       (parallel-with-d-access bb-d-offset
         (assign byte-s (1- bb-width))
	 (assign byte-r bb-d-bitpos)
	 (parallel-with-return
	  (store-word (logxor (dpb bb-constant byte-s byte-r 0) memory-data))))
     ;; destination is split across two words
     ;; must access-check them both before modifying either
     (sequential
      ;; compute the hich byte
      (parallel-with-d-access-check-write (1+ bb-d-offset)
	(assign type-r (a-constant 0))
	(assign a-temp (logxor (ldb bb-constant byte-s byte-r) memory-data)))
      ;; compute and store the low byte
      (parallel-with-d-access bb-d-offset
	(assign byte-s (31- bb-d-bitpos))
	(assign byte-r bb-d-bitpos)
	(store-word (logxor (dpb bb-constant byte-s byte-r 0) memory-data) block))
      ;; now store the high byte. This cannot fault
      (parallel-with-return
       (store-word a-temp block))))))

;; The alu operation is actually a constant
(defucode ubitblt-short-row-neither
  (assign a-temp (+ bb-width-b bb-d-bitpos))
  (if (lesser-or-equal-fixnum a-temp (b-constant 32.))
      ;; destination is entirely within one word
      (parallel-with-d-access bb-d-offset
        (assign byte-s (1- bb-width))
	(assign byte-r bt-d-bitpos)
	(parallel-with-return
	 (store-word (dpb bb-constant byte-s byte-r memory-data))))
    ;; destination is split across two words, but no pclsr problems since doing
    ;; the operation twice produces the same effect
    (sequential
     ;; store the low byte
     (parallel-with-d-access bb-d-offset
       (assign byte-s (31- bb-d-bitpos))
       (assign byte-r bb-d-bitpos)
       (store-word (dpb bb-constant byte-s byte-r memory-data)))
     ;; store the high byte
     (parallel-with-d-access (1+ bb-d-offset)
       (assign byte-s (1- a-temp))
       (assign byte-r (a-constant 0))
       (parallel-with-return
	(store-word (dpb bb-constant byte-s byte-r memory-data)))))))

4,887,235
	307	308
;; The alu operation depends upon both source and destination bits
(defucode ubitblt-short-row-both
  (read-bb-s-word)
  (assign a-temp (+ bb-width-b bb-d-bitpos))
  (if (lesser-or-equal-fixnum a-temp (b-constant 32.))
      ;; destination is entirely within one word
      (sequential
       (assign byte-s (1- bb-width))
       (assign byte-r bb-d-bitpos)
       (parallel
	(assign-vma-offset d)
	(jump bb-byte-alu-operation-dispatch)))		;jcall
    ;; destination is split across two words
    (sequential
      ;; make sure we have write access to the high byte so no pclsr after storing low
      (assign-vma-offset d 1)
      (start-memory read write)
      ;; store the low byte
      (assign byte-s (31- bb-d-bitpos))
      (assign byte-r bb-d-bitpos)
      (parallel
        (assign-vma-offset d)
	(call bb-byte-alu-operation-dispatch))
      ;; store the high byte
      (assign bb-s-word (rotate bb-s-word byte-r))
      (assign byte-s (1- a-temp))
      (assign byte-r (b-constant 0))
      (parallel
        (assign-vma-offset d 1)
	(jump bb-byte-alu-operation-dispatch)))))		;jcall

;(boole	fn x y ...)   if fn is "abcd" then
;          y    0	1	2	3	4	5	6	7
;       | 0  1  0	x*y	~x*y	y	x*~y	x	x#y	x+y
;   ----------
;     0 | a  c  8	9	l0	11	12	13	14	15
;   x   |	~(x+y)	~(x#y)	~x	~x+y	~y	x+~y	~x+~y	-1
;     1 | b  d

;;vma and byte regs have been set up already, for DPB.
;;trashes a-temp-2, b-temp-2, b-temp-3, but not a-temp and b-temp.
(defucode bb-byte-alu-operation-dispatch
  (dispatch-after-this (parallel (start-memory read) (ldb bb-alu-operation 4 0))
		       (parallel
			(assign b-temp-3 (dpb bb-s-word byte-s byte-r bb-identity))
			(waiting-for-memory))
     ((1 2) ;;l x*y logand ;l2 ~x*y	logand
      (parallel-with-return
       (parallel
	(declare-memory-timing data-cycle)
	(abus-array-data
	  (store-word (logand memory-data b-temp-3))))))
     ((4 8.) ;;4 ~(~x+y) = x*~y andc2 ;;8 ~(x+y)	= ~xx*~y andcb
      (parallel
       (declare-memory-timing data-cycle)
       (abus-array-data
	(assign a-temp-2 memory-data)))
      (assign b-temp-2 (dpb (b-constant -1) byte-s byte-r 0))	;can't merge this...
      (assign a-temp-2 (logxor a-temp-2 b-temp-2))		;...with this.
      (parallel-with-return
       (store-word (logand a-temp-2 b-temp-3))))
     ((6 9.) ;;6 x#y logxor ;;9 ~(x#y)=~x#y logxor
      (parallel-with-return
       (parallel
	(declare-memory-timing data-cycle)
	(abus-array-data
	 (store-word (logxor b-temp-3 memory-data))))))
     ((7 11.) ;;7 x+y logior	;;11 ~x+y logior
      (parallel-with-return
       (parallel
	(declare-memory-timing data-cycle)
	(abus-array-data
	 (store-word (logior b-temp-3 memory-data))))))
     ((13. 14.) ;;13 x+~y = ~(~x*y) lognand	;;14 ~x+~y=~(x*y)
       (parallel
	(declare-memory-timing data-cycle)
	(abus-array-data
	 (assign a-temp-2 (logand b-temp-3 memory-data))))
       (parallel-with-return
	(store-word (logxor (dpb (b-constant -1) byte-s byte-r 0) a-temp-2))))))

;;vma has been set up already
(defucode bt-word-alu-operation-dispatch	;commonly 3 cycles (plus 1 for the call)
  (dispatch-after-this (parallel (start-memory read) (ldb bt-alu-operation 4 0))
		       (waiting-for-memory)	;---want to use this somehow...
    ((1 2)	;;1 x*y logand		;;2 ~x:y logand
     (parallel
      (declare-memory-timing data-cycle)
      (abus-array-data (store-word (logand bb-s-word memory-data)))
      (return)))
4,887,235
	309	310
    ((4 8.)	;;4 x*~y andcb		;;8 ~(x+y) ~x*~y andcb
     (parallel
      (declare-memory-timing data-cycle)
      (abus-array-data (store-word (anac2 bb-s-word memory-data)))
      (return)))
    ((6 9.)	;;6 x#y logxor		;;9 ~(x#y)=~x#ylogxor
     (parallel
      (declare-memory-timing data-cycle)
      (abus-array-data (store-word (logxor bb-s-word memory-data)))
      (return)))
    ((7 11.)	;;7 x+y logior		;;1l ~x+y logior
     (parallel
      (declare-memory-timing data-cycle)
      (abus-array-data (store-word (logior bb-s-word memory-data)))
      (return)))
    ((13. 14.) ;;13 x+~y = ~(~x*y)	;;14 ~x+~y=~(x*y)
     (parallel
      (declare-memory-timing data-cycle)
      (abus-array-data (store-word (lognand bb-s-word memory-data)))
      (return)))))

;;alu depends only on source bits
(defucode ubitblt-long-row-source
  (parallel
   (assign b-temp bb-d-bitpos)
   (if (zero-fixnum bb-d-bitpos)
       (if (zero-fixnum bb-s-bitpos)
	   (goto ubitblt-aligned-row-source)
	   ;;             SSSSSSSSSSSSSSSSSSSSSSSSSsssssss
	   ;;      dddddddddddddddddddddddddddddddd
	 (parallel-with-s-access bb-s-offset
	   (assign byte-r (32- bb-s-bitpos))
	   (parallel
	    (assign bb-s-word2 (logxor bb-constant (rotate memory-data byte-r)))
	    (lisp (trace-path #/c))
	    (jump ubitblt-d-aligned-row-source))))
     (if (equal-fixnum b-temp bb-s-bitpos)
	   ;;SSSSSSSSSSSSSSSSSSSSSSSSSsssssss
	   ;;DDDDDDDDDDDDDDDDDDDDDDDDDddddddd
	 (sequential
	  (parallel-with-s-access bb-s-offset
	    (assign b-temp (32- bt-d-bitpos))
	    (assign byte-r b-temp)
	    (assign bb-s-word (logxor bb-constant (rotate memory-data tyte-r))))
	  (parallel-with-d-access bb-d-offset
	    (assign byte-r bb-d-bitpos)
	    (assign byte-s (1- t-temp))
	    (store-word (dpb bb-s-word byte-s byte-r memory-data)))
	  ;;First partial word done, we are now the aligned case
	  (incr-wrap-s-offset)
	  (incr-d-offset)
	  (assign bb-width (- bb-width b-temp))
	  (assign bb-s-bitpos (b-constant 0))
	  (parallel
	    (assign bb-d-bitpos (b-constant 0))
	    (lisp (trace-path #/b))
	    (jump ubitbit-aligned-row-source)))
       (if (lesser-fixnum bb-s-bitpos b-temp)
	   ;;sssssssssSSSSSSSSSSSSSSS........
	   ;;         DDDDDDDDDDDDDDDdddddddddddddddd
	   ;;         <- 32-d.bitpos ->
	   (sequential
	    (parallel-with-s-access bb-s-offset
	      (assign byte-r (32- bb-s-bitpos))
	      (assign b-temp (32- bb-d-bitpos))
	      (assign bb-s-word (logxor bb-constant (rotate memory-data byte-r))))
            ;;.......sssssssssSSSSSSSSSSSSSSSS
	    (parallel-with-d-access bb-d-offset
	      (assign byte-r bb-d-bitpos)
	      (assign byte-s (1- b-temp))
	      (store-word (dpb bb-s-word byte-s byte-r memory-data)))
	    ;; First partial B word done, some S bits from first word remain
	    (incr-d-offset)
	    ;;rotate s-word further to right by 32-d.bitpos = left by -(32-d.bitpos)
	    ;;SSSSSSSSSSSSSSSS.......sssssssss
	    (assign bb-s-word2 (rotate bb-s-word byte-r))
	    (assign bb-s-bitpos (+ bb-s-bitpos b-temp))
	    (assign bb-width (- bb-width b-temp))
	    (parallel
	     (assign bb-d-bitpos (b-constant 0)))
	    (lisp (trace-path #/d))
	    (jump ubitblt-d-aligned-row-source))
	 (sequential
;;The high part of the first source word is not as long as the high part of the
;;first destination word. So extract the useful part of the firs t source word.
;:and deposit into it as much of the cecond source word as needed to fill Out the rest
;;of the first destination word. Then position the rest of the second source word
;;appropriately for the inner loop.
4,887,235
	311	312
;;                                       <- 32-s ->
;;      ................................|SSSSSSSSSSssssssssssssssssssssss
;;                      DDDDDDDDDDDDDDDD DDDDDDDDDDdddddd
(parallel-with-s-access bb-s-offset
  (assign byte-r (32- bb-s-bitpos))
  (assign b-temp-2 bb-s-bitpos)
  (assign bb-s-word (logxor bb-constant (rotate memory-data tyte-r))))
(incr-wrap-s-offset-ahead)
;;	                <----- s-d ----> <- 32-s ->    (32-d)-(32-s)-s-d
;;      ssssssssssssssssSSSSSSSSSSSSSSSS|1111111111......................
;;                      DDDDDDDDDDDDDDDD DDDDDDDDDDdddddd
(parallel-with-s-access bb-s-offset-ahead
  (assign byte-r (32- bb-s-bitpos))
  (assign byte-s (- b-temp-2 bt-d-bitpos 1))
  (assign bb-s-word2 (logxor bb-constant memory-data)))
(assign bb-s-word (dpb bb-s-word2 byte-s byte-r bb-s-word))
(parallel
  (assign a-temp (32- bt-d-bitpos))
  (assign b-temp obus))
  (parallel-with-d-access bb-d-offset
    (assign byte-r bb-d-bitpos)
    (assign byte-s (1- a-temp))
    (store-word (dpb bb-s-word byte-s byte-r memory-data)))
  ;; We have now done the first partial D word. Turn into the d-aligned
  ;; case, with the source advanced by one word from where it started.
  (incr-d-offset)
  (assign bb-s-offset bb-s-offset-ahead)
  (assign bb-s-bitpos (- b-temp-2 bt-d-bitpos))
  (assign byte-r (32- bb-s-bitpos))
  (assign bb-s-word2 (rotate bb-s-word2 byte-r))
  (assign bb-width (- bb-width t-temp))
  (parallel
   (assign bb-d-bitpos (b-constant 0))
   (lisp (trace-patn #/e))
   (jump ubitblt-d-aligned-row-source))))))))

(defucode ubitblt-aligned-row-source	;28 cycles per 8 words
  (if (greater-or-equal-fixnum bb-width (b-constant (* 8. 32.)))
      ;;Fetch a block of words into the buffer
      (sequential
       (assign b-temp (+ bb-s-offset (b-constant 8.)))
       (if (lesser-fixnum bb-s-row-length b-temp)
	   (goto ubitblt-aligned-row-source-slow-loop)
	 (sequential
	  (parallel
	   (assign-vma-offset s)
	   (call ubitblt-block-read-8))
	  (parallel
	   (assign-vma-offset d)
	   (call ubitblt-block-write-8))
	  (parallel
	   (assign bb-s-offset (+ bb-s-offset b-block-size))
	   (jump ubitblt-aligned-row-source)))))
    ;;Frob with whats left. Too bad dispatch blocks are expensive.
    (if (greater-or-equal-fixnum bb-width (b-constant (* 4 32.)))
	(sequential
	 (assign b-temp (+ bb-s-offset (b-constant 4)))
	 (if (lesser-fixnum bb-s-row-length b-temp)
	     (goto ubitblt-aligned-row-source-slow-loop)
	   (sequential
	    (parallel
	     (assign-vma-offset s)
	     (call ubitblt-block-read-4))
	    (parallel
	     (assign-vma-offset d)
	     (call bitblt-block-write-4))
	    (parallel
	     (assign bb-s-offset (+ bb-s-offset b-block-size))
	     (jump ubitblt-aligned-row-source-slow-loop)))))
      (goto ubitblt-aligned-row-source-slow-loop))))

(defucode ubitblt-aligned-row-source-slow-loop		;10 cycles per word
  (parallel-with-s-access bb-s-offset
    (trap-if (lesser-fixnum bb-width (b-constant 32.))
	     ubitblt-aligned-row-source-slow-loop-done)
    (waiting-for-memory)
  (assign bb-s-word (logxor bb-constant memory-data)))
(assign-vma-offset d)					;1
(store-word bb-s-word)					;1
(assign bb-width (- bb-width (b-constant 32.)))		;1
(incr-wrap-s-offset)					;2
(parallel						;1
 (incr-d-offset)
 (lisp (trace-path #/,))
 (jump ubitblt-aligned-row-source-slow-loop)))

;Do last partial word, if any
(defucode ubitblt-aligned-row-source-slow-loop-done
  (if (plus-fixnum bb-width)
      (sequential