asm {
//************************************
//See ::/Doc/Credits.DD.
_MALLOC::
// Throws 'OutMem'
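//Overview: SF_ARG1=byte count, SF_ARG2=CTask*, CHeapCtrl*, or NULL (NULL=current task).
//The request is rounded up to a multiple of 8 plus CMemUsed.start header bytes.
//Rounded sizes below MEM_HEAP_HASH_SIZE come from the exact-size heap_hash buckets,
//falling back to splitting a chunk from malloc_free_lst; bigger requests take whole
//pages from MemPagTaskAlloc.  The heap is locked via HClf_LOCKED with interrupts off.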
        PUSH    RBP
        MOV     RBP,RSP
        PUSH    RSI
        PUSH    RDI

        XOR     RBX,RBX
        MOV     RDX,U64 SF_ARG2[RBP]
        TEST    RDX,RDX
        JNZ     @@05
        MOV     RDX,U64 FS:CTask.addr[RBX]
@@05:   CMP     U32 CTask.task_signature[RDX],TASK_SIGNATURE_VAL

#assert CTask.task_signature==CHeapCtrl.hc_signature //signature field at same offset in both structs

        JNE     @@10
        MOV     RDX,U64 CTask.data_heap[RDX]
@@10:   CMP     U32 CHeapCtrl.hc_signature[RDX],HEAP_CTRL_SIGNATURE_VAL
        JE      @@15
        PUSH    RDX
        CALL    &SysBadMAlloc
        JMP     I32 _SYS_HLT

@@15:   MOV     RAX,U64 SF_ARG1[RBP]
        PUSHFD
        ADD     RAX,CMemUsed.start+7    //round-up to I64
        AND     AL,0xF8
#assert CMemUsed.start>=sizeof(CMemUnused)
        CMP     RAX,CMemUsed.start
        JAE     @@20
        MOV     RAX,CMemUsed.start
@@20:

        CLI
@@25:   LOCK
        BTS     U32 CHeapCtrl.locked_flags[RDX],HClf_LOCKED
        PAUSE   //spin-wait hint; also runs on the success path, which is harmless
        JC      @@25

        CMP     RAX,MEM_HEAP_HASH_SIZE
        JAE     @@30
        MOV     RSI,U64 CHeapCtrl.heap_hash[RAX+RDX]
        TEST    RSI,RSI
        JZ      @@35
        MOV     RCX,U64 CMemUnused.next[RSI]
        MOV     U64 CHeapCtrl.heap_hash[RAX+RDX],RCX
        JMP     I32 MALLOC_ALMOST_DONE

//Big allocation
@@30:   ADD     RAX,sizeof(CMemBlk)+MEM_PAG_SIZE-1
        SHR     RAX,MEM_PAG_BITS

        PUSH    RDX //preserve HeapCtrl
        PUSH    RDX
        PUSH    RAX
        CALL    &MemPagTaskAlloc
        POP     RDX
        TEST    RAX,RAX
        JZ      @@45    //Out of memory
        MOV     RSI,RAX
        MOV     EAX,U32 CMemBlk.pags[RSI]

        SHL     RAX,MEM_PAG_BITS
        SUB     RAX,sizeof(CMemBlk)
        ADD     RSI,sizeof(CMemBlk)
        JMP     I32 MALLOC_ALMOST_DONE

//Little allocation: chunk off a piece from a free-list chunk.
//Bias RSI so CMemUnused.next[RSI] addresses CHeapCtrl.malloc_free_lst,
//letting the list head be walked as if it were a node.
@@35:   LEA     RSI,U64 CHeapCtrl.malloc_free_lst-CMemUnused.next[RDX]

@@40:   MOV     RBX,RSI
        MOV     RSI,U64 CMemUnused.next[RBX]
        TEST    RSI,RSI
        JNZ     I32 @@60
        PUSH    RAX             //-**** save byte size
        ADD     RAX,16*MEM_PAG_SIZE-1
        SHR     RAX,MEM_PAG_BITS

        PUSH    RDX //preserve HeapCtrl
        PUSH    RDX
        PUSH    RAX
        CALL    &MemPagTaskAlloc
        POP     RDX
        TEST    RAX,RAX
        JNZ     @@50

//Out of memory
@@45:   LOCK
        BTR     U32 CHeapCtrl.locked_flags[RDX],HClf_LOCKED
        POPFD
        PUSH    TRUE
        MOV     RAX,'OutMem'
        PUSH    RAX
        CALL    I32 &throw
        JMP     I32 MALLOC_FINAL_EXIT //Not reached; throw() does not return here.

@@50:   MOV     RSI,RAX
        MOV     EAX,U32 CMemBlk.pags[RSI]
        SHL     RAX,MEM_PAG_BITS

//Can it be combined with last chunk? (Never Free these chunks.)
        MOV     RDI,U64 CHeapCtrl.last_mergable[RDX]
        LEA     RBX,U64 [RSI+RAX]
        CMP     RDI,RBX
        JNE     @@55

        PUSH    RAX
        MOV     EAX,U32 CMemBlk.pags[RDI]
        ADD     U32 CMemBlk.pags[RSI],EAX
//QueRem
        MOV     RAX,U64 CMemBlk.next[RDI]
        MOV     RBX,U64 CMemBlk.last[RDI]
        MOV     U64 CMemBlk.last[RAX],RBX
        MOV     U64 CMemBlk.next[RBX],RAX
        POP     RAX

@@55:   MOV     U64 CHeapCtrl.last_mergable[RDX],RSI
        LEA     RSI,U64 sizeof(CMemBlk)[RSI]
        SUB     RAX,sizeof(CMemBlk)
        LEA     RBX,U64 CHeapCtrl.malloc_free_lst-CMemUnused.next[RDX]
        MOV     RDI,U64 CMemUnused.next[RBX]
        MOV     U64 CMemUnused.next[RSI],RDI
        MOV     U64 CMemUnused.size[RSI],RAX
        MOV     U64 CMemUnused.next[RBX],RSI
        POP     RAX             //+****
        JMP     @@70
@@60:   CMP     U64 CMemUnused.size[RSI],RAX
        JB      I32 @@40
        JNE     @@70

@@65:   MOV     RDI,U64 CMemUnused.next[RSI]
        MOV     U64 CMemUnused.next[RBX],RDI
        JMP     MALLOC_ALMOST_DONE

@@70:   SUB     U64 CMemUnused.size[RSI],RAX    //UPDATE FREE ENTRY
        CMP     U64 CMemUnused.size[RSI],sizeof(CMemUnused)
        JAE     @@75                    //take from top of block
        ADD     U64 CMemUnused.size[RSI],RAX    //doesn't fit, undo
        JMP     I32 @@40

@@75:   ADD     RSI,U64 CMemUnused.size[RSI]

MALLOC_ALMOST_DONE:
//RSI=res-CMemUsed.start
//RAX=size+CMemUsed.start
//RDX=HeapCtrl
        ADD     U64 CHeapCtrl.used_u8s[RDX],RAX

#if _CFG_HEAP_DBG
//QueIns
        MOV     RDI,U64 CHeapCtrl.last_um[RDX]
        MOV     U64 CMemUsed.next[RDI],RSI
        MOV     U64 CHeapCtrl.last_um[RDX],RSI
        MOV     U64 CMemUsed.last[RSI],RDI
        LEA     RDI,U64 CHeapCtrl.next_um-CMemUsed.next[RDX]
        MOV     U64 CMemUsed.next[RSI],RDI

//Caller1/Caller2
        PUSH    RDX
        MOV     RDX,U64 [MEM_HEAP_LIMIT]
        MOV     RDI,U64 SF_RIP[RBP]
        CMP     RDI,RDX
        JB      @@80
        XOR     RDI,RDI
        MOV     U64 CMemUsed.caller1[RSI],RDI
        JMP     @@90
@@80:   MOV     U64 CMemUsed.caller1[RSI],RDI
        MOV     RDI,U64 SF_RBP[RBP]
        CMP     RDI,RDX
        JB      @@85
        XOR     RDI,RDI
        JMP     @@90
@@85:   MOV     RDI,U64 SF_RIP[RDI]
        CMP     RDI,RDX
        JB      @@90
        XOR     RDI,RDI
@@90:   MOV     U64 CMemUsed.caller2[RSI],RDI
        POP     RDX

#endif
        LOCK
        BTR     U32 CHeapCtrl.locked_flags[RDX],HClf_LOCKED
        POPFD

        MOV     U64 CMemUsed.size[RSI],RAX
        MOV     U64 CMemUsed.hc[RSI],RDX
        LEA     RAX,U64 CMemUsed.start[RSI]

        TEST    U8 [SYS_SEMAS+SEMA_HEAPLOG_ACTIVE*DFT_CACHE_LINE_WIDTH],1
        JZ      @@105
        PUSH    RAX
        PUSH    RAX
        MOV     RAX,U64 [SYS_EXTERN_TABLE]
        MOV     RAX,U64 EXT_HEAPLOG_MALLOC*8[RAX]
        TEST    RAX,RAX
        JZ      @@95
        CALL    RAX
        JMP     @@100
@@95:   ADD     RSP,8
@@100:  POP     RAX

@@105:  TEST    U8 [SYS_HEAP_INIT_FLAG],1
        JZ      MALLOC_FINAL_EXIT

        PUSH    RAX
        MOV     RCX,U64 CMemUsed.size-CMemUsed.start[RAX]
        SUB     RCX,CMemUsed.start
        MOV     RDI,RAX
        MOV     AL,U8 [SYS_HEAP_INIT_VAL]
        REP_STOSB
        POP     RAX

MALLOC_FINAL_EXIT:
        POP     RDI
        POP     RSI
        POP     RBP
        RET1    16
//************************************
_FREE::
//Be aware of heap_hash in MemPagTaskAlloc().
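//Overview: a NULL arg is a no-op.  A negative CMemUsed.size marks an aligned
//allocation and is the offset back to the real chunk (see MAllocAligned below).
//Small chunks are pushed back onto their heap_hash bucket; page-sized chunks
//are returned with MemPagTaskFree.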
        PUSH    RBP
        MOV     RBP,RSP
        PUSH    RSI
        PUSH    RDI

        TEST    U8 [SYS_SEMAS+SEMA_HEAPLOG_ACTIVE*DFT_CACHE_LINE_WIDTH],1
        JZ      @@15
        MOV     RBX,U64 SF_ARG1[RBP]
        TEST    RBX,RBX
        JZ      @@05
        MOV     RAX,U64 CMemUsed.size-CMemUsed.start[RBX]
        TEST    RAX,RAX
        JGE     @@05    //Aligned alloced chunks have neg size
        ADD     RBX,RAX
@@05:   PUSH    RBX
        MOV     RAX,U64 [SYS_EXTERN_TABLE]
        MOV     RAX,U64 EXT_HEAPLOG_FREE*8[RAX]
        TEST    RAX,RAX
        JZ      @@10
        CALL    RAX
        JMP     @@15
@@10:   ADD     RSP,8

@@15:   MOV     RSI,U64 SF_ARG1[RBP]
        TEST    RSI,RSI

#if _CFG_HEAP_DBG
        JZ      I32 FREE_DONE
#else
        JZ      FREE_DONE
#endif

        MOV     RAX,U64 CMemUsed.size-CMemUsed.start[RSI]
        TEST    RAX,RAX
        JGE     @@20    //Aligned alloced chunks have neg size.
                        //The neg size is offset to start of CMemUsed struct.
        ADD     RSI,RAX

@@20:   PUSHFD
        SUB     RSI,CMemUsed.start
        MOV     RDX,U64 CMemUsed.hc[RSI]
        CMP     U32 CHeapCtrl.hc_signature[RDX],HEAP_CTRL_SIGNATURE_VAL
        JE      @@25
        ADD     RSI,CMemUsed.start
        PUSH    RSI
        CALL    &SysBadFree
        JMP     I32 _SYS_HLT

@@25:   MOV     RAX,U64 CMemUsed.size[RSI]
        SUB     U64 CHeapCtrl.used_u8s[RDX],RAX
        CLI
@@30:   LOCK
        BTS     U32 CHeapCtrl.locked_flags[RDX],HClf_LOCKED
        PAUSE
        JC      @@30
#if _CFG_HEAP_DBG
//QueRem
        MOV     RDX,U64 CMemUsed.next[RSI]
        MOV     RDI,U64 CMemUsed.last[RSI]
        MOV     U64 CMemUsed.last[RDX],RDI
        MOV     U64 CMemUsed.next[RDI],RDX

//Caller1/Caller2
        MOV     RDX,U64 [MEM_HEAP_LIMIT]
        MOV     RDI,U64 SF_RIP[RBP]
        CMP     RDI,RDX
        JB      @@35
        XOR     RDI,RDI
        MOV     U64 CMemUnused.caller1[RSI],RDI
        JMP     @@45
@@35:   MOV     U64 CMemUnused.caller1[RSI],RDI
        MOV     RDI,U64 SF_RBP[RBP]
        CMP     RDI,RDX
        JB      @@40
        XOR     RDI,RDI
        JMP     @@45
@@40:   MOV     RDI,U64 SF_RIP[RDI]
        CMP     RDI,RDX
        JB      @@45
        XOR     RDI,RDI
@@45:   MOV     U64 CMemUnused.caller2[RSI],RDI

        MOV     RDX,U64 CMemUsed.hc[RSI]
#endif
        CMP     RAX,MEM_HEAP_HASH_SIZE
        JAE     @@50

#assert CMemUnused.size==CMemUsed.size
//      MOV     U64 CMemUnused.size[RSI],RAX

        MOV     RBX,U64 CHeapCtrl.heap_hash[RAX+RDX]
        MOV     U64 CMemUnused.next[RSI],RBX
        MOV     U64 CHeapCtrl.heap_hash[RAX+RDX],RSI
        JMP     @@55

@@50:   SUB     RSI,sizeof(CMemBlk)
        PUSH    RDX
        PUSH    RDX
        PUSH    RSI
        CALL    &MemPagTaskFree
        POP     RDX

@@55:   LOCK
        BTR     U32 CHeapCtrl.locked_flags[RDX],HClf_LOCKED
        POPFD
FREE_DONE:
        POP     RDI
        POP     RSI
        POP     RBP
        RET1    8
//************************************
_MSIZE::
        PUSH    RBP
        MOV     RBP,RSP
        MOV     RBX,U64 SF_ARG1[RBP]
        XOR     RAX,RAX
        TEST    RBX,RBX
        JZ      @@10
        MOV     RAX,U64 CMemUsed.size-CMemUsed.start[RBX]
        TEST    RAX,RAX
        JGE     @@05    //Aligned alloced chunks have neg size
        ADD     RBX,RAX
        MOV     RAX,U64 CMemUsed.size-CMemUsed.start[RBX]
@@05:   SUB     RAX,CMemUsed.start
@@10:   POP     RBP
        RET1    8
//************************************
_MSIZE2::
        PUSH    RBP
        MOV     RBP,RSP
        MOV     RBX,U64 SF_ARG1[RBP]
        XOR     RAX,RAX
        TEST    RBX,RBX
        JZ      @@10
        MOV     RAX,U64 CMemUsed.size-CMemUsed.start[RBX]
        TEST    RAX,RAX
        JGE     @@05    //Aligned alloced chunks have neg size
        ADD     RBX,RAX
@@05:   MOV     RAX,U64 CMemUsed.size-CMemUsed.start[RBX]
@@10:   POP     RBP
        RET1    8
//************************************
_MHEAP_CTRL::
        PUSH    RBP
        MOV     RBP,RSP
        MOV     RBX,U64 SF_ARG1[RBP]
        XOR     RAX,RAX
        TEST    RBX,RBX
        JZ      @@10
        MOV     RAX,U64 CMemUsed.size-CMemUsed.start[RBX]
        TEST    RAX,RAX
        JGE     @@05    //Aligned alloced chunks have neg size
        ADD     RBX,RAX
@@05:   MOV     RAX,U64 CMemUsed.hc-CMemUsed.start[RBX]
@@10:   POP     RBP
        RET1    8
}

_extern _FREE U0 Free(U8 *addr); //Free MAlloc()ed memory chunk.
_extern _MSIZE I64 MSize(U8 *src); //Size of heap object.
_extern _MSIZE2 I64 MSize2(U8 *src); //Internal size of heap object.
_extern _MHEAP_CTRL CHeapCtrl *MHeapCtrl(U8 *src); //CHeapCtrl of object.
_extern _MALLOC U8 *MAlloc(I64 size,CTask *mem_task=NULL); //Alloc memory chunk.
//Accepts a CTask or CHeapCtrl. NULL allocs off current task's heap.
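/*Usage sketch (illustrative only, not part of the kernel): allocate from the
current task's heap, query the rounded-up size, and release the chunk.

  U8 *buf=MAlloc(256);  //mem_task=NULL allocs off current task's heap
  I64 n=MSize(buf);     //usable bytes; at least 256 after round-up
  Free(buf);            //Free(NULL) is a harmless no-op
*/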

U8 *AMAlloc(I64 size)
{//Alloc memory in Adam's heap.
  return MAlloc(size,adam_task);
}

U8 *CAlloc(I64 size,CTask *mem_task=NULL)
{//Accepts a CTask or CHeapCtrl.  NULL allocs off current task's heap.
  U8 *res=MAlloc(size,mem_task);
  MemSet(res,0,size);
  return res;
}

U8 *ACAlloc(I64 size)
{//Alloc zeroed memory in Adam's heap.
  return CAlloc(size,adam_task);
}

U8 *MAllocIdent(U8 *src,CTask *mem_task=NULL)
{//Accepts a CTask or CHeapCtrl.  NULL allocs off current task's heap.
  U8 *res;
  I64 size;
  if (!src) return NULL;
  size=MSize(src);
  res=MAlloc(size,mem_task);
  MemCpy(res,src,size);
  return res;
}

U8 *AMAllocIdent(U8 *src)
{//Alloc ident copy of heap node in Adam's heap.
  return MAllocIdent(src,adam_task);
}

U8 *MAllocAligned(I64 size,I64 alignment,
        CTask *mem_task=NULL,I64 misalignment=0)
{//Only powers of two alignment. This is awful: the request is padded by alignment-1+8+misalignment bytes.
  I64 mask=alignment-1;
  U8 *ptr=MAlloc(size+mask+sizeof(I64)+misalignment,mem_task),
        *res=(ptr+sizeof(I64)+mask)&~mask+misalignment;
  res(I64 *)[-1]=ptr-res;
#assert offset(CMemUsed.size)==offset(CMemUsed.start)-sizeof(I64)
  return res;
}
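/*MAllocAligned() over-allocates, then stores ptr-res (a negative I64) in the
8 bytes just below the returned address.  Because of the #assert above, that
slot overlays CMemUsed.size of an ordinary chunk, so Free(), MSize() and
MHeapCtrl() treat a negative size as the offset back to the real allocation
("Aligned alloced chunks have neg size" in the asm).  Sketch (illustrative):

  U8 *p=MAllocAligned(512,4096);  //4096-byte aligned block
  Free(p);                        //follows the neg offset back to ptr
*/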

U8 *CAllocAligned(I64 size,I64 alignment,
        CTask *mem_task=NULL,I64 misalignment=0)
{//Only powers of two alignment. This is awful: the request is padded by alignment-1+8+misalignment bytes.
  I64 mask=alignment-1;
  U8 *ptr=MAlloc(size+mask+sizeof(I64)+misalignment,mem_task),
        *res=(ptr+sizeof(I64)+mask)&~mask+misalignment;
  res(I64 *)[-1]=ptr-res;
#assert offset(CMemUsed.size)==offset(CMemUsed.start)-sizeof(I64)
  MemSet(res,0,size);
  return res;
}

U8 *StrNew(U8 *buf,CTask *mem_task=NULL)
{//Accepts a CTask or CHeapCtrl.  NULL allocs off current task's heap.
  U8 *res;
  I64 size;
  if (buf) {
    size=StrLen(buf)+1;
    res=MAlloc(size,mem_task);
    MemCpy(res,buf,size);
  } else {
    res=MAlloc(1,mem_task);
    *res=0;
  }
  return res;
}

U8 *AStrNew(U8 *buf)
{//Alloc copy of string in Adam's heap.
  return StrNew(buf,adam_task);
}
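
/*Copy helpers, sketched (illustrative only):

  U8 *s=StrNew("Hello");  //heap copy of the string, terminator included
  U8 *t=MAllocIdent(s);   //copy of the whole heap node, MSize(s) bytes
  Free(t);
  Free(s);
*/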