asm {
        ALIGN   16,OC_NOP
USE16
//See TempleOS MultiCore.

//Start-up path for the non-boot cores.  It begins in 16-bit code
//(COREAP_16BIT_INIT..COREAP_16BIT_INIT_END), continues in 32-bit code at
//AP_32BIT_INIT, and finishes in 64-bit code that jumps to
//_TASK_CONTEXT_RESTORE once a seth task has been assigned to the core.

//This code gets copied to MP_VECT_ADDR.
//See MemCpy(MP_VECT_ADDR.
COREAP_16BIT_INIT::
        JMP     @@05    //Jump over the GDT pointer data that follows.

        ALIGN   4,OC_NOP
//Place-holder GDT pointer (16-bit limit, 64-bit base).  Core0StartMP()
//copies SYS_GDT_PTR over the ap_gdt_ptr field of the copy at MP_VECT_ADDR.
AP_GDT_PTR:     DU16    sizeof(CGDT)-1;
                DU64    0;

@@05:   CLI
        WBINVD
//Point DS at the copied block so the offset used by LGDT is
//relative to MP_VECT_ADDR.
        MOV     AX,MP_VECT_ADDR/16
        MOV     DS,AX
        LGDT    U32 [CAP16BitInit.ap_gdt_ptr]  //See mp->ap_gdt_ptr

//Load CR0 with SYS_START_CR0.
//NOTE(review): presumably this sets the protected-mode enable bit; the
//value of SYS_START_CR0 is defined elsewhere -- confirm.
        MOV     EAX,SYS_START_CR0
        MOV_CR0_EAX
//Hand-assembled far jump: 0x66 operand-size prefix, 0xEA opcode,
//32-bit offset, 16-bit selector.
        DU8     0x66,0xEA;               //JMP CGDT.cs32:AP_32BIT_INIT
        DU32    AP_32BIT_INIT;
        DU16    CGDT.cs32;
COREAP_16BIT_INIT_END::

USE32
AP_32BIT_INIT:
//Load every data segment register, and SS, with the CGDT.ds selector.
        MOV     AX,CGDT.ds
        MOV     DS,AX
        MOV     ES,AX
        MOV     FS,AX
        MOV     GS,AX
        MOV     SS,AX

//Spin until bit 0 of SYS_MP_CNT_LOCK is acquired.  BTS leaves the old bit
//value in the carry flag, so JC loops while some other core holds the lock.
//Core0StartMP() later sets this bit and leaves it set so that no more
//cores get through.
@@05:   LOCK
        BTS     U32 [SYS_MP_CNT_LOCK],0
        JC      @@05

//ESI=value of the counter before the increment; used below as this
//core's index.
        MOV     ESI,U32 [SYS_MP_CNT_INITIAL]
        LOCK
        INC     U32 [SYS_MP_CNT_INITIAL]
//Release the lock.
        LOCK
        BTR     U32 [SYS_MP_CNT_LOCK],0

//Cores whose index is not below MP_PROCESSORS_NUM go to _SYS_HLT.
        CMP     ESI,MP_PROCESSORS_NUM
        JAE     I32 _SYS_HLT

//ESI=[SYS_CPU_STRUCTS]+index*sizeof(CCPU), the address of this
//core's CCPU.
        IMUL2   ESI,sizeof(CCPU)
        ADD     ESI,U32 [SYS_CPU_STRUCTS]

//Stack pointer=end of the start_stk member of this core's CCPU.
        LEA     ESP,U32 CCPU.start_stk+sizeof(CCPU.start_stk)[ESI]
//Load the flags register with RFLAGG_START.
        PUSH    U32 RFLAGG_START
        POPFD
//The zero pushed here sits directly above the 32-bit return address
//pushed by CALL, so together they occupy 64 bits on the stack.
        PUSH    U32 0   //Return from next call will be 64-bit
        CALL    SYS_ENTER_LONG_MODE
USE64
        FNINIT
//RAX=address of this core's CCPU for SET_GS_BASE.
        MOV     RAX,RSI
        CALL    SET_GS_BASE
//Busy-wait until CCPU.seth_task is nonzero.  CPUStructInit() stores it.
@@10:   MOV     RAX,U64 CCPU.seth_task[RSI]
        TEST    RAX,RAX
        JZ      @@10
//Record the CCPU address in the task's gs member.  RAX still holds the
//task address when SET_FS_BASE is called.
        MOV     U64 CTask.gs[RAX],RSI
        CALL    SET_FS_BASE

        JMP     I32 _TASK_CONTEXT_RESTORE
}

U0 TSSBusy(I64 tr,Bool val=OFF)
{//See ::/Demo/Lectures/Ring3.HC.
//Sets bit 9 of the U32 located 4 bytes into the sys_gdt entry at
//byte offset tr to val.
  U8 *desc_hi=(&sys_gdt)(U8 *)+tr+4;
  LBEqu(desc_hi,9,val);
}

CTSS *TSSNew(I64 cpu_num)
{//Allocate a zeroed CTSS for cpu_num, give it three stacks and
//write its two descriptors (tr and tr_ring3) into sys_gdt.
//Returns the new CTSS.
  I64 i,sel;
  U32 type_dword;
  U32 *lo,*hi;
  CTSS *tss=CAlloc(sizeof(CTSS));

  //Each core uses 16 bytes per descriptor in both GDT arrays.
  tss->tr      =offset(CGDT.tr)+cpu_num*16;
  tss->tr_ring3=offset(CGDT.tr_ring3)+cpu_num*16;

  //The I/O map follows the fixed part of the TSS; fill all 0x10000 bits.
  tss->io_map_offset=offset(CTSS.io_map);
  MemSet(tss->io_map,0xFF,0x10000/8);

  //Each rspN starts at the end of its allocation.
  tss->st0=MAlloc(MEM_INTERRUPT_STK);
  tss->rsp0=tss->st0(U8 *)+MSize(tss->st0);
  tss->st1=MAlloc(MEM_INTERRUPT_STK);
  tss->rsp1=tss->st1(U8 *)+MSize(tss->st1);
  tss->st2=MAlloc(MEM_INTERRUPT_STK);
  tss->rsp2=tss->st2(U8 *)+MSize(tss->st2);

  //Pass 0 writes the tr descriptor, pass 1 the tr_ring3 descriptor.
  //They differ only in the second U32 (0x89 vs 0xE9 in bits 8-15).
  for (i=0;i<2;i++) {
    if (i) {
      sel=tss->tr_ring3;
      type_dword=0x008FE900;
    } else {
      sel=tss->tr;
      type_dword=0x008F8900;
    }
    lo=(&sys_gdt)(U8 *)+sel;
    hi=lo(U8 *)+4;
    *lo=0x0000FFFF;
    *hi=type_dword;
    lo=lo(U8 *)+2;              //Base bits 0-23 live at byte offset 2.
    *lo|=tss & 0x00FFFFFF;
    hi[0]|=tss & 0xFF000000;    //Base bits 24-31.
    hi[1]=tss>>32;              //Base bits 32-63.
    hi[2]=0;
  }

  return tss;
}

CCPU *CPUStructInit(I64 num,CCPU *c,CTask *seth_task)
{//Seth is null when called by adam on CSysFixedArea.boot_cpu0
//Zeroes *c, fills in its fields for core num and returns c.
  CTask *idle;
  MemSet(c,0,sizeof(CCPU));
  c->num=num;
  c->addr=c;
  QueInit(&c->next_dying);
  c->idle_factor=0.01;
  //The idle task and TSS are only created once this run level is reached.
  if (Bt(&sys_run_level,RLf_16MEG_ADAM_HEAP_CTRL)) {
    c->idle_task=idle=Spawn(0,NULL,"Idle Task",,Fs,,0);
    LBts(&idle->task_flags,TASKf_IDLE);
    c->tss=TSSNew(num);
  }
  //Stored last: the starting core waits for seth_task to be filled-in.
  c->seth_task=seth_task;
  return c;
}

U0 MPInt(U8 num,I64 cpu_num=1)
{//Generate interrupt for specified core.
//num:     interrupt vector to deliver.
//cpu_num: target core, must satisfy 0<=cpu_num<mp_cnt.
//An out-of-range cpu_num is silently ignored before RLf_MP is set in
//sys_run_level and throws 'MultCore' afterwards.
  if (!(0<=cpu_num<mp_cnt)) {//Also rejects negatives: they would index mp_apic_ids[] out of bounds.
    if (!Bt(&sys_run_level,RLf_MP))
      return;
    else
      throw('MultCore');
  }
  PUSHFD
  CLI //Multitasking safe because each core has a local apic and IRQ's are off
  //Wait while bit 12 (0x1000) of the ICR low word is set, i.e. while a
  //previously written command is still pending.
  while (*(dev.uncached_alias+LAPIC_ICR_LOW)(U32 *)&0x1000)
    PAUSE
  //Destination APIC id goes in the top byte of the ICR high word.
  *(dev.uncached_alias+LAPIC_ICR_HIGH)(U32 *)=dev.mp_apic_ids[cpu_num]<<24;
  //Writing the low word sends the command: bit 14 (0x4000) plus the vector.
  *(dev.uncached_alias+LAPIC_ICR_LOW)(U32 *)=0x4000+num;
  POPFD
}

U0 MPIntAll(U8 num)
{//Generate interrupt for all but own core.
//num is the interrupt vector added to the 0xC4800 command word.
  U32 *icr_low=(dev.uncached_alias+LAPIC_ICR_LOW)(U32 *);
  PUSHFD
  CLI //Multitasking safe because each core has a local apic and IRQ's are off
  //Spin while bit 12 (0x1000) of the ICR low word is still set.
  while (*icr_low&0x1000)
    PAUSE
  *icr_low=0xC4800+num;
  POPFD
}

U0 MPNMInt()
{//Generate nonmaskable interrupt.
//Writes the command word 0xC4400 to the local APIC ICR low register
//without first polling the pending bit.
  U32 *icr_low=(dev.uncached_alias+LAPIC_ICR_LOW)(U32 *);
  *icr_low=0xC4400;
}

U0 MPHalt()
{//Halt all other cores.
  //Drop the core count to one before the NMI goes out.
  mp_cnt=1;
  MPNMInt();
  //Fixed delay after sending the NMI.
  Busy(10000);
}

U0 MPAPICInit()
{//Called by adam during start-up
//and other cores during initialization
  //after Core0StartMP().
//Enables the calling core's local APIC, records its APIC id in
//dev.mp_apic_ids[], loads the task register from the core's TSS and,
//on cores other than 0, calls IntInit1 and sets RFLAGG_NORMAL.

  //Set the enable flag in the spurious-interrupt vector register.
  *(dev.uncached_alias+LAPIC_SVR)(U32 *)|=LAPICF_APIC_ENABLED;
  //The APIC id is the top byte of the id register; remember it per core.
  dev.mp_apic_ids[Gs->num]=*(dev.uncached_alias+LAPIC_APIC_ID)(U32 *)>>24;
  //Write the same id into the top byte of LDR, then set DFR.
  *(dev.uncached_alias+LAPIC_LDR)(U32 *)=dev.mp_apic_ids[Gs->num]<<24;
  *(dev.uncached_alias+LAPIC_DFR)(U32 *)=0xF0000000;

  //  MemSet(dev.uncached_alias+LAPIC_IRR,0,0x20);
  //  MemSet(dev.uncached_alias+LAPIC_ISR,0,0x20);
  //  MemSet(dev.uncached_alias+LAPIC_TMR,0,0x20);

  //SetRAX leaves the TSS selector in RAX for the LTR that follows;
  //nothing may be inserted between these two statements.
  SetRAX(Gs->tss->tr);
  LTR   AX
  if (Gs->num) {//Only on cores other than core 0
    IntInit1;
    SetRFlags(RFLAGG_NORMAL);
  }
}

#assert !offset(CJobCtrl.next_waiting)

U0 CoreAPSethTask()
{//Main loop of a seth task: run queued jobs from Fs->srv_ctrl,
//then mark the task as awaiting a message and idle, and yield.
//Never returns.
  CJobCtrl *ctrl=&Fs->srv_ctrl;
  while (TRUE) {
    STI
    do {
      TaskKillDying;
      //Spin until the JOBCf_LOCKED bit is acquired (LBts returns old bit).
      do PAUSE
      while (LBts(&ctrl->flags,JOBCf_LOCKED));
      //Keep going while the waiting queue is non-empty and JobRunOne
      //returns nonzero.  When the queue is empty the && short-circuits
      //and the lock acquired above is still held.
      //NOTE(review): JobRunOne presumably releases JOBCf_LOCKED itself
      //-- it is defined elsewhere, confirm.
    } while (ctrl->next_waiting!=ctrl && JobRunOne(GetRFlags,ctrl));
    CLI
    //Set AWAITING_MSG before releasing the lock.  JobQue() clears this
    //flag under the same lock to decide whether to send I_WAKE.
    LBts(&Fs->task_flags,TASKf_AWAITING_MSG);
    LBtr(&ctrl->flags,JOBCf_LOCKED);
    //Flag the task idle only for the duration of the Yield.
    LBts(&Fs->task_flags,TASKf_IDLE);
    Yield;
    LBtr(&Fs->task_flags,TASKf_IDLE);
  }
}

CJob *JobQue(I64 (*fp_addr)(U8 *data),U8 *data=NULL,
       I64 target_cpu=1,I64 flags=1<<JOBf_FREE_ON_COMPLETE,
       I64 job_code=JOBT_CALL,U8 *aux_str=NULL,I64 aux1=0,I64 aux2=0)
{//Queue multicore jobs, handled by Seth tasks.
//Set flags to zero if you wish to get the res.
  //See ::/Demo/MultiCore/Lock.HC
//Throws 'MultCore' when target_cpu is outside 0..mp_cnt-1.
//Returns the newly allocated CJob.
  CTask *seth;
  CJobCtrl *ctrl;
  CJob *job;
  if (target_cpu<0 || target_cpu>=mp_cnt)
    throw('MultCore');

  //Build the job record.
  job=ACAlloc(sizeof(CJob));
  job->job_code=job_code;
  job->flags=flags;
  job->addr=fp_addr;
  job->fun_arg=data;
  job->aux1=aux1;
  job->aux2=aux2;
  if (aux_str)
    job->aux_str=AStrNew(aux_str);

  //The job goes on the target core's seth task queue.
  seth=cpu_structs[target_cpu].seth_task;
  ctrl=&seth->srv_ctrl;
  job->ctrl=ctrl;

  PUSHFD
  CLI
  while (LBts(&ctrl->flags,JOBCf_LOCKED))
    Yield;
  //Only when the queue was empty and the seth task was flagged as
  //awaiting a message is a wake interrupt sent.
  if (ctrl->next_waiting==ctrl) {
    if (LBtr(&seth->task_flags,TASKf_AWAITING_MSG))
      MPInt(I_WAKE,target_cpu);
  }
  QueIns(job,ctrl->last_waiting);
  LBtr(&ctrl->flags,JOBCf_LOCKED);
  POPFD
  return job;
}

CTask *SpawnQue(U0 (*fp_addr)(U8 *data),U8 *data=NULL,U8 *task_name=NULL,
        I64 target_cpu, CTask *parent=NULL, //NULL means adam
        I64 stk_size=0,I64 flags=1<<JOBf_ADD_TO_QUE)
{//Queue a JOBT_SPAWN_TASK job on target_cpu, wait for it to finish,
//remove and delete the job, and return the task it spawned.
  CJob *job;
  CJobCtrl *ctrl;
  CTask *spawned;

  //task_name, parent and stk_size travel in aux_str, aux1 and aux2.
  job=JobQue(fp_addr,data,target_cpu,
        flags,JOBT_SPAWN_TASK,task_name,parent,stk_size);

  //Yield, flagged idle, until the job is marked done.
  while (TRUE) {
    if (Bt(&job->flags,JOBf_DONE))
      break;
    LBts(&Fs->task_flags,TASKf_IDLE);
    Yield;
  }
  LBtr(&Fs->task_flags,TASKf_IDLE);

  spawned=job->spawned_task;
  ctrl=job->ctrl;

  //Unlink the job under the queue lock, then free it.
  PUSHFD
  CLI
  while (LBts(&ctrl->flags,JOBCf_LOCKED))
    Yield;
  QueRem(job);
  LBtr(&ctrl->flags,JOBCf_LOCKED);
  POPFD
  JobDel(job);
  return spawned;
}

U0 CoreAPSethInit()
{//Called by multicore's seth task after Core0StartMP()
//as the first thing a CPU does before waiting for jobs.
  //Set up this core's local APIC.
  MPAPICInit();
  //Point the task's saved rip at the job loop, then restore context.
  Fs->rip=&CoreAPSethTask;
  TaskContextRestore();
}

U0 Core0StartMP()
{//Called by adam during start-up.
//Halts any cores that are already running, copies the 16-bit start-up
//code to MP_VECT_ADDR, sends the init and start-up IPIs, then creates a
//seth task and CCPU for every core that responded and waits until each
//seth task reports TASKf_AWAITING_MSG.
  CTask *task;
  U8 buf[STR_LEN];
  CAP16BitInit *mp=MP_VECT_ADDR;
  CCPU *c;
  I64 i,my_mp_cnt;
  CRAXRBCRCXRDX ee;

  //Do nothing unless CPUID leaf 1 reports bit 9 of EDX.
  CPUId(0x1,&ee);
  if (!Bt(&ee.rdx,9))
    return;

  PUSHFD
  CLI
  //Restart case: stop the other cores and discard their job queues.
  if (mp_cnt>1) {
    my_mp_cnt=mp_cnt;
    MPHalt; //sets mp_cnt to 1
    for (i=1;i<my_mp_cnt;i++) {
      c=&cpu_structs[i];
      JobQueDel(&c->seth_task->srv_ctrl.next_waiting);
      JobQueDel(&c->seth_task->srv_ctrl.next_done);
    }
  }
  //Clear every CCPU except core 0's.  The start-up code spins on
  //CCPU.seth_task, so it must be zero until CPUStructInit() below.
  MemSet(&cpu_structs[1],0,sizeof(CCPU)*(MP_PROCESSORS_NUM-1));

  //When you start-up other cores, they jump to an addr
  //specified by a byte vect number, MPN_VECT which corresponds
  //to a location 4096*vect number, MP_VECT_ADDR.
  MemCpy(mp,COREAP_16BIT_INIT,COREAP_16BIT_INIT_END-COREAP_16BIT_INIT);
  //Overwrite the place-holder GDT pointer in the copy with the real one.
  MemCpy(&mp->ap_gdt_ptr,SYS_GDT_PTR,sizeof(CSysLimitBase));
  //Core 0 counts as the first core; the others increment mp_cnt_initial.
  mp_cnt_initial=mp_cnt=1;
  mp_cnt_lock=0;

  //Replace the low byte of LAPIC_LVT_ERR with MPN_VECT.
  //NOTE(review): this relies on HolyC giving & higher precedence than +,
  //unlike C -- confirm before porting.
  *(dev.uncached_alias+LAPIC_LVT_ERR)(U32 *)=
        *(dev.uncached_alias+LAPIC_LVT_ERR)(U32 *)&0xFFFFFF00+MPN_VECT;
  WBINVD //Not sure why this is needed. Might just need delay. MemCpy above?

  *(dev.uncached_alias+LAPIC_ICR_LOW)(U32 *)=0xC4500; //assert init IPI
  Busy(10000);

  //The start-up IPI is sent twice with a short delay in between.
  *(dev.uncached_alias+LAPIC_ICR_LOW)(U32 *)=0xC4600+MPN_VECT; //start-up
  Busy(200);
  *(dev.uncached_alias+LAPIC_ICR_LOW)(U32 *)=0xC4600+MPN_VECT;

  //Give the other cores time to pass through the counting code.
  Busy(100000);
  //Take the counter lock and never release it; any core arriving late
  //spins on it in the start-up code.
  for (i=0;i<10000;i++)
    LBts(&mp_cnt_lock,0); //Don't let more through
  my_mp_cnt=mp_cnt_initial;

  //Cores beyond the table size were sent to _SYS_HLT by the start-up code.
  if (my_mp_cnt>MP_PROCESSORS_NUM)
    my_mp_cnt=MP_PROCESSORS_NUM;

  for (i=1;i<my_mp_cnt;i++) {
    StrPrint(buf,"Seth Task CPU%02X",i);
    task=Spawn(&CoreAPSethInit,NULL,buf,,,MEM_SETH_STK,0);
    task->rflags=RFLAGG_START;
//CTask alloced off this core's seth_task's heap (Which is Adam)
    //Storing seth_task in the CCPU releases the core waiting on it.
    CPUStructInit(i,&cpu_structs[i],task);
    WBINVD //Not sure why this is needed.  Might just need delay.
  }

  //Make sure they're all up-and-running
  //CoreAPSethTask() sets TASKf_AWAITING_MSG once it has no jobs to run.
  for (i=1;i<my_mp_cnt;i++)
    while (!Bt(&cpu_structs[i].seth_task->task_flags,TASKf_AWAITING_MSG))
      PAUSE;

  POPFD
  mp_cnt=my_mp_cnt; //Finalize cnt
}

U0 Core0Init()
{//Called by adam during start-up
//Resets the core counters, allocates the crash record and the CCPU
//array, initializes core 0's CCPU, points GS at it and, when CPUID
//leaf 1 reports bit 9 of EDX, initializes the local APIC.
  CRAXRBCRCXRDX ee;
  CPUId(0x1,&ee);

  mp_cnt_initial=mp_cnt=1;
  mp_cnt_lock=0;

  dbg.mp_crash=ACAlloc(sizeof(CMPCrash));

  //Must be in code heap because init code uses 32 bit addr of cpu_struct
  adam_task->gs=cpu_structs=
        CAlloc(sizeof(CCPU)*MP_PROCESSORS_NUM,Fs->code_heap);
  //CPUStructInit returns the CCPU pointer; the asm below consumes it
  //from RAX, so nothing may be inserted between these two statements.
  CPUStructInit(0,cpu_structs,adam_task);
  asm {//RAX has GS
    IMPORT SET_GS_BASE;
    CALL SET_GS_BASE
  }
  if (Bt(&ee.rdx,9)) {
//Unnecessary?
    //    SetMSR(IA32_LAPIC_BASE,dev.uncached_alias+LAPIC_BASE+0x900);
    MPAPICInit;
  }
}

interrupt U0 IntMPCrash()
{//Entering the debugger from another core causes an interrupt on Core0
//Which calls this routine.
//Prints the cpu number, task and RIP from dbg.mp_crash and panics.
  CMPCrash *crash;
  //Write the local APIC end-of-interrupt register.
  *(dev.uncached_alias+LAPIC_EOI)(U32 *)=0;
  mp_cnt=1;
  Raw(ON);
  text.raw_flags|=RWF_SHOW_DOLLAR;
  crash=dbg.mp_crash;
  "MP Crash CPU%02X Task:%08X\n"
  "RIP:%P\n",crash->cpu_num,crash->task,crash->rip;
  Panic(crash->msg,crash->msg_num);
}