multi-core submit


        static bool multicoreSubmit = false;
        static uint batchSize = 500;
        if( multicoreSubmit && opaqueCount > batchSize )
        {
            batchSize = min(batchSize, opaqueCount / 2);//todo - some formula that tries to balance the size, instead of letting the last job be smaller

            const DrawItemKey* draw = opaque;
            uint numGroups = (opaqueCount+batchSize-1)/batchSize;
            struct Arg { Atomic* done; const DrawItemKey* begin, *end; GpuContext* ctx; const RenderPass* pass; ClearCommand* clear; };
            GpuContext** contexts = eiAllocArray(a, GpuContext*, numGroups);
            Arg* args = eiAllocArray(a, Arg, numGroups);
            Atomic done;
            {
                eiProfile("AcquireDeferredContexts");//todo - turns out that preparing a deferred context is expensive - worthwhile adding this to the job chain instead of having the main thread do it here
                for( uint group=0; group!=numGroups; ++group )
                {
                    contexts[group] = &m_gpuDevice->AcquireDeferredContext();
                }
            }
            for( uint group=0; group!=numGroups; ++group )
            {
                Arg arg = { &done, draw, draw+batchSize, contexts[group], renderPass, group==0?clearOpaque:0 };
                if( arg.end > opaque + opaqueCount )
                {
                    eiASSERT( group == numGroups-1 );
                    arg.end = opaque + opaqueCount;
                }
                draw = arg.end;
                args[group] = arg;
                JobPool::Job j = { []( void* data, ThreadId& )
                {
                    Arg& arg = *(Arg*)data;
                    arg.ctx->TransferThreadOwnership();
                    arg.ctx->Submit( *arg.pass, DrawList(arg.begin, arg.end), arg.clear );
                    arg.ctx->Finish();
                    ++(*arg.done);
                }, &args[group] };
                GetThreadId()->Jobs().PushJob(j);//todo - push all jobs at once instead of one at a time
            }
            YieldThreadUntil(WaitForValue(done, numGroups));
            m_gpuDevice->Submit(numGroups, contexts); // calls ExecuteCommandLists once with all the results
        }
        else
        {
            ctx.Submit( *renderPass, DrawList(opaqueCount, opaque), clearOpaque );
        }