solution.cpp

#pragma GCC optimize("Ofast")
#pragma GCC target("avx")

#include <iostream>
#include <iomanip>
#include <algorithm>
#include <assert.h>
#include <immintrin.h>

namespace Solution {

    const int NMAX = 100000;

    const int blockSize = 512;

    const int MaxNBlocks = (NMAX + blockSize - 1) / blockSize;

    float* cntTillBorder[MaxNBlocks];

    float* zeroBuffer;

    float* arr;

    int n, nBlocks;

    void alloc(int n_) {
        n = n_;
        nBlocks = (n + blockSize - 1) / blockSize;
        arr = (float*)_mm_malloc(n * sizeof(float), 32);
        zeroBuffer = (float*)_mm_malloc(n * sizeof(float), 32);
        std::fill(zeroBuffer, zeroBuffer+n, 0);
        for (int i = 0; i < nBlocks; ++i) {
            cntTillBorder[i] = (float*)_mm_malloc(n * sizeof(float), 32);
            std::fill(cntTillBorder[i], cntTillBorder[i]+n, 0);
        }
    }

    void free() {
        _mm_free(arr);
        for (int i = 0; i < nBlocks; ++i) {
            _mm_free(cntTillBorder[i]);
        }
    }

    void precalc() {
        for (int i = 0; i < n; ++i) {
            int value = (int)arr[i];
            for (int b = i / blockSize; b < nBlocks; ++b) {
                cntTillBorder[b][value]++;
            }
        }
    }

    void naiveUpdateItems(int begin, int after, float x, float y) {
        int cnt = 0;
        for (int i = begin; i < after; ++i) {
            cnt += (arr[i] == x);
            arr[i] = (arr[i] == x) ? y : arr[i];
        }
        for (int b = begin / blockSize; b < nBlocks; ++b) {
            cntTillBorder[b][int(x)] -= (float)cnt;
            cntTillBorder[b][int(y)] += (float)cnt;
        }
    }

    void modify(int lt, int rt, float x, float y) {
        int bl = lt / blockSize;
        int br = rt / blockSize;
        if (bl == br) {
            naiveUpdateItems(lt, rt+1, x, y);
            return;
        }
        naiveUpdateItems(lt, (bl+1) * blockSize, x, y);
        naiveUpdateItems(br * blockSize, rt+1, x, y);
        __m256 vx = _mm256_set1_ps(x);
        __m256 vy = _mm256_set1_ps(y);
        int changes = 0;
        for (int b = bl+1; b < br; ++b) {
            cntTillBorder[b][int(x)] -= (float)changes;
            cntTillBorder[b][int(y)] += (float)changes;
            float *blockBegin = arr + b * blockSize;
            for (int i = 0; i < blockSize; i += 32) {
                uint32_t bitmask = 0;
                for (int j = 0; j < 32; j += 8) {
                    __m256 va = _mm256_load_ps(blockBegin + i + j);
                    __m256 rs = _mm256_cmp_ps(vx, va, _CMP_EQ_OQ);
                    bitmask = (bitmask << 8) | _mm256_movemask_ps(rs);
                    _mm256_maskstore_ps(blockBegin + i + j, _mm256_cvtps_epi32(rs), vy);
                }
                changes += __builtin_popcountll(bitmask);
            }
        }
    }

    int nth_element(int lt, int rt, int k) {
        int bl = lt / blockSize;
        int br = rt / blockSize;
        float* arrLT = (bl == 0) ? zeroBuffer : cntTillBorder[bl-1];
        float* arrRT = cntTillBorder[br];
        for (int i = rt+1; i < std::min((br+1) * blockSize, n); ++i) { arrRT[(int)arr[i]]--; }
        for (int i = bl * blockSize; i < lt; ++i) { arrLT[(int)arr[i]]++; }
        int last = -1;
        for (int i = 0; i + 31 < n; i += 32) {
            __m256 sum = _mm256_setzero_ps(), vr, vl;
            for (int j = 0; j < 32; j += 8) {
                vr = _mm256_load_ps(arrRT+i+j);
                vl = _mm256_load_ps(arrLT+i+j);
                sum = _mm256_add_ps(sum, _mm256_sub_ps(vr,vl));
            }
            alignas(32) static float tmp[8];
            _mm256_store_ps(tmp, sum);
            int cnt = 0;
            for (int j = 0; j < 8; ++j) { cnt += (int)tmp[j]; }
            if (cnt >= k) {
                last = i-1;
                break;
            }
            k -= cnt;
            last = i + 31;
        }
        int cnt = 0;
        while (cnt < k) { last++; cnt += int(arrRT[last] - arrLT[last]); }
        for (int i = bl * blockSize; i < lt; ++i) { arrLT[(int)arr[i]]--; }
        for (int i = rt+1; i < std::min((br+1) * blockSize, n); ++i) { arrRT[(int)arr[i]]++; }
        return last;
    }
}

char getChar() {
    static const int SIZE = 1 << 16;
    static char buffer[SIZE];
    static int pos = 0;
    static int size = 0;
    if (pos == size) {
        size = (int)fread(buffer, 1, SIZE, stdin),
        pos = 0;
    }
    if (pos == size) {
        return EOF;
    }
    return buffer[pos++];
}

template<typename T>
T getInt() {
    char c = '?';
    while (!(c == '-' || ('0' <= c && c <= '9'))) { c = getChar(); }
    bool pos = true;
    if (c == '-') { pos = false; c = getChar(); }
    T ret = 0;
    while ('0' <= c && c <= '9') { (ret *= 10) += (c - '0'); c = getChar(); }
    return pos ? ret : -ret;
}

void putChar(char c) {
    static const int SIZE = 1 << 16;
    static char buffer[SIZE];
    static int size = 0;
    if (size == SIZE || c == EOF) {
        fwrite(buffer, 1, size, stdout),
        size = 0;
    }
    if (c != EOF) { buffer[size++] = c; }
}

template<typename T>
void putInt(T value) {
    bool pos = true;
    if (value < 0) { pos = false; value = -value; }
    static char buf[24];
    int size = 0;
    do { buf[size++] = char(value % 10 + '0'); value /= 10; } while (value > 0);
    if (!pos) { buf[size++] = '-'; }
    while (size--) { putChar(buf[size]); }
}

int main() {
    int n = getInt<int>(), q = getInt<int>();
    Solution::alloc(n);
    for (int i = 0; i < n; ++i) {
        Solution::arr[i] = (float)(getInt<int>()-1);
    }
    Solution::precalc();

    while (q--) {
        int t = getInt<int>();
        if (t == 1) {
            int lt = getInt<int>()-1, rt = getInt<int>()-1, x = getInt<int>()-1, y = getInt<int>()-1;
            Solution::modify(lt, rt, (float)x, (float)y);
        } else {
            int lt = getInt<int>()-1, rt = getInt<int>()-1, k = getInt<int>();
            putInt(Solution::nth_element(lt, rt, k)+1);
            putChar('\n');
            assert(t == 2);
        }
    }
    putChar(EOF);
    Solution::free();
    return 0;
}