Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- //#pragma GCC optimize ("O3")
- //#pragma GCC optimize ("unroll-loops")
- #pragma GCC target("sse,sse2,sse3,ssse3,sse4.1,sse4.2,sse4a,avx,avx2")
- #include <stdio.h>
- #include <cstdlib>
- #include <iostream>
- #include <vector>
- #include <random>
- #include <chrono>
- #include <immintrin.h>
- const int NMAX = 1024;
- alignas(32) int A[NMAX * NMAX], B[NMAX * NMAX], R[NMAX * NMAX], C[NMAX * NMAX];
- char getChar() {
- static char buffer[1 << 20];
- static int size = 0;
- static int pos = 0;
- if (pos == size) {
- size = (int)fread(buffer, 1, 1 << 20, stdin), pos = 0;
- }
- return pos == size ? -1 : buffer[pos++];
- }
- int read() {
- char c = '?';
- while (!(c == '-' || ('0' <= c && c <= '9'))) { c = getChar(); }
- bool neg = false;
- if (c == '-') { neg = true; c = getChar(); }
- int ret = 0;
- while ('0' <= c && c <= '9') { (ret *= 10) += (c - '0'); c = getChar(); }
- return neg ? -ret : ret;
- }
- const int mod = (int)1e9 + 7;
- int add(int a, int b) {
- return (a += b) >= mod ? a - mod : a;
- }
- int mul(int a, int b) {
- return int(1LL * a * b % mod);
- }
- void rand_mat(int *m, int n) {
- uint64_t seed = std::chrono::high_resolution_clock::now().time_since_epoch().count();
- seed ^= (uint64_t)(new uint64_t);
- std::mt19937 gen(seed);
- std::uniform_int_distribution<int> dist(0, mod - 1);
- for (int i = 0; i < n; ++i) {
- for (int j = 0; j < n; ++j) {
- m[i*NMAX + j] = dist(gen);
- }
- }
- }
- void input(int& n) {
- n = read();
- for (int i = 0; i < n; ++i) {
- for (int j = 0; j < n; ++j) {
- A[i*NMAX + j] = read();
- }
- }
- for (int i = 0; i < n; ++i) {
- for (int j = 0; j < n; ++j) {
- B[j*NMAX + i] = read();
- }
- }
- for (int i = 0; i < n; ++i) {
- for (int j = 0; j < n; ++j) {
- C[i*NMAX + j] = read();
- }
- }
- }
- __m256i _mm256_load_pi32x4(const int *x) {
- return _mm256_cvtepi32_epi64(_mm_load_si128(reinterpret_cast<const __m128i*>(x)));
- }
- void _mm256_store_pi64(long long * x, __m256i r) {
- return _mm256_store_si256(reinterpret_cast<__m256i*>(x), r);
- }
- int dot(int n, const int *a, const int *b) {
- const int gsize = 16;
- uint64_t high = 0, low = 0;
- for (int g = 0; g + gsize <= n; g += gsize) {
- alignas(64) long long temp[4];
- __m256i r1, r2, r3, r4;
- r1 = _mm256_mul_epi32(
- _mm256_load_pi32x4((int*)(a + g + 0)),
- _mm256_load_pi32x4((int*)(b + g + 0)));
- r2 = _mm256_mul_epi32(
- _mm256_load_pi32x4((int*)(a + g + 4)),
- _mm256_load_pi32x4((int*)(b + g + 4)));
- r3 = _mm256_mul_epi32(
- _mm256_load_pi32x4((int*)(a + g + 8)),
- _mm256_load_pi32x4((int*)(b + g + 8)));
- r4 = _mm256_mul_epi32(
- _mm256_load_pi32x4((int*)(a + g + 12)),
- _mm256_load_pi32x4((int*)(b + g + 12)));
- _mm256_store_pi64(temp, _mm256_add_epi64(
- _mm256_add_epi64(r1, r2), _mm256_add_epi64(r3, r4)
- ));
- const auto old = low;
- low += temp[0] + temp[1];
- low += temp[2] + temp[3];
- high += (old > low);
- }
- uint64_t res = (low % mod + (((high << 32) % mod) << 32)) % mod;
- for (int i = n / gsize * gsize; i < n; ++i) {
- res += 1LL * a[i] * b[i];
- }
- return res % mod;
- }
- void mult(int n) {
- for (int i = 0; i < n; ++i) {
- for (int j = 0; j < n; ++j) {
- R[i*NMAX + j] = dot(n, &A[i*NMAX], &B[j*NMAX]);
- }
- }
- }
- bool check(const int n) {
- for (int i = 0; i < n; ++i) {
- for (int j = 0; j < n; ++j) {
- if (C[i * NMAX + j] != R[i * NMAX + j]) {
- return false;
- }
- }
- }
- return true;
- }
- void gen(int n) {
- rand_mat(A, n), rand_mat(B, n);
- }
- void test(int n) {
- gen(n);
- mult(n);
- for (int i = 0; i < n; ++i) {
- for (int j = 0; j < n; ++j) {
- int res = 0;
- for (int k = 0; k < n; ++k) {
- res = add(res, mul(A[i*NMAX + k], B[j*NMAX + k]));
- }
- C[i*NMAX + j] = res;
- }
- }
- std::cout << (check(n) ? "YES" : "NO") << std::endl;
- std::exit(0);
- }
- int main() {
- //test(100);
- int n;
- input(n);
- mult(n);
- std::cout << (check(n) ? "YES" : "NO") << std::endl;
- return 0;
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement