TLE 了,好像是板子比较慢,但是本地开 O2 是可以过的,请问板子有什么可以优化的地方吗。
#include <bits/stdc++.h>
#define fr first
#define se second
#define Un unsigned
#define LL long long
#define pb push_back
#define pii pair<int,int>
#define pLi pair<LL,int>
#define pLL pair<LL,LL>
#define __ __int128
#define LD long double
#define VE vector<LL>
#define DB double
#define Ve vector<int>
using namespace std;
inline LL read()
{
LL x = 0,f = 1;char ch = getchar();
while(!isdigit(ch)) (ch == '-') && (f = -1),ch = getchar();
while(isdigit(ch)) x = x*10+ch-48,ch = getchar();
return x*f;
}
VE f,g;
namespace Poly
{
const int N = 4e5+5,P = 998244353;
inline int mod(int x){return x >= P ? x-P : x < 0 ? x+P : x;}
LL qp(LL x,int y)
{
LL res = 1;
while (y)
{
if(y&1) res = res*x%P;
x = x*x%P,y >>= 1;
}
return res;
}
LL inv2 = qp(2,P-2),inv3 = qp(3,P-2);
int to[N];
void NTT(VE &f,int n,bool fl)
{
for (int i = 0;i < n;i++)
{
to[i] = (to[i>>1]>>1)|(i&1 ? n>>1 : 0);
if (i < to[i]) swap(f[i],f[to[i]]);
}
for (int len = 2;len <= n;len <<= 1)
{
LL wk = qp(fl ? 3 : inv3,(P-1)/len);
for (int l = 0;l < n;l += len)
{
LL w = 1;
for (int i = l;i < l+len/2;i++)
{
LL tmp = w*f[i+len/2]%P;
f[i+len/2] = mod(f[i]-tmp),f[i] = mod(f[i]+tmp),w = w*wk%P;
}
}
}
if(!fl)
{
LL Inv = qp(n,P-2);
for (int i = 0;i < n;i++) f[i] = f[i]*Inv%P;
}
}
VE operator*(VE A,VE B)
{
if (A.empty() || B.empty()) return VE();
int n = A.size()-1,m = B.size()-1,t = 1;
while(t <= n+m) t <<= 1;
A.resize(t),B.resize(t);
NTT(A,t,1),NTT(B,t,1);
for (int i = 0;i < t;i++) A[i] = (A[i]*B[i])%P;
NTT(A,t,0),A.resize(n+m+1);
return A;
}
VE operator+(VE A,VE B)
{
int siz = max(A.size(),B.size());
A.resize(siz),B.resize(siz);
for (int i = 0;i < siz;i++) A[i] = mod(A[i]+B[i]);
return A;
}
};
using namespace Poly;
struct squ
{
VE a[2][2];
squ operator*(const squ &b)const
{
squ c;
c.a[0][0] = a[0][0]*b.a[0][0]+a[0][1]*b.a[1][0];
c.a[0][1] = a[0][0]*b.a[0][1]+a[0][1]*b.a[1][1];
c.a[1][0] = a[1][0]*b.a[0][0]+a[1][1]*b.a[1][0];
c.a[1][1] = a[1][0]*b.a[0][1]+a[1][1]*b.a[1][1];
return c;
}
};
int n,k;
squ qp(squ x,int y)
{
squ res = x;y--;
while (y)
{
if (y&1) res = res*x;
x = x*x,y >>= 1;
x.a[0][0].resize(k+1),x.a[0][1].resize(k+1),x.a[1][0].resize(k+1),x.a[1][1].resize(k+1);
res.a[0][0].resize(k+1),res.a[0][1].resize(k+1),res.a[1][0].resize(k+1),res.a[1][1].resize(k+1);
}
return res;
}
int main()
{
squ A,B;
A.a[0][0].pb(1),A.a[0][1].pb(1),A.a[0][1].pb(1);
B.a[0][1].pb(0),B.a[0][1].pb(1),B.a[1][0].pb(1),B.a[1][1].pb(1),B.a[1][1].pb(1);
n = read(),k = read();
A = A*qp(B,n);
for (int i = 1;i <= k;i++) printf("%lld ",A.a[0][0][i]);
return 0;
}