代码样例分别输出4,36,0
#include <bits/stdc++.h>
#include <algorithm>
#include <bitset>
#include <cctype>
#include <cerrno>
#include <clocale>
#include <cmath>
#include <complex>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <deque>
#include <exception>
#include <fstream>
#include <functional>
#include <limits>
#include <list>
#include <map>
#define int long long
#include <iomanip>
#include <ios>
#include <iosfwd>
#include <iostream>
#include <istream>
#include <ostream>
#include <queue>
#include <set>
#include <sstream>
#include <stack>
#include <stdexcept>
#include <streambuf>
#include <cstring>
#include <utility>
#include <vector>
#include <cwchar>
#include <cwctype>
#include <complex.h>
#include <fenv.h>
#include <inttypes.h>
#include <stdbool.h>
#include <stdint.h>
#include <tgmath.h>
using namespace std;
const int N=1e6+10;
const int M=1e6+10;
const int L=5e3+10;
int f[L][L];
int dp[L];
int l[N];
int sum;
int p;
int s[L];
int fac[L];
int A[N];
void getfac(int n){
fac[0]=fac[1]=1;
for(int i=2;i<=n;i++) fac[i]=fac[i-1]*i%p;
}
void get_f(int n,int m){
f[0][0]=1;
for(int i=1;i<=n;i++){
for(int j=1;j<=m;j++){
f[i][j]=f[i-1][j-1]+f[i-1][j]*(j-1)%p;
f[i][j]%=p;
}
}
}
void get_dpn_and_sumn_and_s(int n){
for(int i=1;i<=n;i++){
if(s[l[i]]==0){
for(int j=1;j<=l[i];j++){
s[l[i]]+=f[l[i]][j]*A[j]%p;
s[l[i]]%=p;
}
}
}
for(int i=1;i<=n;i++){
for(int j=1;j<=l[i];j++){
dp[j]=f[l[i]][j]*((s[l[i-1]]-dp[j]*fac[j]%p+p)%p)%p;
}
cout<<'\n';
}
for(int i=1;i<=l[n];i++){
sum+=dp[i]*A[i]%p;
sum%=p;
}
}
void getA(int m){
A[0]=1;
for(int i=1;i<=m;i++){
A[i]=A[i-1]*(m-i+1)%p;
}
}
int n,m;
int maxx;
signed main(){
ios::sync_with_stdio(false);
cin.tie(0);
cout.tie(0);
cin>>n>>m>>p;
for(int i=1;i<=n;i++) {cin>>l[i]; maxx=max(maxx,l[i]);}
getA(m);
getfac(maxx);
get_f(maxx,maxx);
get_dpn_and_sumn_and_s(n);
cout<<sum;
return 0;
}