我用如下 C++ 代码抓取网页时,服务器返回 400 Bad Request,一直无法解决,大佬们帮忙看看哪里出了问题。
#include<bits/stdc++.h>
#include<Winsock2.h>
#include<Windows.h>
using namespace std;
#ifndef HTTP_H
#define HTTP_H
class CHttp{
public:
std::string m_host; //域名
std::string m_object; //资源路径
bool m_bHttps;
SOCKET m_socket; //套接字
public:
//构造函数
CHttp(){
m_bHttps = false; //默认一开始不是https协议
m_socket = 0;
}
//析构函数
~CHttp(){
}
//解析URL函数
bool AnalyseURL(std::string url){
//https://www.microsoft.com/zh-cn/download/confirmation.aspx?id=40770 示例 https
//http://www.163.com/ 示例 http
//将字符串分别转化为大、小写的函数
//toupper(); tolower(); 因为有些网站用的是大写的HTTPS\HTTP
std::string str = url.substr(0, 8); //substr(string, start<,length>):从string的start位置开始提取字符串,length:要提取字符串的长度
if ("https://" == str)
{
m_bHttps = true;
}
else if (str.find("http://") !=std::string::npos)
{
m_bHttps = false;
}
else
return false;
//找主机网址的反斜杠位置
int nPos = url.find('/', m_bHttps ? 8 : 7); //如果m_bHttps为真,那么从第8个之后的位置开始找,否则从第七个位置之后开始找
if (nPos == std::string::npos){
//http://www.163.com
m_host=url.substr(m_bHttps ? 8 : 7); //例如上面这种,如果主机后面没有/,那么直接从http://开始截取,截到最后
m_object = "/"; //像上面这种没有资源路径,那我们就给他们一个斜杠
}
else
{
//如果是这种情况https://www.microsoft.com/zh-cn/download/confirmation.aspx?id=40770
m_host = url.substr(m_bHttps ? 8 : 7, nPos - (m_bHttps ? 8 : 7));
m_object = url.substr(nPos);
}
if (m_host.empty()) //如果主机内容为空,意味着截取不到
return false;
return true;
}
//初始化网络
bool Init(){
WSADATA wd;
if (0 != WSAStartup(MAKEWORD(2, 2), &wd))
return false;
if (LOBYTE(wd.wVersion) != 2 || HIBYTE(wd.wVersion) != 2) //判断请求的是不是2.2版本
return false;
//创建套接字
m_socket=socket(AF_INET, SOCK_STREAM, 0);
}
//连接服务器
bool Connect(){
//将域名解析成对应的IP地址
HOSTENT * p=gethostbyname(m_host.c_str()); //P存放的内容就是由主机域名解析好后的ip地址,
if (p == NULL)
return false; //解析失败
//连接服务器
sockaddr_in sa;
sa.sin_family = AF_INET;
sa.sin_port = htons(443);
memcpy(&sa.sin_addr, p->h_addr, 4);
if (SOCKET_ERROR == connect(m_socket, (sockaddr*)&sa, sizeof(sockaddr)))
return false;
return true;
}
bool GetHtml(std::string& html){
std::string get;
get += "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0\r\n";
get += "Sec-Fetch-Dest: document\r\n";
get += "Connection: Close\r\n";
get += "\r\n";
//发送GET请求
if(SOCKET_ERROR==send(m_socket, get.c_str(), get.length(), 0)) std::cout << "GET请求发送失败" << std::endl;
//接收数据
char ch = 0;
while (recv(m_socket, &ch, sizeof(ch), 0)) html += ch;
return true;
}
};
string StartCatch(string url){
queue<string> q;
CHttp http;
string html;
q.push(url); //将获取到的url队列放入queue中
while (!q.empty()){
string currentUrl = q.front(); //将当前队列中的第一个url取出来
q.pop();
http.Init();
http.AnalyseURL(currentUrl);
cout << http.m_host << "\t\t" << http.m_object << endl;
if (false == http.Connect()) return "CONNECT_FALL";
http.GetHtml(html);
}
return html;
}
#endif
注:调用代码是
cout<<StartCatch("https://luogu.com.cn");
返回结果(Cloudflare 立大功):
luogu.com.cn /
HTTP/1.1 400 Bad Request
Server: cloudflare
Date: Mon, 03 Feb 2025 14:01:14 GMT
Content-Type: text/html
Content-Length: 155
Connection: close
CF-RAY: -
<html>
<head><title>400 Bad Request</title></head>
<body>
<center><h1>400 Bad Request</h1></center>
<hr><center>cloudflare</center>
</body>
</html>