当前位置：网站首页 > PHP笔记 > 正文

PHP初级教程：PHP采集、抓取网页内容

作者：文煞 | 发布时间：2024-05-27 | 分类：PHP笔记 | 浏览：5261

温馨提示：手机扫码可阅读当前文章!
文章简介：PHP是可以对网页内容进行采集，但是考虑到PHP的执行效率，还是推荐使用关关采集器或者火车头这类采集器。最好就是原创内容了，不过个人能力有限，原创内容可能跟不上网站的需要。以下是使用PHP采集网页内容的代码示例：我们可以使用 file_ge...

PHP是可以对网页内容进行采集的，但是考虑到PHP的执行效率，还是更推荐使用关关采集器或者火车头这类专业采集器。当然，最好还是发布原创内容，不过个人能力有限，原创内容的产出可能跟不上网站的需要。

以下是使用PHP采集网页内容的代码示例：

我们可以使用 file_get_contents() 函数获取目标页面的HTML内容，然后用 DOMDocument 类解析HTML内容，最后通过 XPath 表达式找到所需要的节点，并获取节点的文本内容。

达到目的以后，我们可以输出或者保存获取到的标题和内容。不过在实际应用中，我们还需要考虑异常处理、编码转换、防止被网站屏蔽IP等问题。

<?php

// Fetch the raw HTML of the target page.
$url = "https://www.wensha.info";

$html = file_get_contents($url);

// Defaults used when the page, or one of the nodes, cannot be obtained.
$title = '';
$content = '';

// file_get_contents() returns false on failure, and DOMDocument::loadHTML()
// rejects an empty string, so both cases are reported instead of parsed.
if ($html === false || $html === '') {
    echo "Error accessing URL: " . $url;
} else {
    // Real-world HTML is rarely well-formed; collect libxml's complaints
    // internally instead of letting every one surface as a PHP warning.
    $previousErrorMode = libxml_use_internal_errors(true);

    // Parse the HTML and prepare an XPath evaluator over it.
    $doc = new DOMDocument();
    $doc->loadHTML($html);

    libxml_clear_errors();
    libxml_use_internal_errors($previousErrorMode);

    $xpath = new DOMXPath($doc);

    // item(0) is null when nothing matched, so test before dereferencing.
    $titleNode = $xpath->query("//title")->item(0);
    if ($titleNode !== null) {
        $title = $titleNode->nodeValue;
    }

    $contentNode = $xpath->query("//div[@class='content']")->item(0);
    if ($contentNode !== null) {
        $content = $contentNode->nodeValue;
    }

    // Print what was extracted.
    echo "标题：" . $title . " ";

    echo "内容：" . $content;
}

PHP网站内容采集教程、PHP网站内容采集工具推荐

PHP可以利用其内置的函数和第三方库来实现网站内容采集，常见的方法包括：

使用 file_get_contents() 函数获取目标页面的HTML内容。

// Download the page body in one call; the result must be compared with
// `=== false` before use, because that is what a failed request yields.
$html = file_get_contents('https://www.wensha.info/');

使用 curl 扩展库来发送 HTTP 请求并获取响应。

// Create the handle and configure it in a single call: set the target URL
// and ask for the response body to be returned rather than printed.
$ch = curl_init();
curl_setopt_array($ch, [
    CURLOPT_URL            => "https://www.wensha.info",
    CURLOPT_RETURNTRANSFER => 1,
]);

// Perform the request; $html receives the response body.
$html = curl_exec($ch);

// Release the handle once the transfer is done.
curl_close($ch);

使用第三方库，例如 Goutte 或 Simple HTML DOM Parser 等库，来解析 HTML 页面。以 Goutte 为例，示例代码如下所示：

// Goutte is a third-party package; this import only resolves when it is installed.
use Goutte\Client;

// Create the client and issue a GET request for the target page.
$client = new Client();

$crawler = $client->request('GET', 'https://www.wensha.info');

// Use CSS selectors to pull out the data we need, e.g. the title and the body text.
// NOTE(review): presumably text() fails when a selector matches nothing — confirm against the Goutte docs.

$title = $crawler->filter('title')->text();

$content = $crawler->filter('.content')->text();

以上仅是基础实现方式，实际应用中还需要考虑反爬虫机制、异常处理、编码转换等问题。另外，应该注意遵守相关法律法规，尊重被采集网站的权益。

PHP采集的实际应用：

以下代码用于采集某页面中的所有URL，并将其统一处理为“http(s)://域名”的格式，再据此获取各网站的TKD（Title、Keywords、Description）信息。本代码原本是为辅助之前开发的ZBLOG插件（E58分类目录）而写的，是一个简单采集网站信息并写入zblog数据库的小工具，省去了人工输入的麻烦和低效！

图片

/**
 * Collect the distinct "scheme://host" origins referenced by a page.
 *
 * Downloads $url with cURL (SSL certificate verification is switched off),
 * scans the response body for URL-like strings and reduces every one that
 * validates as an absolute URL to its scheme and host.
 *
 * @param string $url Page to scan.
 * @return array List of unique origins in order of first appearance;
 *               an empty array when the page cannot be fetched.
 */
function e58_dir_cj_main($url){
    // Fetch the page body with cURL.
    $curl = curl_init($url);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);

    // Skip SSL certificate verification.
    curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, false);

    $response = curl_exec($curl);
    $curlError = curl_error($curl);

    // Release the handle before any return so the error path does not leak it.
    curl_close($curl);

    if ($response === false || $curlError) {
        echo "Error accessing URL: " . $curlError;
        return array();
    }

    // Find every URL-like token; the scheme is optional in the pattern, but
    // scheme-less matches are discarded by the validation below.
    preg_match_all('/(?:https?:\/\/)?(?:[\w-]+\.)+[\w-]+/', $response, $matches);

    // Origins are stored as array keys, which de-duplicates them while
    // keeping first-seen order.
    $seenOrigins = array();

    foreach ($matches[0] as $match) {
        // Skip anything that is not a well-formed absolute URL.
        if (!filter_var($match, FILTER_VALIDATE_URL)) {
            continue;
        }

        $parsedUrl = parse_url($match);

        // Both parts are needed to rebuild the origin.
        if (!isset($parsedUrl['scheme'], $parsedUrl['host'])) {
            continue;
        }

        $seenOrigins[$parsedUrl['scheme'] . '://' . $parsedUrl['host']] = true;
    }

    return array_keys($seenOrigins);
}

/**
 * Extracts a page's TKD data: Title, meta Keywords and meta Description.
 */
class PageTKDExtractor {

    /**
     * Download a page and extract its TKD data.
     *
     * SSL certificate verification is switched off and redirects are followed.
     * Failures are echoed, not thrown.
     *
     * @param string $url Page to fetch.
     * @return array|null Array with any of the keys 'title', 'keywords',
     *                    'description' (possibly empty), or null when the
     *                    request fails or the final HTTP status is not 200.
     */
    public static function extractTKD($url) {
        $curl = curl_init($url);
        curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);
        curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, false);
        curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true); // follow redirects

        $response = curl_exec($curl);
        $curlError = curl_error($curl);
        $httpStatus = curl_getinfo($curl, CURLINFO_HTTP_CODE);

        // Everything needed has been read; close the handle before any
        // return so the error paths do not leak it.
        curl_close($curl);

        if ($response === false || $curlError) {
            echo "Error accessing URL: " . $curlError;
            return null;
        }

        if ($httpStatus != 200) {
            echo "Error accessing URL: HTTP status code " . $httpStatus;
            return null;
        }

        return self::parseTKD(self::toUtf8($response));
    }

    /**
     * Extract TKD data from HTML that has already been fetched.
     *
     * Tag and attribute names are matched case-insensitively. The <title>
     * tag may carry attributes and span several lines. The meta tags may use
     * single or double quotes, either name/content order, and need not be
     * self-closing.
     *
     * @param string $html HTML source.
     * @return array Array with any of the keys 'title', 'keywords', 'description'.
     */
    public static function parseTKD($html) {
        $tkd = array();

        if (preg_match('/<title\b[^>]*>(.*?)<\/title>/is', $html, $title)) {
            $tkd['title'] = trim($title[1]);
        }

        $keywords = self::matchMetaContent($html, 'keywords');
        if ($keywords !== null) {
            $tkd['keywords'] = $keywords;
        }

        $description = self::matchMetaContent($html, 'description');
        if ($description !== null) {
            $tkd['description'] = $description;
        }

        return $tkd;
    }

    /**
     * Convert a response body to UTF-8 when it is detected as GBK or ISO-8859-1.
     *
     * The text is returned unchanged when it is already UTF-8, when the
     * encoding cannot be detected, or when the conversion fails.
     */
    private static function toUtf8($text) {
        $encoding = mb_detect_encoding($text, ['UTF-8', 'GBK', 'ISO-8859-1']);

        // mb_detect_encoding() yields false when no candidate fits; iconv()
        // must not be called with that as the source encoding.
        if ($encoding === false || $encoding === 'UTF-8') {
            return $text;
        }

        $converted = iconv($encoding, 'UTF-8//IGNORE', $text);

        return $converted === false ? $text : $converted;
    }

    /**
     * Return the trimmed content attribute of <meta name="$name">, or null
     * when no such tag is found.
     */
    private static function matchMetaContent($html, $name) {
        $quotedName = preg_quote($name, '/');

        $patterns = array(
            // name="..." content="..."
            '/<meta\s+name=(["\'])' . $quotedName . '\1\s+content=(["\'])(?<content>.*?)\2[^>]*>/is',
            // content="..." name="..." — the value may not contain angle
            // brackets here, so the lazy match cannot run on into a later tag.
            '/<meta\s+content=(["\'])(?<content>[^<>]*?)\1\s+name=(["\'])' . $quotedName . '\3[^>]*>/is',
        );

        foreach ($patterns as $pattern) {
            if (preg_match($pattern, $html, $m)) {
                return trim($m['content']);
            }
        }

        return null;
    }
}

对e58_dir_cj_main采集来的URL进行处理：

/**
 * Return the host part of a URL, with or without an http(s):// prefix.
 *
 * The host is everything after the optional scheme up to the first "/",
 * "?" or "#". A port, if present, is kept as part of the result.
 *
 * @param string $url URL or bare host name.
 * @return string The host, or an empty string when none can be found.
 */
function e58_dir_cj_domain($url){
    // Group 2 is the host; bail out cleanly when the pattern does not match
    // (e.g. empty input) instead of reading an undefined index.
    if (!preg_match("/^(https?:\/\/)?([^\/?#]+)/", $url, $matches)) {
        return '';
    }

    return $matches[2];
}

/**
 * Return the protocol a URL uses: 'https' or 'http'.
 *
 * Anything that does not start with "https://" is reported as 'http',
 * including URLs without a scheme. The prefix is compared
 * case-insensitively, so "HTTPS://" also counts as https.
 *
 * @param string $url URL to inspect.
 * @return string 'https' or 'http'.
 */
function e58_dir_cj_protocol($url){
    // A plain prefix comparison needs no match array, so scheme-less input
    // no longer reads an undefined index.
    if (strncasecmp($url, 'https://', 8) === 0) {
        return 'https';
    }

    return 'http';
}

使用上面的PHP类和函数：

// Driver: collect the origins linked from $url, fetch each site's TKD data
// and check whether the site is already stored.
// NOTE(review): $url, $zbp and TransferHTML() are not defined in this
// snippet; presumably they come from the surrounding Z-Blog plugin — confirm.
if (!empty($url)) {
    echo '正在采集中... ';

    $result = e58_dir_cj_main($url);

    if ($result) {
        foreach ($result as $v) {
            echo '>>>正在处理：' . htmlspecialchars($v) . ' ';

            $tkd = PageTKDExtractor::extractTKD($v);

            if ($tkd) {
                // A page can expose keywords/description without a <title>,
                // so fall back to an empty title instead of reading a
                // missing key.
                $raw_title = isset($tkd['title']) ? $tkd['title'] : '';

                // Normalise the title: strip whitespace runs and cap the
                // length at 40 characters.
                $site_title = mb_substr(preg_replace('/[\r\s]+/', '', trim(TransferHTML($raw_title, '[nohtml]'))), 0, 40, 'utf-8');

                $domain = e58_dir_cj_domain($v);
                $ip = gethostbyname($domain);
                $protocol = e58_dir_cj_protocol($v);

                // This title aborts the whole run, not just the current site.
                if ($site_title == '没有找到站点') {
                    break;
                }

                // Keywords: use the page's own (max 120 chars), otherwise
                // build them from the title and the domain.
                if (isset($tkd['keywords'])) {
                    $site_keywords = mb_substr(preg_replace('/[\r\s]+/', '', trim(TransferHTML($tkd['keywords'], '[nohtml]'))), 0, 120, 'utf-8');
                } else {
                    $site_keywords = $site_title.','.$domain;
                }

                // Description: use the page's own (max 500 chars), otherwise
                // build one from the title, domain and resolved IP.
                if (isset($tkd['description'])) {
                    $site_description = mb_substr(preg_replace('/[\r\s]+/', '', trim(TransferHTML($tkd['description'], '[nohtml]'))), 0, 500, 'utf-8');
                } else {
                    $site_description = $site_title.'，网站域名：'.$domain.'，网站服务器IP地址是：'.$ip;
                }

                echo ">>>网站标题: " . htmlspecialchars($site_title) . " >>>正在入库中...... ";

                // Storage step: query for an article whose e58_dir_domain
                // meta value equals this domain.
                $exist_post = $zbp->GetArticleList(array('*'),array(array('=','log_Status','0'),array('META_NAMEVALUE', 'log_Meta','e58_dir_domain',$domain ),),array('log_ViewNums'=>'DESC'),array(1),'');

                if ($exist_post) {
                    echo ">>>入库失败：该网站已经存在！ ........................................................................................................................ ";
                } else {
                    // Placeholder: the data insert code block goes here. It is
                    // expected to use $protocol, $site_keywords and $site_description.
                    echo ">>>恭喜你，网站入库成功！（由于无法判断一个网站的分类，所以选择随机分类入库，望知悉！) ........................................................................................................................ ";
                }
            } else {
                echo " >>>抱歉：未能成功获取到该站TDK信息！ ........................................................................................................................ ";
            }
        }
    }
}

就这样，我们简简单单地完成了这个PHP采集任务！当然，如果你需要更个性化、更高级的采集功能，也可以通过PHP代码来完成，但在编写代码时应该多留意PHP采集的性能和实际表现！

(.*?)<\>/i', $response, $title); preg_match('/<meta\s+name="keywords"\s+content="(.*?)"\s*\>/i', $response, $keywords); preg_match('/<meta\s+name="description"\s+content="(.*?)"\s*\>/i', $response, $description); // 提取Title、Keywords和Description文本并存入数组 if (isset($title[1])) { $tkd['title'] = trim($title[1]); } if (isset($keywords[1])) { $tkd['keywords'] = trim($keywords[1]); } if (isset($description[1])) { $tkd['description'] = trim($description[1]); } // 返回TKD数组 return $tkd; } } 对e58_dir_cj_main采集来的URL进行处理： function e58_dir_cj_domain($url){ preg_match("/^(https?:\/\/)?([^\/]+)/", $url, $matches); $domain = $matches[2]; return $domain; } function e58_dir_cj_protocol($url){ preg_match("/^https?:\/\//", $url, $matches); if ($matches[0] === 'https://') { $protocol = 'https'; } else { $protocol = 'http'; } return $protocol; } 使用上面的PHP类和对象： if (!empty($url)) { echo '正在采集中... '; $result = e58_dir_cj_main($url); if ($result) { foreach ($result as $v) { echo '>>>正在处理：' . htmlspecialchars($v) . ' '; $tkd = PageTKDExtractor::extractTKD($v); if ($tkd) { $site_title = mb_substr(preg_replace('/[\r\s]+/', '', trim(TransferHTML($tkd['title'] , '[nohtml]'))), 0, 40, 'utf-8'); $domain = e58_dir_cj_domain($v); $ip = gethostbyname($domain); $protocol = e58_dir_cj_protocol($v); if($site_title == '没有找到站点'){break;} if(isset($tkd['keywords'])){ $site_keywords = mb_substr(preg_replace('/[\r\s]+/', '', trim(TransferHTML($tkd['keywords'], '[nohtml]'))), 0, 120, 'utf-8'); }else{ $site_keywords = $site_title.','.$domain; } if(isset($tkd['description'])){ $site_description = mb_substr(preg_replace('/[\r\s]+/', '', trim(TransferHTML($tkd['description'], '[nohtml]'))), 0, 500, 'utf-8'); }else{ $site_description = $site_title.'，网站域名：'.$domain.'，网站服务器IP地址是：'.$ip; } echo ">>>网站标题: " . htmlspecialchars($site_title) . " >>>正在入库中...... 
"; // 入库代码 $exist_post = $zbp->GetArticleList(array('*'),array(array('=','log_Status','0'),array('META_NAMEVALUE', 'log_Meta','e58_dir_domain',$domain ),),array('log_ViewNums'=>'DESC'),array(1),''); if ($exist_post) { echo ">>>入库失败：该网站已经存在！ ........................................................................................................................ "; }else{ //数据入库代码块！ echo ">>>恭喜你，网站入库成功！（由于无法判断一个网站的分类，所以选择随机分类入库，望知悉！) ........................................................................................................................ "; } } else { echo " >>>抱歉：未能成功获取到该站TDK信息！ ........................................................................................................................ "; } } } } ?> 就这样简简单单的完成了这个简简单单的PHP采集任务！当然如果你需要更高个性化和高级的采集任务也是可以通过PHP代码来完成的，但是在编写代码的时候应该多注意PHP的采集性能以及表现情况！</meta\s+name="description"\s+content="(.*?)"\s*\></meta\s+name="keywords"\s+content="(.*?)"\s*\></\>

󰄼 赞 0 赏󰄯 分享

欢迎您，来自美国的朋友，您的IP：3.22.77.179，您的网络：Amazon_EC2服务器

上一篇：PHP过滤器：验证过滤器与清理过滤器的使用
下一篇：帝国cms：古诗古籍名句网站源码，全套源码

PHP初级教程：PHP采集、抓取网页内容

相关推荐

咨询在线客服