在用curl抓取网页内容的时候,经常要知道,网页返回的请求头信息,和请求的相关信息,特别是在请求过程中存在重定向的时候获取请求返回头信息对分析请求内容很有帮助
下面就是一个请求中存在重定向的例子,我们的目的是要获取最终实际请求的url地址[php]
$url='http://www.appchina.com/market/r/489267/com.appshare.android.ilisten.vapk?c=aplus.direct&uid=gAJ9cQEu1TlyZxsXN-aB4RaanvFL6t6Bj-vj0rIBs&p=aplus.detail&m=redirect';
$ch=curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
//curl_setopt($ch, CURLOPT_POST, 1);
//curl_setopt($ch, CURLOPT_POSTFIELDS, $params);
curl_setopt($ch, CURLOPT_HEADER, 1);//返回response头部信息
curl_setopt($ch, CURLOPT_NOBODY, 1);//不返回response body内容
//curl_setopt($ch, CURLOPT_MAXREDIRS, 1);//设置请求最多重定向的次数
curl_setopt($ch,CURLOPT_RETURNTRANSFER,1);//不直接输出response
curl_setopt($ch, CURLOPT_FOLLOWLOCATION,1);//如果返回的response 头部中存在Location值,就会递归请求
$content=curl_exec($ch);
$rinfo=curl_getinfo($ch);
echo $content,"</br>";
echo "<hr>";
print_r($rinfo);
[plain]
HTTP/1.1 200 OKServer: nginxDate: Sat, 22 Dec 2012 06:17:44 GMTContent-Type: application/vnd.android.package-archiveConnection: closeLast-Modified: Mon, 03 Dec 2012 16:00:00 GMTExpires: Tue, 03 Dec 2013 16:00:00 GMTCache-Control: max-age=31536000Content-Length: 2142149
Array( [url] => <a href="http://www.d.appchina.com/McDonald/r/489267/com.appshare.android.ilisten.vapk?c=aplus.direct&uid=gAJ9cQEu1TlyZxsXN-aB4RaanvFL6t6Bj-vj0rIBs&p=aplus.detail&m=redirect" target="_blank">http://www.d.appchina.com/McDonald/r/489267/com.appshare.android.ilisten.vapk?c=aplus.direct&uid=gAJ9cQEu1TlyZxsXN-aB4RaanvFL6t6Bj-vj0rIBs&p=aplus.detail&m=redirect</a> [content_type] => application/vnd.android.package-archive [http_code] => 200 [header_size] => 289 [request_size] => 196 [filetime] => -1 [ssl_verify_result] => 0 [redirect_count] => 0 [total_time] => 0.171621 [namelookup_time] => 0.135256 [connect_time] => 0.152913 [pretransfer_time] => 0.152916 [size_upload] => 0 [size_download] => 0 [speed_download] => 0 [speed_upload] => 0 [download_content_length] => 2142149 [upload_content_length] => 0 [starttransfer_time] => 0.171582 [redirect_time] => 0 [certinfo] => Array ( ))
[php]
$url='http://www.appchina.com/market/r/489267/com.appshare.android.ilisten.vapk?c=aplus.direct&uid=gAJ9cQEu1TlyZxsXN-aB4RaanvFL6t6Bj-vj0rIBs&p=aplus.detail&m=redirect';
[php] view plaincopy
$realUrl=getRedirectLocation($url);
echo "</br>--->",$realUrl;
function getRedirectLocation($url){
$realUrl=$url;
echo $url,"</br>";
$ch=curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_HEADER, 1);curl_setopt($ch, CURLOPT_TIMEOUT, 3);//设置curl执行时间不超过3秒
//curl_setopt($ch, CURLOPT_NOBODY, 1);//这行不能要,如果添上,那么在遇到302重定向的时候就会得不到真正的请求url
curl_setopt($ch,CURLOPT_RETURNTRANSFER,1);
$content=curl_exec($ch);
//echo $content;
$rinfo=curl_getinfo($ch);
$matches=array();
if(preg_match('/Location:\s+?(.+?)\s+?/', $content,$matches)){
//echo $matches[1],"</br>";
unset($content);
$realUrl=getRedirectLocation($matches[1]);
}
if(isset($content)){
unset($content);
}
return $realUrl;
}
给各位介绍一下Curl多线程实例与原理。不对之处请指教
相信许多人对php手册中语焉不详的curl_multi一族的函数头疼不已,它们文档少,给的例子 更是简单的让你无从借鉴,我也曾经找了许多网页,都没见一个完整的应用例子。curl_multi_add_handle curl_multi_close curl_multi_exec curl_multi_getcontent curl_multi_info_read curl_multi_init curl_multi_remove_handle curl_multi_select 一般来说,想到要用这些函数时,目的显然应该是要同时请求多个url,而不是一个一个依次请求,否则不如自己循环去调curl_exec好了。步骤总结如下:第一步:调用curl_multi_init第二步:循环调用curl_multi_add_handle这一步需要注意的是,curl_multi_add_handle的第二个参数是由curl_init而来的子handle。第三步:持续调用curl_multi_exec第四步:根据需要循环调用curl_multi_getcontent获取结果第五步:调用curl_multi_remove_handle,并为每个字handle调用curl_close第六步:调用curl_multi_close 这里有PHP手册上的例子:[php]
<?php
// 创建一对cURL资源
$ch1 = curl_init();
$ch2 = curl_init();
// 设置URL和相应的选项
curl_setopt($ch1, CURLOPT_URL, "http://www.jb51.net/");
curl_setopt($ch1, CURLOPT_HEADER, 0);
curl_setopt($ch2, CURLOPT_URL, "http://www.php.net/");
curl_setopt($ch2, CURLOPT_HEADER, 0);
// 创建批处理cURL句柄
$mh = curl_multi_init();
// 增加2个句柄
curl_multi_add_handle($mh,$ch1);
curl_multi_add_handle($mh,$ch2);
$active = null;
// 执行批处理句柄
do {
$mrc = curl_multi_exec($mh, $active);
} while ($mrc == CURLM_CALL_MULTI_PERFORM);
while ($active && $mrc == CURLM_OK) {
if (curl_multi_select($mh) != -1) {
do {
$mrc = curl_multi_exec($mh, $active);
} while ($mrc == CURLM_CALL_MULTI_PERFORM);
}
}
// 关闭全部句柄
curl_multi_remove_handle($mh, $ch1);
curl_multi_remove_handle($mh, $ch2);
curl_multi_close($mh);
?>
[php]
do {
$mrc = curl_multi_exec($mh,$active);
} while ($mrc == CURLM_CALL_MULTI_PERFORM);
while ($active and $mrc == CURLM_OK) {
if (curl_multi_select($mh) != -1) {
do {
$mrc = curl_multi_exec($mh, $active);
} while ($mrc == CURLM_CALL_MULTI_PERFORM);
}
}
curl不使用文件存取cookie php使用curl获取cookie示例
[php]
/*-----保存COOKIE-----*/
$url = 'www.xxx.com'; //url地址
$post = "id=user&pwd=123456"; //POST数据
$ch = curl_init($url); //初始化
curl_setopt($ch,CURLOPT_HEADER,1); //将头文件的信息作为数据流输出
curl_setopt($ch,CURLOPT_RETURNTRANSFER,1); //返回获取的输出文本流
curl_setopt($ch,CURLOPT_POSTFIELDS,$post); //发送POST数据
$content = curl_exec($ch); //执行curl并赋值给$content
preg_match('/Set-Cookie:(.*);/iU',$content,$str); //正则匹配
$cookie = $str[1]; //获得COOKIE(SESSIONID)
curl_close($ch); //关闭curl
/*-----使用COOKIE-----*/
curl_setopt($ch,CURLOPT_COOKIE,$cookie);
如下:
[php]
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, 'http://www.phpddt.com');
curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);
$con = curl_exec($ch);
curl_close($ch);
[php]
<?php
/**
* cURL multi批量处理
*
* @author mckee
* @link <a href="http://www.phpddt.com" target="_blank">http://www.phpddt.com</a> *
*/
$url_array = array(
'http://www.phpddt.com/',
'http://www.phpddt.com/php/627.html',
'/content/312816.html'
);
$handles = $contents = array();
//初始化curl multi对象
$mh = curl_multi_init();
//添加curl 批处理会话
foreach($url_array as $key => $url)
{
$handles[$key] = curl_init($url);
curl_setopt($handles[$key], CURLOPT_RETURNTRANSFER, 1);
curl_setopt($handles[$key], CURLOPT_TIMEOUT, 10);
curl_multi_add_handle($mh, $handles[$key]);
}
//======================执行批处理句柄=================================
$active = null;
do {
$mrc = curl_multi_exec($mh, $active);
} while ($mrc == CURLM_CALL_MULTI_PERFORM);
while ($active and $mrc == CURLM_OK) {
if(curl_multi_select($mh) === -1){
usleep(100);
}
do {
$mrc = curl_multi_exec($mh, $active);
} while ($mrc == CURLM_CALL_MULTI_PERFORM);
}
//====================================================================
//获取批处理内容
foreach($handles as $i => $ch)
{
$content = curl_multi_getcontent($ch);
$contents[$i] = curl_errno($ch) == 0 ? $content : '';
}
//移除批处理句柄
foreach($handles as $ch)
{
curl_multi_remove_handle($mh, $ch);
}
//关闭批处理句柄
curl_multi_close($mh);
print_r($contents);
[php]
do { $n=curl_multi_exec($mh,$active); } while ($active);
[php]
/********************** curl 系列 ***********************/
//直接通过curl方式取得数据(包含POST、HEADER等)
/*
* $url: 如果非数组,则为http;如是数组,则为https
* $header: 头文件
* $post: post方式提交 array形式
* $cookies: 0默认无cookie,1为设置,2为获取
*/
public function curl_allinfo($urls, $header = FALSE, $post = FALSE, $cookies = 0) {
$url = is_array($urls) ? $urls['0'] : $urls;
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
//带header方式提交
if($header != FALSE){
curl_setopt($ch, CURLOPT_HTTPHEADER, $header);
}
//post提交方式
if($post != FALSE){
curl_setopt($ch, CURLOPT_POST, 1);
curl_setopt($ch, CURLOPT_POSTFIELDS, $post);
}
if($cookies == 1){
curl_setopt($ch, CURLOPT_COOKIEJAR, "cookiefile");
}else if($cookies == 2){
curl_setopt($ch, CURLOPT_COOKIEFILE, "cookiefile");
}
if(is_array($urls)){
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
}
$data = curl_exec($ch);
curl_close($ch);
return $data;
}
cURL 是一个功能强大的PHP库,使用PHP的cURL库可以简单和有效地抓取网页并采集内容,设置cookie完成模拟登录网页,curl提供了丰富的函数,开发者可以从PHP手册中获取更多关于cURL信息。本文以模拟登录开源中国(oschina)为例,和大家分享cURL的使用。
PHP的curl()在抓取网页的效率方面是比较高的,而且支持多线程,而file_get_contents()效率就要稍低些,当然,使用curl时需要开启下curl扩展。代码实战先来看登录部分的代码:[php]
//模拟登录
function login_post($url, $cookie, $post) {
$curl = curl_init();//初始化curl模块
curl_setopt($curl, CURLOPT_URL, $url);//登录提交的地址
curl_setopt($curl, CURLOPT_HEADER, 0);//是否显示头信息
curl_setopt($curl, CURLOPT_RETURNTRANSFER, 0);//是否自动显示返回的信息
curl_setopt($curl, CURLOPT_COOKIEJAR, $cookie); //设置Cookie信息保存在指定的文件中
curl_setopt($curl, CURLOPT_POST, 1);//post方式提交
curl_setopt($curl, CURLOPT_POSTFIELDS, http_build_query($post));//要提交的信息
curl_exec($curl);//执行cURL
curl_close($curl);//关闭cURL资源,并且释放系统资源
}
[php]
//登录成功后获取数据
function get_content($url, $cookie) {
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_HEADER, 0);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_COOKIEFILE, $cookie); //读取cookie
$rs = curl_exec($ch); //执行cURL抓取页面内容
curl_close($ch);
return $rs;
}
[php]
//设置post的数据
$post = array (
'email' => 'oschina账户',
'pwd' => 'oschina密码',
'goto_page' => '/my',
'error_page' => '/login',
'save_login' => '1',
'submit' => '现在登录'
);
//登录地址
$url = "http://m.jb51.net/action/user/login";
//设置cookie保存路径
$cookie = dirname(__FILE__) . '/cookie_jb51.txt';
//登录后要获取信息的地址
$url2 = "http://m.jb51.net/my";
//模拟登录
login_post($url, $cookie, $post);
//获取登录页的信息
$content = get_content($url2, $cookie);
//删除cookie文件
@ unlink($cookie);
//匹配页面信息
$preg = "/<td class='portrait'>(.*)<\/td>/i";
preg_match_all($preg, $content, $arr);
$str = $arr[1][0];
//输出内容
echo $str;
网上的很多模拟登录程序,大都是通过服务程序apache之类的运行,获取到验证码之后显示在网页上,然后填上再POST出去,这样虽然看起来很友好,但是既然模拟登录,登录后所干的事情就不一定是短时间完成的,所以这就要受到php最大执行时间的限制,而且有些操作还有可能权限不足。
本文提供了一个程序实例,思路就是获取到验证码之后把验证码存储为一个图片,然后程序休眠20秒,在20秒之后由用户手动查看图片,并把验证码填写到code.txt文件中,20秒休眠完成后,程序会读code.txt的验证码,这样再带着验证码进行登录操作。具体代码如下:[php]
/**
* 模拟登录
*/
//初始化变量
$cookie_file = "tmp.cookie";
$login_url = "http://xxx.com/logon.php";
$verify_code_url = "http://xxx.com/verifyCode.php";
echo "正在获取COOKIE...\n";
$curlj = curl_init();
$timeout = 5;
curl_setopt($curl, CURLOPT_URL, $login_url);
curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($curl, CURLOPT_CONNECTTIMEOUT, $timeout);
curl_setopt($curl,CURLOPT_COOKIEJAR,$cookie_file); //获取COOKIE并存储
$contents = curl_exec($curl);
curl_close($curl);
echo "COOKIE获取完成,正在取验证码...\n";
//取出验证码
$curl = curl_init();
curl_setopt($curl, CURLOPT_URL, $verify_code_url);
curl_setopt($curl, CURLOPT_COOKIEFILE, $cookie_file);
curl_setopt($curl, CURLOPT_HEADER, 0);
curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
$img = curl_exec($curl);
curl_close($curl);
$fp = fopen("verifyCode.jpg","w");
fwrite($fp,$img);
fclose($fp);
echo "验证码取出完成,正在休眠,20秒内请把验证码填入code.txt并保存\n";
//停止运行20秒
sleep(20);
echo "休眠完成,开始取验证码...\n";
$code = file_get_contents("code.txt");
echo "验证码成功取出:$code\n";
echo "正在准备模拟登录...\n";
$post = "username=maben&pwd=hahahaha&verifycode=$code";
$curl = curl_init();
curl_setopt($curl, CURLOPT_URL, $url);
curl_setopt($curl, CURLOPT_HEADER, false);
curl_setopt($curl, CURLOPT_RETURNTRANSFER,1);
curl_setopt($curl, CURLOPT_POSTFIELDS, $post);
curl_setopt($curl, CURLOPT_COOKIEFILE, $cookie_file);
$result=curl_exec($curl);
curl_close($curl);
//这一块根据自己抓包获取到的网站上的数据来做判断
if(substr_count($result,"登录成功")){
echo "登录成功\n";
}else{
echo "登录失败\n";
exit;
}
//OK,开始做你想做的事吧。。。。。