/**
 * Fetch a remote HTML page and extract its title, body markup and stylesheet links.
 *
 * The page is downloaded with cURL (mobile Safari user agent, up to 5 redirects).
 * HTML comments are stripped, the <title> text and the inner HTML of <body> are
 * extracted, root-relative stylesheet/image URLs are made absolute against the
 * requested origin, and the result is converted to UTF-8.
 *
 * @param string $url Page address; "http://" is prepended when no http(s) scheme is present.
 *
 * @return array Always contains 'list' (null) and 'status' (0 on failure, 1 on success).
 *               On failure 'info' holds the error message. On success 'info' holds the
 *               stylesheet <link> tags followed by the body HTML, and 'title' holds the
 *               page title (both UTF-8).
 */
function contentCollection($url){
    $data = array('list' => null, 'status' => 0);

    if (!$url) {
        $data['info'] = '请传入采集地址';
        return $data;
    }

    // Require a real "http://" / "https://" prefix (any case); a bare "^http" test
    // would wrongly accept hosts such as "httpbin.org" as already having a scheme.
    if (!preg_match("/^https?:\/\//i", $url)) {
        $url = 'http://' . $url;
    }

    // Scheme + authority of the requested URL, used to absolutise root-relative links.
    $origin = '';
    if (preg_match("/^https?:\/\/[^\/?#]+/i", $url, $host_ary)) {
        $origin = $host_ary[0];
    }

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    // Return the response as a string instead of printing it.
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    // Force a new connection rather than reusing a cached one.
    curl_setopt($ch, CURLOPT_FRESH_CONNECT, true);
    // Make curl_exec() fail on HTTP status >= 400 instead of returning the error page.
    curl_setopt($ch, CURLOPT_FAILONERROR, true);
    curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B137 Safari/601.1');
    // Seconds to wait while connecting (0 would wait forever).
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 5);

    // Maximum number of redirects.
    $redirects = 5;

    if (!ini_get('open_basedir') && !ini_get('safe_mode')) {
        // cURL may follow redirects on its own.
        curl_setopt($ch, CURLOPT_HEADER, false);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
        curl_setopt($ch, CURLOPT_MAXREDIRS, $redirects);
        $content = curl_exec($ch);
    } else {
        // CURLOPT_FOLLOWLOCATION is unavailable under open_basedir/safe_mode on old
        // PHP versions, so redirects are followed by hand.
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, false);
        curl_setopt($ch, CURLOPT_HEADER, true);
        curl_setopt($ch, CURLOPT_FORBID_REUSE, false);

        do {
            $content = curl_exec($ch);
            if ($content === false || curl_errno($ch)) {
                break;
            }

            $code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
            if (!in_array($code, array(301, 302, 303, 307, 308), true)) {
                break;
            }

            // Let cURL resolve the Location header: this copes with relative targets
            // and case-insensitive header names, unlike parsing the raw headers.
            $next_url = curl_getinfo($ch, CURLINFO_REDIRECT_URL);
            if (!$next_url) {
                break;
            }
            curl_setopt($ch, CURLOPT_URL, $next_url);
        } while (--$redirects);

        if (!$redirects) {
            // Release the handle on this early exit as well.
            curl_close($ch);
            $data['info'] = '重定向次数太多。';
            return $data;
        }

        // CURLOPT_HEADER is on in this branch: drop the response headers so only
        // the document itself is parsed below.
        if (is_string($content)) {
            $content = (string) substr($content, curl_getinfo($ch, CURLINFO_HEADER_SIZE));
        }
    }

    $http_code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    // Close the cURL handle and free its resources.
    curl_close($ch);

    if (200 != $http_code) {
        $data['info'] = '采集失败,http_code:' . $http_code;
        return $data;
    }

    // curl_exec() returns false on failure; normalise so the regex calls get a string.
    $content = (string) $content;

    // Strip HTML comments; keep the original text if PCRE fails (e.g. backtrack limit).
    $without_comments = preg_replace("/<\!--(.*?)-->/is", '', $content);
    if ($without_comments !== null) {
        $content = $without_comments;
    }

    // Group 1 = title text, group 3 = inner HTML of <body>.
    if (!preg_match("/<title[^>]*?>(.*?)<\/title>(.*)<body[^>]*?>(.*?)<\/body>/is", $content, $body)) {
        $data['info'] = '没有抓取到内容';
        return $data;
    }
    $title = $body[1];
    $collection_content = $body[3];

    // Escape "$" and "\" so the origin cannot be read as a backreference in the
    // preg_replace() replacement strings below.
    $origin_replacement = '${1}' . addcslashes($origin, '\\$') . '/';

    // Collect stylesheet <link> tags and make root-relative hrefs absolute.
    // The (?!\/) look-ahead leaves protocol-relative "//host/..." URLs untouched.
    $link_str = '';
    preg_match_all("/<link[^>]*rel=['\"]stylesheet['\"][^>]*>/is", $content, $link);
    if (!empty($link[0])) {
        $absolute_links = preg_replace("/(href=['\"])\/(?!\/)/", $origin_replacement, $link[0]);
        $link_str = implode('', $absolute_links);
    }

    // Same treatment for root-relative image sources inside the body.
    $collection_content = preg_replace("/(<img[^>]*?src=['\"])\/(?!\/)/", $origin_replacement, $collection_content);
    $collection_content = $link_str . $collection_content;

    // UTF-8 is tried before GBK: UTF-8 multi-byte text is frequently also a valid
    // GBK byte sequence, so listing GBK first can mis-detect UTF-8 pages.
    $source_encodings = 'UTF-8,GBK,ASCII';

    $data['status'] = 1;
    $data['info'] = mb_convert_encoding($collection_content, 'utf-8', $source_encodings);
    // Convert the title as well so it shares the encoding of the returned markup.
    $data['title'] = mb_convert_encoding($title, 'utf-8', $source_encodings);
    return $data;
}