php curl 内容采集

时间 2019-11-13
标签：php curl 内容采集　栏目：PHP　繁體版
原文：原文链接
/**
 * Fetch a web page with cURL and extract its title, <body> markup and stylesheet links.
 *
 * Root-relative stylesheet hrefs and image srcs are rewritten to absolute URLs
 * based on the origin of the final URL (after redirects), and the page is
 * normalised to UTF-8 before extraction so that both 'title' and 'info' share
 * one encoding.
 *
 * @param string $url Page address; "http://" is prepended when it has no http(s) scheme.
 * @return array Keys: 'list' (always null), 'status' (1 on success, 0 on failure),
 *               'info' (error message on failure; on success the stylesheet <link>
 *               tags followed by the inner HTML of <body>), and 'title'
 *               (inner text of <title>, only present on success).
 */
function contentCollection($url){
    $data = array('list' => null, 'status' => 0);
    if (!$url) {
        $data['info'] = '请传入采集地址';
        return $data;
    }

    // Require a real "http://" or "https://" prefix; a bare "http" prefix test
    // would treat hosts such as "httpbin.org" as already having a scheme.
    if (!preg_match('~^https?://~i', $url)) {
        $url = 'http://' . $url;
    }

    // scheme://authority, stopping before any path, query or fragment.
    $origin_pattern = '~^https?://[^/?#]+~i';
    if (!preg_match($origin_pattern, $url, $origin_match)) {
        $data['info'] = '采集地址无效';
        return $data;
    }
    $origin = $origin_match[0];

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); // return the response as a string instead of printing it
    curl_setopt($ch, CURLOPT_FRESH_CONNECT, true);  // force a new connection instead of a cached one
    curl_setopt($ch, CURLOPT_FAILONERROR, true);    // make curl_exec() fail on HTTP status >= 400
    curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B137 Safari/601.1');
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 5);    // seconds to wait while connecting
    // The URL is caller-supplied: never let it (or a redirect) leave http/https.
    curl_setopt($ch, CURLOPT_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
    curl_setopt($ch, CURLOPT_REDIR_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);

    $max_redirects = 5;
    $too_many_redirects = false;

    if (!ini_get('open_basedir') && !ini_get('safe_mode')) {
        // cURL may follow redirects on its own.
        curl_setopt($ch, CURLOPT_HEADER, false);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
        curl_setopt($ch, CURLOPT_MAXREDIRS, $max_redirects);
        $content = curl_exec($ch);
    } else {
        // Follow redirects by hand: request headers too, read the Location
        // header ourselves and re-issue the request.
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, false);
        curl_setopt($ch, CURLOPT_HEADER, true);
        $redirect_codes = array(301, 302, 303, 307, 308);
        $followed = 0;
        while (true) {
            $response = curl_exec($ch);
            if ($response === false) {
                $content = false;
                break;
            }
            // Split headers from the body so headers never leak into the page content.
            $header_size = (int) curl_getinfo($ch, CURLINFO_HEADER_SIZE);
            $headers = (string) substr($response, 0, $header_size);
            $content = (string) substr($response, $header_size);

            $code = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
            if (!in_array($code, $redirect_codes, true)) {
                break;
            }
            // Header names are case-insensitive; require a non-empty value.
            if (!preg_match('/^(?:Location|URI):[ \t]*(\S.*?)[ \t]*\r?$/mi', $headers, $matches)) {
                break;
            }
            if ($followed >= $max_redirects) {
                $too_many_redirects = true;
                break;
            }
            $followed++;

            $location = $matches[1];
            if (!preg_match('~^[a-z][a-z0-9+.\-]*:~i', $location)) {
                // Relative reference: resolve it against the URL just requested.
                $current_url = (string) curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
                $base_origin = preg_match($origin_pattern, $current_url, $current_origin)
                    ? $current_origin[0]
                    : $origin;
                if (strpos($location, '//') === 0) {
                    // Protocol-relative: reuse the current scheme.
                    $location = substr($base_origin, 0, strpos($base_origin, ':')) . ':' . $location;
                } elseif ($location[0] === '/') {
                    $location = $base_origin . $location;
                } else {
                    $path = parse_url($current_url, PHP_URL_PATH);
                    if (!is_string($path) || $path === '') {
                        $path = '/';
                    }
                    if ($location[0] !== '?' && $location[0] !== '#') {
                        // Path-relative: keep only the directory part of the current path.
                        $path = substr($path, 0, (int) strrpos($path, '/') + 1);
                    }
                    $location = $base_origin . $path . $location;
                }
            }
            curl_setopt($ch, CURLOPT_URL, $location);
        }
    }

    $http_code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    $effective_url = (string) curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
    // Release the handle on every path, including the error returns below.
    curl_close($ch);

    if ($too_many_redirects) {
        $data['info'] = '重定向次数太多。';
        return $data;
    }
    if ($content === false || 200 != $http_code) {
        $data['info'] = '采集失败,http_code:' . $http_code;
        return $data;
    }

    // Root-relative links must resolve against the host that actually served the page.
    if (preg_match($origin_pattern, $effective_url, $final_origin)) {
        $origin = $final_origin[0];
    }

    // Normalise to UTF-8 once, up front. UTF-8 is tried before GBK because
    // many UTF-8 byte sequences are also valid GBK, not the other way round.
    $encoding = mb_detect_encoding($content, array('ASCII', 'UTF-8', 'GBK'), true);
    if ($encoding !== false && $encoding !== 'ASCII' && $encoding !== 'UTF-8') {
        $content = mb_convert_encoding($content, 'UTF-8', $encoding);
    }

    // Drop HTML comments; keep the original text if PCRE gives up (returns null).
    $without_comments = preg_replace('/<!--.*?-->/s', '', $content);
    if ($without_comments !== null) {
        $content = $without_comments;
    }

    if (!preg_match('/<title[^>]*>(.*?)<\/title>.*<body[^>]*>(.*?)<\/body>/is', $content, $page)) {
        $data['info'] = '没有抓取到内容';
        return $data;
    }
    $title = $page[1];
    $collection_content = $page[2];

    // Escape "\" and "$" so the origin is inserted literally by preg_replace.
    $replacement = '${1}' . addcslashes($origin, '\\$') . '/';

    // Collect stylesheet <link> tags and make root-relative hrefs absolute.
    // The (?!/) lookahead leaves protocol-relative "//host/..." URLs untouched.
    $link_str = '';
    if (preg_match_all('/<link[^>]*rel=[\'"]stylesheet[\'"][^>]*>/is', $content, $link) && $link[0]) {
        $links = preg_replace('~(href=[\'"])/(?!/)~i', $replacement, $link[0]);
        $link_str = implode('', is_array($links) ? $links : $link[0]);
    }

    // Same rewrite for root-relative image sources inside the body.
    $rewritten = preg_replace('~(<img[^>]*?src=[\'"])/(?!/)~i', $replacement, $collection_content);
    if ($rewritten !== null) {
        $collection_content = $rewritten;
    }

    $data['status'] = 1;
    $data['info'] = $link_str . $collection_content;
    $data['title'] = $title;
    return $data;
}