MENU

【PHP】精准判断百度是否收录文章(URL)

January 20, 2020 • Read: 597 • 程序源码

不是专业搞SEO,也不会啥高级玩法,原理和以前的一些方法一样,搜索关键词再判断是否存在收录,换汤不换药,结合百度的高级搜索能够更为精确一点点(限制地址,限制标题);本博客文章页判断是否收录也是走的这个.

百度高级搜索.png

简单封装的一个函数,感兴趣的可以玩玩~ 运行环境:PHP >= 7.0(代码用到了返回值类型声明和 ?? 运算符),并需要启用 curl 和 mbstring 扩展。

/**
 * Check whether Baidu appears to have indexed a page.
 *
 * Runs a Baidu advanced search ("tn=baiduadv") for the page title (q1),
 * restricted to one site (q6), and inspects the returned HTML for either the
 * "nothing found" notice or a result container.
 *
 * @param string $url   Page URL. Used to derive the host when $host is empty and
 *                      to download the page when $title is empty.
 * @param string $title Page title to search for; fetched from $url when empty.
 * @param string $host  Site to restrict the search to; derived from $url, or from
 *                      $_SERVER['HTTP_HOST'] when $url is empty as well.
 *
 * @return array 'state' (bool) is true only when a result container was found;
 *               'msg' (string) is 'success', 'error', 'no host', 'no title' or
 *               the "no related pages" notice; 'title' and 'host' are added once
 *               both have been resolved.
 */
function baidu($url = '', $title = '', $host = ''): array
{
    $ret = ['state' => false, 'msg' => 'error'];

    if (empty($host)) {
        if (empty($url)) {
            // Not set when running from the CLI.
            $host = $_SERVER['HTTP_HOST'] ?? '';
        } else {
            // parse_url() returns false for malformed URLs and an array without
            // a 'host' key for scheme-less ones such as "example.com/path".
            $info = parse_url($url);
            $host = is_array($info) && isset($info['host']) ? $info['host'] : '';
        }

        if (empty($host)) {
            $ret['msg'] = 'no host';

            return $ret;
        }
    }

    if (empty($title)) {
        // Without a URL there is no page to read the title from.
        $res = empty($url) ? '' : getCurl($url);
        if (!is_string($res) || $res === '') {
            $ret['msg'] = 'no title';

            return $ret;
        }

        // Page titles are frequently padded with whitespace/newlines.
        $title = trim(getStrBy2Char($res, 'title>', '</title'));
        if ($title === '') {
            $ret['msg'] = 'no title';

            return $ret;
        }
    }

    $ret['title'] = $title;
    $ret['host']  = $host;

    // Both values are user/page controlled, so both are URL-encoded.
    $checkUrl = 'https://www.baidu.com/s?q1=' . urlencode($title)
        . '&q2=&q3=&q4=&rn=10&lm=0&ct=0&ft=&q5=1&q6=' . urlencode($host)
        . '&tn=baiduadv';
    $response = getCurl($checkUrl);

    // Request failed: keep the default 'error' message.
    if (!is_string($response) || $response === '') {
        return $ret;
    }

    // Strict comparison: a match at offset 0 is still a match.
    if (mb_strpos($response, '很抱歉,没有找到') !== false) {
        $ret['msg'] = '没有找到相关网页';

        return $ret;
    }
    if (mb_strpos($response, 'result c-container ') !== false) {
        $ret['state'] = true;
        $ret['msg']   = 'success';
    }

    return $ret;
}

/**
 * Perform an HTTP request with cURL.
 *
 * Recognised keys of $opt (all optional):
 *  - 'cookie'  array   name => value pairs sent as the Cookie header
 *  - 'nobody'  bool    do not download the body (CURLOPT_NOBODY)
 *  - 'header'  bool    include response headers in the output
 *  - 'headers' array   extra request header lines
 *  - 'rtime'   int     total timeout in ms (default 10000)
 *  - 'ctime'   int     connect timeout in ms (default 10000)
 *  - 'refer'   string  Referer header (default https://www.baidu.com/)
 *  - 'UA'      string  User-Agent header
 *  - 'post'    array   when an array, sent as a url-encoded POST body
 *  - 'proxy'   array   ['ip' => ..., 'port' => ...]
 *  - 'verify'  bool    verify the TLS peer and host name (default false)
 *  - 'detail'  bool    return ['code', 'error', 'response'] instead of the body
 *
 * @param string $url Request URL.
 * @param array  $opt Options, see above.
 *
 * @return string|bool|array The curl_exec() result (false on failure), or the
 *                           detail array when $opt['detail'] is truthy.
 */
function getCurl($url, $opt = [])
{
    $wantDetail = !empty($opt['detail']);

    // Build "k1=v1; k2=v2" without a trailing separator.
    $pairs = [];
    if (isset($opt['cookie']) && is_array($opt['cookie'])) {
        foreach ($opt['cookie'] as $k => $v) {
            $pairs[] = $k . '=' . $v;
        }
    }
    $cookie = implode('; ', $pairs);

    $ch = curl_init();
    if ($ch === false) {
        return $wantDetail
            ? ['code' => 0, 'error' => 'curl_init failed', 'response' => false]
            : false;
    }

    // NOTE(review): TLS verification is off by default, as in the original
    // code, which allows man-in-the-middle tampering. Pass 'verify' => true to
    // enable it; consider making that the default.
    $verify = !empty($opt['verify']);

    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_COOKIE, $cookie);
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, $verify);
    curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, $verify ? 2 : 0);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, false);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_NOBODY, !empty($opt['nobody']));
    curl_setopt($ch, CURLOPT_HEADER, $opt['header'] ?? false);
    curl_setopt($ch, CURLOPT_HTTPHEADER, $opt['headers'] ?? []);
    curl_setopt($ch, CURLOPT_TIMEOUT_MS, $opt['rtime'] ?? 10000);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT_MS, $opt['ctime'] ?? 10000);
    curl_setopt($ch, CURLOPT_REFERER, $opt['refer'] ?? 'https://www.baidu.com/');
    curl_setopt($ch, CURLOPT_USERAGENT, $opt['UA'] ?? 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36');
    if (isset($opt['post']) && is_array($opt['post'])) {
        curl_setopt($ch, CURLOPT_POST, 1);
        curl_setopt($ch, CURLOPT_POSTFIELDS, http_build_query($opt['post']));
    }
    if (isset($opt['proxy']) && is_array($opt['proxy'])) {
        if (isset($opt['proxy']['ip'])) {
            curl_setopt($ch, CURLOPT_PROXY, $opt['proxy']['ip']);
        }
        if (isset($opt['proxy']['port'])) {
            curl_setopt($ch, CURLOPT_PROXYPORT, $opt['proxy']['port']);
        }
    }

    $res   = curl_exec($ch);
    $error = curl_error($ch);
    $code  = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    curl_close($ch);

    if ($wantDetail) {
        return ['code' => $code, 'error' => $error, 'response' => $res];
    }

    return $res;
}

/**
 * Extract the text located between two markers (multibyte-safe).
 *
 * Looks for the first occurrence of $before, then for the first occurrence of
 * $end that follows it, and returns what lies in between.
 *
 * @param string $str    Text to search.
 * @param string $before Start marker.
 * @param string $end    End marker; may be empty.
 *
 * @return string The text between the markers; '' when $before is empty or not
 *                found. When $end is empty or not found after $before, the
 *                remainder of $str starting AT $before (marker included) is
 *                returned, as the original implementation did.
 */
function getStrBy2Char($str, $before, $end = '')
{
    $str    = (string) $str;
    $before = (string) $before;
    $end    = (string) $end;

    // An empty needle makes mb_strpos() warn on PHP 7 and match at 0 on PHP 8;
    // either way there is nothing meaningful to extract.
    if ($before === '') {
        return '';
    }

    // Strict check: offset 0 is a valid match and must not be read as "not found".
    $beginIndex = mb_strpos($str, $before);
    if ($beginIndex === false) {
        return '';
    }

    $cutStr = mb_substr($str, $beginIndex);
    if ($end === '') {
        return $cutStr;
    }

    // Start looking for the end marker after the start marker, so an end
    // marker that also occurs inside $before cannot produce a bogus length.
    $beforeLen = mb_strlen($before);
    $endIndex  = mb_strpos($cutStr, $end, $beforeLen);
    if ($endIndex === false) {
        return $cutStr;
    }

    return mb_substr($cutStr, $beforeLen, $endIndex - $beforeLen);
}

Last Modified: June 13, 2020