<?php
/**
 * Fetches web pages with cURL and keeps the results in an on-disk cache.
 *
 * Every request is sent with a User-Agent, an optional Referer and spoofed
 * client-IP headers (X-Forwarded-For, Client-IP, X-Real-IP); it can optionally
 * be routed through a proxy picked at random from a configurable list.
 *
 * Cache entries are keyed by the requested URL only, so two calls for the same
 * URL with different $options share one cache entry.
 */
class AdvancedWebPageFetcher {
    private $cacheDir;
    private $cacheTime = 31536000; // cache lifetime in seconds (365 days)
    private $userAgents = array(); // pool of User-Agent strings
    private $referers = array(); // pool of Referer URLs
    private $useProxy = false; // whether requests are sent through a proxy
    private $proxyList = array(); // pool of proxy addresses

    /**
     * Constructor.
     *
     * Creates the cache directory when it does not exist yet and fills the
     * built-in User-Agent and Referer pools.
     *
     * @param string $cacheDir path of the cache directory
     */
    public function __construct($cacheDir = 'cache') {
        $this->cacheDir = rtrim($cacheDir, '/') . '/';
        if (!is_dir($this->cacheDir)) {
            mkdir($this->cacheDir, 0755, true);
        }

        // Built-in User-Agent pool
        $this->userAgents = array(
            // Chrome
            'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36',

            // Firefox
            'Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:31.0) Gecko/20100101 Firefox/31.0',

            // Safari
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/537.13+ (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',

            // IE
            'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',
            'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',

            // Mobile devices
            'Mozilla/5.0 (iPhone; CPU iPhone OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A334 Safari/7534.48.3',
            'Mozilla/5.0 (Linux; U; Android 2.3.3; en-us; HTC_DesireS_S510e Build/GRI40) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1'
        );

        // Built-in Referer pool
        $this->referers = array(
            'http://www.google.com/',
            'http://www.bing.com/',
            'http://www.yahoo.com/',
            'http://www.baidu.com/',
            'http://www.facebook.com/',
            'http://twitter.com/',
            'http://www.reddit.com/',
            'http://www.linkedin.com/',
            'http://www.youtube.com/'
        );
    }

    /**
     * Returns the page information for a URL, served from the cache when a
     * fresh, readable entry exists and fetched from the network otherwise.
     *
     * A cache file that cannot be read or decoded is treated as a cache miss
     * and is overwritten by the freshly fetched result.
     *
     * @param string $url URL to fetch
     * @param bool $forceRefresh true to bypass the cache and re-fetch
     * @param array $options fetch options: 'user_agent', 'referer',
     *                       'client_ip', 'headers' (name => value map or list
     *                       of complete header lines) and 'verify_ssl'
     *                       (defaults to true)
     * @return array page information (url, original_url, http_code, headers,
     *               content, fetch_time, from_cache, user_agent)
     * @throws Exception when the network request fails
     */
    public function fetch($url, $forceRefresh = false, $options = array()) {
        $cacheFile = $this->getCacheFilename($url);

        // Serve from the cache while the entry is younger than the lifetime
        if (!$forceRefresh && is_file($cacheFile)) {
            $cacheAge = time() - filemtime($cacheFile);
            if ($cacheAge < $this->cacheTime) {
                $cached = $this->loadFromCache($cacheFile);
                if ($cached !== false) {
                    return $cached;
                }
                // Unreadable or corrupt entry: fall through and re-fetch
            }
        }

        $data = $this->fetchFromWeb($url, $options);
        $this->saveToCache($cacheFile, $data);

        return $data;
    }

    /**
     * Performs the HTTP request with cURL.
     *
     * TLS certificates are verified unless $options['verify_ssl'] is set to a
     * false value. Default request headers (Accept, Accept-Language,
     * Connection, Pragma, Cache-Control) are only added when the caller did
     * not supply a header of the same name.
     *
     * @param string $url URL to fetch
     * @param array $options fetch options (see fetch())
     * @return array page information with 'from_cache' set to false
     * @throws Exception when the cURL handle cannot be created or the
     *                   transfer fails
     */
    private function fetchFromWeb($url, $options = array()) {
        $ch = curl_init();
        if ($ch === false) {
            throw new Exception('CURL Error: unable to initialise a cURL handle');
        }

        // Basic transfer settings
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_HEADER, true);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
        curl_setopt($ch, CURLOPT_MAXREDIRS, 5);
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 15);
        curl_setopt($ch, CURLOPT_TIMEOUT, 45);
        curl_setopt($ch, CURLOPT_ENCODING, ''); // accept and decode every encoding cURL supports

        // Verify the peer certificate and host name by default; disabling both
        // exposes HTTPS traffic to man-in-the-middle tampering, so it is opt-in.
        $verifySsl = !isset($options['verify_ssl']) || $options['verify_ssl'];
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, $verifySsl ? true : false);
        curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, $verifySsl ? 2 : 0);

        // User-Agent: caller-supplied or random from the pool
        $userAgent = isset($options['user_agent']) ? $options['user_agent'] : $this->getRandomUserAgent();
        curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);

        // Referer: caller-supplied or random from the pool
        if (isset($options['referer'])) {
            curl_setopt($ch, CURLOPT_REFERER, $options['referer']);
        } elseif ($this->shouldUseRandomReferer()) {
            curl_setopt($ch, CURLOPT_REFERER, $this->getRandomReferer());
        }

        // Spoofed client-IP headers
        $ipHeaders = array();
        if (isset($options['client_ip'])) {
            $this->setClientIp($ipHeaders, $options['client_ip']);
        } elseif ($this->shouldFakeIp()) {
            $this->setClientIp($ipHeaders, $this->generateRandomIp());
        }

        // Proxy
        if ($this->useProxy && !empty($this->proxyList)) {
            curl_setopt($ch, CURLOPT_PROXY, $this->getRandomProxy());
        }

        // Caller-supplied headers: "name => value" pairs, or complete header
        // lines when the array is a plain list (integer keys)
        $requestHeaders = array();
        if (isset($options['headers']) && is_array($options['headers'])) {
            foreach ($options['headers'] as $key => $value) {
                $requestHeaders[] = is_int($key) ? (string)$value : $key . ': ' . $value;
            }
        }
        $requestHeaders = array_merge($requestHeaders, $ipHeaders);

        // Strip CR/LF so a value cannot smuggle in extra header lines, and
        // remember which header names are already set
        $alreadySet = array();
        foreach ($requestHeaders as $index => $line) {
            $line = str_replace(array("\r", "\n"), '', $line);
            $requestHeaders[$index] = $line;
            $parts = explode(':', $line, 2);
            $alreadySet[strtolower(trim($parts[0]))] = true;
        }

        // Common browser headers, skipped when already supplied above
        $defaultHeaders = array(
            'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language' => 'en-US,en;q=0.5',
            'Connection' => 'keep-alive',
            'Pragma' => 'no-cache',
            'Cache-Control' => 'no-cache',
        );
        foreach ($defaultHeaders as $name => $value) {
            if (!isset($alreadySet[strtolower($name)])) {
                $requestHeaders[] = $name . ': ' . $value;
            }
        }

        curl_setopt($ch, CURLOPT_HTTPHEADER, $requestHeaders);

        $response = curl_exec($ch);

        if ($response === false) {
            $error = curl_error($ch);
            curl_close($ch);
            throw new Exception('CURL Error: ' . $error);
        }

        $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
        $headerSize = curl_getinfo($ch, CURLINFO_HEADER_SIZE);
        $finalUrl = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);

        curl_close($ch);

        // CURLOPT_HEADER prepends the response headers to the body
        $rawHeaders = substr($response, 0, $headerSize);
        $content = substr($response, $headerSize);

        return array(
            'url' => $finalUrl,
            'original_url' => $url,
            'http_code' => $httpCode,
            'headers' => $this->parseHeaders($rawHeaders),
            'content' => $content,
            'fetch_time' => time(),
            'from_cache' => false,
            'user_agent' => $userAgent
        );
    }

    /**
     * Appends the headers used to spoof the client IP.
     *
     * @param array $headers header list to append to (modified in place)
     * @param string $ip IP address to announce
     */
    private function setClientIp(&$headers, $ip) {
        $headers[] = 'X-Forwarded-For: ' . $ip;
        $headers[] = 'Client-IP: ' . $ip;
        $headers[] = 'X-Real-IP: ' . $ip;
    }

    /**
     * Builds a random dotted-quad IPv4 address.
     *
     * Reserved or private ranges are not excluded.
     *
     * @return string random IP address
     */
    private function generateRandomIp() {
        return mt_rand(1, 255) . '.' . mt_rand(0, 255) . '.' . mt_rand(0, 255) . '.' . mt_rand(1, 254);
    }

    /**
     * Picks a random entry from the User-Agent pool.
     *
     * @return string User-Agent string
     */
    private function getRandomUserAgent() {
        return $this->userAgents[array_rand($this->userAgents)];
    }

    /**
     * Picks a random entry from the Referer pool.
     *
     * @return string Referer URL
     */
    private function getRandomReferer() {
        return $this->referers[array_rand($this->referers)];
    }

    /**
     * Picks a random entry from the proxy pool.
     *
     * @return string proxy address
     */
    private function getRandomProxy() {
        return $this->proxyList[array_rand($this->proxyList)];
    }

    /**
     * Tells whether a random Referer should be sent.
     *
     * @return bool true when the Referer pool is not empty
     */
    private function shouldUseRandomReferer() {
        return !empty($this->referers);
    }

    /**
     * Tells whether the client IP should be spoofed.
     *
     * @return bool always true (spoofing is enabled by default)
     */
    private function shouldFakeIp() {
        return true;
    }

    /**
     * Adds one or several User-Agent strings to the pool.
     *
     * @param mixed $userAgent string or array of strings
     */
    public function addUserAgent($userAgent) {
        if (is_array($userAgent)) {
            $this->userAgents = array_merge($this->userAgents, $userAgent);
        } else {
            $this->userAgents[] = $userAgent;
        }
    }

    /**
     * Adds one or several Referer URLs to the pool.
     *
     * @param mixed $referer string or array of strings
     */
    public function addReferer($referer) {
        if (is_array($referer)) {
            $this->referers = array_merge($this->referers, $referer);
        } else {
            $this->referers[] = $referer;
        }
    }

    /**
     * Adds one or several proxies to the pool and switches proxy use on.
     *
     * @param mixed $proxy string or array of strings
     */
    public function addProxy($proxy) {
        if (is_array($proxy)) {
            $this->proxyList = array_merge($this->proxyList, $proxy);
        } else {
            $this->proxyList[] = $proxy;
        }
        $this->useProxy = true;
    }

    /**
     * Enables or disables proxy use.
     *
     * @param bool $use true to send requests through a proxy
     */
    public function useProxy($use = true) {
        $this->useProxy = $use;
    }

    /**
     * Parses the raw response headers of the final response.
     *
     * When redirects are followed (or a proxy answers first) the raw text
     * holds one header block per response, separated by blank lines. Only the
     * last block is parsed, so headers of intermediate responses such as a
     * redirect's Location do not leak into the result.
     *
     * Lines containing a colon become "name => value" entries (a repeated
     * name keeps its last value); other non-empty lines, such as the status
     * line, are appended with numeric keys.
     *
     * @param string $headerText raw header text
     * @return array parsed headers
     */
    private function parseHeaders($headerText) {
        $blocks = preg_split('/\r?\n\r?\n/', trim($headerText));
        $finalBlock = end($blocks);

        $headers = array();
        foreach (preg_split('/\r?\n/', $finalBlock) as $line) {
            if (strpos($line, ':') !== false) {
                list($key, $value) = explode(':', $line, 2);
                $headers[trim($key)] = trim($value);
            } elseif (trim($line) !== '') {
                $headers[] = trim($line);
            }
        }

        return $headers;
    }

    /**
     * Builds the cache file path for a URL.
     *
     * @param string $url URL
     * @return string cache file path (MD5 of the URL plus ".cache")
     */
    private function getCacheFilename($url) {
        return $this->cacheDir . md5($url) . '.cache';
    }

    /**
     * Loads a cached page from disk.
     *
     * @param string $cacheFile cache file path
     * @return array|false cached page data with 'from_cache' set to true, or
     *                     false when the file is unreadable, empty or does
     *                     not decode to an array
     */
    private function loadFromCache($cacheFile) {
        if (!is_readable($cacheFile)) {
            return false;
        }

        $raw = file_get_contents($cacheFile);
        if ($raw === false || $raw === '') {
            return false;
        }

        // unserialize() reports corrupt input with a notice/warning; that case
        // is handled through the return value, hence the suppression.
        if (version_compare(PHP_VERSION, '7.0.0', '>=')) {
            // Cache entries are plain arrays: never instantiate objects
            $data = @unserialize($raw, array('allowed_classes' => false));
        } else {
            $data = @unserialize($raw);
        }

        if (!is_array($data)) {
            return false;
        }

        $data['from_cache'] = true;
        return $data;
    }

    /**
     * Writes page data to the cache file.
     *
     * @param string $cacheFile cache file path
     * @param array $data page data to cache
     */
    private function saveToCache($cacheFile, $data) {
        // Exclusive lock so concurrent writers cannot interleave their output
        file_put_contents($cacheFile, serialize($data), LOCK_EX);
    }

    /**
     * Sets the cache lifetime.
     *
     * @param int $seconds cache lifetime in seconds
     */
    public function setCacheTime($seconds) {
        $this->cacheTime = (int)$seconds;
    }
}

// Example usage: fetch one page and render the result as HTML.

// Escaping flags for everything echoed below. ENT_SUBSTITUTE (PHP 5.4+) keeps
// htmlspecialchars() from returning an empty string when the text contains
// invalid UTF-8, e.g. a multi-byte character cut in half by substr().
$escFlags = defined('ENT_SUBSTITUTE') ? (ENT_QUOTES | ENT_SUBSTITUTE) : ENT_QUOTES;

try {
    // Create the fetcher with its cache directory
    $fetcher = new AdvancedWebPageFetcher('php52_cache');
    
    // Add a custom User-Agent
    $fetcher->addUserAgent('Mozilla/5.0 (Windows NT 5.1; rv:3.0) Gecko/20100101 Firefox/3.0');
    
    // Add a custom Referer
    $fetcher->addReferer('http://www.mysite.com/');
    
    // Add a proxy server (optional)
    // $fetcher->addProxy('http://user:pass@proxy.example.com:8080');
    // $fetcher->useProxy(true); // enable the proxy
    
    // Set the cache lifetime to 30 days (optional)
    // $fetcher->setCacheTime(2592000);
    
    // Fetch options
    $options = array(
        // 'user_agent' => 'custom User-Agent', // overrides the random choice
        // 'referer' => 'http://example.com/referer', // overrides the random choice
        // 'client_ip' => '123.45.67.89', // custom spoofed IP
        // 'headers' => array( // custom headers
        //     'X-Custom-Header' => 'Value',
        //     'Accept-Encoding' => 'gzip, deflate'
        // )
    );
    
    // Fetch the page (the third argument carries the fetch options)
    $pageInfo = $fetcher->fetch('http://httpbin.org/headers', false, $options);
    
    // Render the result; every remote-controlled value is HTML-escaped
    echo "<h1>网页抓取结果</h1>";
    echo "<p><strong>请求URL:</strong> " . htmlspecialchars($pageInfo['original_url'], $escFlags, 'UTF-8') . "</p>";
    echo "<p><strong>实际URL:</strong> " . htmlspecialchars($pageInfo['url'], $escFlags, 'UTF-8') . "</p>";
    echo "<p><strong>HTTP状态码:</strong> " . (int)$pageInfo['http_code'] . "</p>";
    echo "<p><strong>抓取时间:</strong> " . date('Y-m-d H:i:s', $pageInfo['fetch_time']) . "</p>";
    echo "<p><strong>来源缓存:</strong> " . ($pageInfo['from_cache'] ? '是' : '否') . "</p>";
    echo "<p><strong>使用的User-Agent:</strong> " . htmlspecialchars($pageInfo['user_agent'], $escFlags, 'UTF-8') . "</p>";
    
    echo "<h2>HTTP头信息</h2>";
    echo "<pre>";
    // Response headers come from the remote server, so escape the dump
    echo htmlspecialchars(print_r($pageInfo['headers'], true), $escFlags, 'UTF-8');
    echo "</pre>";
    
    echo "<h2>网页内容预览</h2>";
    echo "<pre>";
    echo htmlspecialchars(substr($pageInfo['content'], 0, 1000), $escFlags, 'UTF-8') . (strlen($pageInfo['content']) > 1000 ? '...' : '');
    echo "</pre>";
    
} catch (Exception $e) {
    // The message can contain remote or caller-supplied text (URL, cURL error)
    echo "<div style='color:red;'><strong>错误:</strong> " . htmlspecialchars($e->getMessage(), $escFlags, 'UTF-8') . "</div>";
}

//基本使用示例
/*$fetcher = new AdvancedWebPageFetcher('my_cache');
$pageInfo = $fetcher->fetch('http://example.com');

// 使用代理
$fetcher->addProxy('http://user:pass@proxy.example.com:8080');
$fetcher->useProxy(true);

// 自定义请求选项
$options = array(
    'client_ip' => '123.45.67.89',
    'referer' => 'http://google.com/',
    'headers' => array(
        'X-Custom-Header' => 'MyValue'
    )
);
$pageInfo = $fetcher->fetch('http://example.com', false, $options);*/
?>