Files
TwoNav/system/get_page_info.php

108 lines
4.8 KiB
PHP
Raw Normal View History

2023-04-05 16:17:28 +08:00
<?php
/**
 * Extract basic site information from a fetched HTML document.
 *
 * The document is first normalised to UTF-8 (using the charset declared in the
 * HTTP Content-Type header or, failing that, in a <meta> tag), then the title,
 * meta description and meta keywords are read from it. Optionally checks
 * whether a friend-link string occurs anywhere in the page.
 *
 * @param string $output      Raw response body.
 * @param string $friend_link Text to look for in the page; skipped when empty.
 * @param array  $curl_info   Transfer info; only the 'content_type' entry is read.
 *
 * @return array Keys: site_title, site_description, site_keywords (strings),
 *               friend_link_status (0|1), site_home_size (byte length of the
 *               UTF-8 normalised page).
 */
function get_page_info($output, $friend_link = '', $curl_info = array()) {
    $page_info = array(
        'site_title'         => '', // page title
        'site_description'   => '', // meta description
        'site_keywords'      => '', // meta keywords
        'friend_link_status' => 0,  // 1 when $friend_link is found in the page
        'site_home_size'     => 0,  // byte length of the normalised page
    );
    if (empty($output)) {
        return $page_info;
    }

    // Convert non-UTF-8 pages to UTF-8 so the extracted text is not garbled.
    $encoding = _page_info_detect_encoding($output, $curl_info);
    if ($encoding !== '' && $encoding !== 'UTF-8') {
        $output = mb_convert_encoding($output, 'UTF-8', $encoding);
    }
    // Still not valid UTF-8 (no or wrong declaration): fall back to GBK.
    if (!mb_check_encoding($output, 'UTF-8')) {
        $output = mb_convert_encoding($output, 'UTF-8', 'GBK');
    }
    $page_info['site_home_size'] = strlen($output);

    // Title (the opening tag may carry attributes).
    if (preg_match('/<title(?:\s[^>]*)?>(.*?)<\/title>/si', $output, $matches) && $matches[1] !== '') {
        $page_info['site_title'] = $matches[1];
    }

    // Meta data parsed from every <meta> tag.
    $meta_array = _page_info_collect_meta($output);

    // If keywords or description are still missing, retry with PHP's own parser.
    if (empty($meta_array['keywords']) || empty($meta_array['description'])) {
        // get_meta_tags() only reads from a file/URL, so the HTML goes through a temp file.
        $tempFile = DIR ."/data/temp/".md5(uniqid().Get_Rand_Str(8)).".html";
        $tags = false;
        if (file_put_contents($tempFile, $output) !== false) {
            $tags = get_meta_tags($tempFile);
        }
        if (is_file($tempFile)) {
            unlink($tempFile); // always remove the temp file, even after a failed write
        }
        if (is_array($tags)) {
            if (empty($meta_array['keywords']) && !empty($tags['keywords'])) {
                $meta_array['keywords'] = $tags['keywords'];
            }
            if (empty($meta_array['description']) && !empty($tags['description'])) {
                $meta_array['description'] = $tags['description'];
            }
        }
    }

    $page_info['site_keywords']    = $meta_array['keywords'];
    $page_info['site_description'] = $meta_array['description'];
    //$page_info['meta_array'] = $meta_array; // the full meta list is not needed for now

    // Friend-link detection: plain substring search in the normalised page.
    if (!empty($friend_link) && strpos($output, $friend_link) !== false) {
        $page_info['friend_link_status'] = 1;
    }

    return $page_info;
}

/**
 * Map a charset label taken from untrusted input to an mbstring encoding name.
 *
 * @param string $label Charset label, e.g. 'gbk' or '"utf-8"'.
 * @return string Canonical mbstring encoding name, or '' when the label is not usable.
 */
function _page_info_resolve_encoding($label) {
    static $known = null;
    if ($known === null) {
        // Transfer/pseudo encodings are never a legitimate page charset.
        $refused = array(
            'auto', 'pass', 'wchar', 'byte2be', 'byte2le', 'byte4be', 'byte4le',
            'base64', 'uuencode', 'html-entities', 'quoted-printable', '7bit', '8bit',
        );
        $known = array();
        foreach (mb_list_encodings() as $encoding) {
            if (in_array(strtolower($encoding), $refused, true)) {
                continue;
            }
            $known[strtolower($encoding)] = $encoding;
            $aliases = mb_encoding_aliases($encoding);
            if (!is_array($aliases)) {
                continue;
            }
            foreach ($aliases as $alias) {
                $alias_key = strtolower($alias);
                if (!isset($known[$alias_key])) {
                    $known[$alias_key] = $encoding;
                }
            }
        }
    }
    $label = strtolower(trim((string) $label, " \t\r\n\"';/"));
    return isset($known[$label]) ? $known[$label] : '';
}

/**
 * Find the declared encoding of a page: HTTP header first, then <meta> tags.
 *
 * @param string $output    Raw HTML.
 * @param array  $curl_info Transfer info; only 'content_type' is read.
 * @return string Canonical mbstring encoding name, or '' when none is declared/usable.
 */
function _page_info_detect_encoding($output, $curl_info) {
    $charset_pattern = '/charset\s*=\s*["\']?\s*([\w.:\-]+)/i';

    if (is_array($curl_info) && isset($curl_info['content_type']) && is_string($curl_info['content_type'])
        && preg_match($charset_pattern, $curl_info['content_type'], $found)) {
        $encoding = _page_info_resolve_encoding($found[1]);
        if ($encoding !== '') {
            return $encoding;
        }
    }

    if (!preg_match_all('/<meta\b[^>]*>/i', $output, $tags)) {
        return '';
    }
    foreach ($tags[0] as $tag) {
        $attrs = _page_info_meta_attributes($tag);
        $label = '';
        if (isset($attrs['http-equiv'], $attrs['content'])
            && strtolower($attrs['http-equiv']) === 'content-type'
            && preg_match($charset_pattern, $attrs['content'], $found)) {
            // <meta http-equiv="Content-Type" content="text/html; charset=gbk">
            $label = $found[1];
        } elseif (isset($attrs['charset'])) {
            // <meta charset="gbk">
            $label = $attrs['charset'];
        }
        $encoding = _page_info_resolve_encoding($label);
        if ($encoding !== '') {
            return $encoding;
        }
    }
    return '';
}

/**
 * Parse the attributes of a single tag into a lower-cased name => value map.
 * Accepts double-quoted, single-quoted and unquoted values; the first
 * occurrence of a repeated attribute wins.
 *
 * @param string $tag One tag, e.g. '<meta name="keywords" content="a,b">'.
 * @return array
 */
function _page_info_meta_attributes($tag) {
    $attrs = array();
    $pattern = '/([a-z][\w:.\-]*)\s*=\s*(?|"([^"]*)"|\'([^\']*)\'|([^\s"\'>]+))/i';
    if (preg_match_all($pattern, $tag, $pairs, PREG_SET_ORDER)) {
        foreach ($pairs as $pair) {
            $name = strtolower($pair[1]);
            if (!isset($attrs[$name])) {
                $attrs[$name] = $pair[2];
            }
        }
    }
    return $attrs;
}

/**
 * Collect the content of every <meta> tag, keyed by its lower-cased
 * name / http-equiv / scheme attribute. Later tags overwrite earlier ones.
 *
 * @param string $output UTF-8 HTML.
 * @return array Always contains 'description' and 'keywords' ('' when absent).
 */
function _page_info_collect_meta($output) {
    $meta_array = array('description' => '', 'keywords' => '');
    if (!preg_match_all('/<meta\b[^>]*>/i', $output, $tags)) {
        return $meta_array;
    }
    foreach ($tags[0] as $tag) {
        $attrs = _page_info_meta_attributes($tag);
        if (!isset($attrs['content'])) {
            continue;
        }
        foreach (array('name', 'http-equiv', 'scheme') as $key_attr) {
            if (isset($attrs[$key_attr])) {
                $meta_array[strtolower($attrs[$key_attr])] = $attrs['content'];
            }
        }
    }
    return $meta_array;
}