利用perl基础库抓取百度博客,生成wp导入文件

# Author : thicket
# Date : 2013/01/31
# WebSite : hi.baidu.com
# 在当前文件夹生成以日期为文件名的xml文件,可以导入wordpress

use LWP::Simple;
use HTML::Parse;
use HTML::Element;
use URI::URL;
use LWP::UserAgent;
use HTTP::Request;
use HTTP::Response;
use URI::Escape;
use POSIX;


# Blog name on hi.baidu.com, taken from the first command-line argument.
# $website is deliberately a package global: getLink() reads it to turn
# relative links into absolute ones.
$website = $ARGV[0];

# Without a blog name there is nothing to crawl.
if (!$website) {
    print "=== add website ! ===\n";
    exit;
}

$website = 'http://hi.baidu.com/' . $website;

# The output file is named after today's date (year-month-day, no zero
# padding). Only the three fields that are formatted are pulled from
# localtime, so sprintf gets exactly as many arguments as it has conversions.
my ($mday, $mon, $year) = (localtime(time()))[3, 4, 5];
my $format_time = sprintf("%d-%d-%d", $year + 1900, $mon + 1, $mday);

# Package global: the main crawl section opens this path for writing.
$file_name = './baidu' . $format_time . '.xml';
# Render one blog post as a WXR <item> element and return it as a string.
#   $title   - post title; inserted as-is into <title> and URI-escaped for
#              <wp:post_name>
#   $date    - text placed in <wp:post_date>
#   $content - post body HTML, wrapped in a CDATA section
#   @tags    - tag names; each becomes one <category domain="post_tag">
# NOTE(review): <pubDate>, <wp:post_date_gmt> and <dc:creator> are fixed
# placeholder values, so every item carries the same ones -- confirm whether
# they should be derived from $date / the blog owner instead.
sub _wxr_item {
    my ($title, $date, $content, @tags) = @_;

    my $item = "<item>
    <title>$title</title>
    <link></link>
    <pubDate>Tue, 15 Jan 2013 12:53:41 +0000</pubDate>
    <dc:creator>thicket</dc:creator>
    <guid isPermaLink=\"false\"></guid>
    <description></description>
    <content:encoded><![CDATA[$content]]></content:encoded>
    <excerpt:encoded><![CDATA[]]></excerpt:encoded>
    <wp:post_id></wp:post_id>
    <wp:post_date>$date</wp:post_date>
    <wp:post_date_gmt>2013-01-15 12:53:41</wp:post_date_gmt>
    <wp:comment_status>open</wp:comment_status>
    <wp:ping_status>open</wp:ping_status>
    <wp:post_name>";
    $item .= uri_escape($title);
    $item .= "</wp:post_name>
    <wp:status>publish</wp:status>
    <wp:post_parent>0</wp:post_parent>
    <wp:menu_order>0</wp:menu_order>
    <wp:post_type>post</wp:post_type>
    <wp:post_password></wp:post_password>
    <wp:is_sticky>0</wp:is_sticky>";

    for my $tag_name (@tags) {
        # The attribute value must be quoted for the output to be well-formed XML.
        $item .= "
        <category domain=\"post_tag\" nicename=\"";
        $item .= uri_escape($tag_name);
        $item .= "\"><![CDATA[$tag_name]]></category>";
    }

    $item .= "
        <wp:postmeta>
    	<wp:meta_key>_edit_last</wp:meta_key>
    	<wp:meta_value><![CDATA[1]]></wp:meta_value>
    </wp:postmeta>
	</item>";

    return $item;
}

# Crawl every list page of the blog, fetch each linked post and write the
# posts as <item> elements into the output file named by $file_name.
if (open(my $out_fh, '>', $file_name)) {

    my $ua = LWP::UserAgent->new;    # one user agent reused for every request

    # The XML declaration and the <rss>/<channel> opening tags are written
    # exactly once, before any item; the blog address goes inside the channel
    # so that nothing precedes the XML declaration.
    print {$out_fh} '<?xml version="1.0" encoding="UTF-8" ?>
<rss version="2.0"
	xmlns:excerpt="http://wordpress.org/export/1.2/excerpt/"
	xmlns:content="http://purl.org/rss/1.0/modules/content/"
	xmlns:wfw="http://wellformedweb.org/CommentAPI/"
	xmlns:dc="http://purl.org/dc/elements/1.1/"
	xmlns:wp="http://wordpress.org/export/1.2/"
>
<channel>
';
    print {$out_fh} "<link>$website</link>\n";

    # The first list page carries the total post count and the page size.
    my $first_page = get("$website?page=1");
    my ($total_posts, $posts_per_page) = getPageNum($first_page);

    my $page_count;
    if ($posts_per_page && $posts_per_page > 0) {
        $page_count = ceil($total_posts / $posts_per_page);
    } else {
        # Avoid dividing by zero when the page size could not be read.
        print "=== page count not found, fetching first page only ===\n";
        $page_count = 1;
    }

    for my $page_no (1 .. $page_count) {

        my $list_url = "$website?page=$page_no";
        print $list_url."\n";

        my $list_response = $ua->request(HTTP::Request->new('GET', $list_url));
        if (!$list_response->is_success) {
            # Report the failure and carry on with the next list page.
            print $list_response->error_as_HTML;
            next;
        }
        my $list_html = $list_response->content;

        # Collect the anchors that point at "#reply" and strip backslashes and
        # the "#reply" fragment from them before extracting the links.
        my @anchors = ($list_html =~ m/(<a[^>]+?#reply[^>]+?>)/gi);
        my $anchor_html = join('', @anchors);
        $anchor_html =~ s/[\\]|#reply//ig;

        for my $post_url (getLink($anchor_html)) {
            my $post_response = $ua->request(HTTP::Request->new('GET', $post_url));
            if (!$post_response->is_success) {
                print $post_response->error_as_HTML;
                next;
            }

            my ($date, $title, $tag, $content) = getContent($post_response->content);

            # Progress output on STDOUT.
            print '===============================';
            print $post_url."\n";
            print $date."\n".$title."\n".$tag."\n";

            my @tags = split(' ', $tag);
            print {$out_fh} _wxr_item($title, $date, $content, @tags), "\n";
        }
    }

    # Close the channel once, after all pages have been processed.
    print {$out_fh} "</channel></rss>\n";

    # Buffered write errors only surface when the handle is closed.
    close($out_fh) or print "close file error: $!\n";
} else {
    print "open file error \n";
    exit;
}



##########################################################################################

# Extract the article links from a fragment of HTML.
#   first argument - HTML text containing <a> elements
# Returns a list with one absolute URL per link found, each resolved against
# the package global $website. Returns an empty list for undefined or empty
# input.
sub getLink {
    my ($html) = @_;
    my @full_url;
    return @full_url unless defined $html && length $html;

    my $tree = HTML::Parse::parse_html($html);

    # Each entry returned by extract_links("a") is an array reference whose
    # first element is the link target.
    for my $entry (@{ $tree->extract_links("a") }) {
        my $link = $entry->[0];
        push(@full_url, URI::URL->new($link)->abs($website));
    }

    # Dismantle the parse tree explicitly so it is released once the links
    # have been copied out.
    $tree->delete;

    return @full_url;
}


# Extract the fields of one post from the HTML of its article page.
#   first argument - HTML text of the article page
# Returns the list ($date, $title, $tag, $content):
#   $date    - inner text of the first <div ... class=content-other-info>,
#              with any markup removed
#   $title   - inner HTML of the first <h2 class="title content-title">
#   $tag     - the texts following '#' in every <a class="tag" ...> link,
#              joined with single spaces, markup removed
#   $content - inner HTML of the first <div id=content ...> up to the first
#              closing </div>
# A field that is not found is returned as the empty string. The caller's $_
# is left untouched.
sub getContent {
    my ($html) = @_;
    $html = '' unless defined $html;

    # /s lets '.' span line breaks, so values that wrap over several lines
    # are still captured.
    my ($date) = $html =~ m/<div[^>]+class=content-other-info>\s*(.+?)\s*<\/div>/is;
    $date = '' unless defined $date;
    $date =~ s/<[^>]*>//g;

    my ($title) = $html =~ m/<h2 class="title content-title">(.+?)<\/h2>/is;
    $title = '' unless defined $title;

    my ($content) = $html =~ m/<div id=content[^>]+>(.+?)<\/div>/is;
    $content = '' unless defined $content;

    my @tag_names = ($html =~ m/<a class="tag"[^>]+>#(.+?)<\/a>/gis);
    my $tag = join(' ', @tag_names);
    $tag =~ s/<[^>]*>//g;

    return ($date, $title, $tag, $content);
}

# Read the paging figures from the HTML of a blog list page.
#   first argument - HTML text of a list page
# Returns the list ($pages_totle, $pages_row): the first run of digits that
# follows "allCount" and the first run of digits that follows "pageSize", each
# searched on the same line as its keyword (case-insensitive). A value that is
# not found is returned as the empty string. The caller's $_ and package
# globals are left untouched.
sub getPageNum {
    my ($html) = @_;
    $html = '' unless defined $html;

    # [^\d\n]* skips separators such as ':' or quotes but neither digits nor a
    # line break, so only the number that belongs to the keyword is captured.
    my ($total)    = $html =~ m/allCount[^\d\n]*(\d+)/i;
    my ($per_page) = $html =~ m/pageSize[^\d\n]*(\d+)/i;

    $total    = '' unless defined $total;
    $per_page = '' unless defined $per_page;

    return ($total, $per_page);
}

编程技巧