不想做诗人的程序员不是一个好爸爸

2015/12/14 16808
#!/usr/bin/perl

use strict;
#use warnings;
use encoding 'utf8';
use Data::Dumper;

sub dump_chars();
sub dump_sentenses();
sub dump_index();
sub recount_chars;
sub mark_chars;
sub dump_result;
sub count_unsed_chars;
sub count_weight;

# 读取字频表
my %chars;
open(CHAR_FILE, '<', 'lex-chars.lex');
while (my $line = <CHAR_FILE>) {
    chomp($line);
    my @items = split(/\//, $line);
    print "bad line: $line\n" if (scalar(@items) != 3);
    $chars{$items[0]} = $items[2];
}
close(CHAR_FILE);

# 排序，按字频输出
my @sorted_chars = sort { $chars{$b} <=> $chars{$a} } keys %chars;
#dump_chars();

# 筛选出频率前1000个字
my %top_chars;
for (my $i = 0; $i < 2000; $i++) {
    $top_chars{$sorted_chars[$i]} = 1;
}

# 读取名句语料库
my %sentenses;
my %char_sentenses;
my %first_chars;
my %last_chars;
while (my $file = <r01*>) {
    # print "reading $file\n";
    open(FILE, '<', $file);
    
    while (my $line = <FILE>) {
        chomp($line);
        $line =~ s/^\s+//g;
        $line =~ s/\s+$//g;
        next if (!$line);
        $line =~ s/。$//;
        $line =~ s/！$//;
        $line =~ s/？$//;
        $line =~ s/，$//;
        next if ($line =~ /^—/);
        next if (length($line) < 10);
        next if (length($line) > 16);
        next if ($line =~ /^（/);
        next if ($line =~ /《/);
        next if ($line =~ /[0-9A-Za-z]/);
        next if ($line =~ /^，/);
        next if ($line =~ /-/);
        $sentenses{$line}++;
        
        my $first_char = substr($line, 0, 1);
        my $last_char = substr($line, length($line) - 1, 1);
        $first_chars{$first_char} = 1;
        $last_chars{$last_char} = 1;
    }
    
    close(FILE);
}

# 排序，按句频输出
my @sorted_sentenses = sort { $sentenses{$b} <=> $sentenses{$a} } keys %sentenses;
#dump_sentenses();

# 过滤只出现过一次的句子
#while ($sentenses{$sorted_sentenses[$#sorted_sentenses]} < 2) {
#    pop(@sorted_sentenses);
#}
print 'total sentenses: ' . scalar(@sorted_sentenses) . "\n";

# 过滤末字无法接的句子
foreach my $sentense (keys(%sentenses)) {
    my $last_char = substr($sentense, length($sentense) - 1, 1);
    if (!$first_chars{$last_char}) {
        #print "delete $sentense\n";
        delete($sentenses{$sentense});
    }
}
print 'total sentenses: ' . scalar(%sentenses) . "\n";

# 过滤首字无法接的句子
foreach my $sentense (keys(%sentenses)) {
    my $first_char = substr($sentense, 0, 1);
    if (!$last_chars{$first_char}) {
        #print "delete $sentense\n";
        delete($sentenses{$sentense});
    }
}
print 'total sentenses: ' . scalar(%sentenses) . "\n";

foreach my $sentense (keys(%sentenses)) {
    # 建立首字索引
    my $first_char = substr($sentense, 0, 1);
    if (!$char_sentenses{$first_char}) {
        $char_sentenses{$first_char} = [];
    }

    push($char_sentenses{$first_char}, $sentense);
}
        
# 输出索引信息
#dump_index();

# 统计top字出现在各句子的次数
my %char_in_sentense;
foreach my $char (keys(%top_chars)) {
    my $count = 0;
    foreach my $sentense (@sorted_sentenses) {
        $count++ if (index($sentense, $char) >= 0);
    }
    $char_in_sentense{$char} = $count;
}
my @sorted_char_in_sentenses = sort { $char_in_sentense{$a} <=> $char_in_sentense{$b} } keys %char_in_sentense;
#my $i = 0;
#foreach my $c (@sorted_char_in_sentenses) {
#    $i++;
#    print "$i:\t$c\t " . $char_in_sentense{$c} . "\n";
#}

# 以任意一个句子开头，接龙，首字不重复，直至不能接或首尾相连
my %used_first_chars;
my %used_sentenses;
my %used_chars;
my $char_count = 0;
my @sentense_queue;
my $max_char_count = 0;
my $max_depth = 0;
my $max_loop_depth = 0;
my $lastlog = time();
$| = 1; # disable IO buffer

sub traverse {
    my ($sentense, $depth, $progress) = @_;
    
    # 每分钟显示最新进度
    if (time() - $lastlog > 60) {
        print "sentense: $sentense, depth: $depth, count: $char_count, total progress: $progress\n";
        $lastlog = time();
    }
    
    return if ($used_sentenses{$sentense});
    $depth++;
    $used_sentenses{$sentense} = 1; # 标记句子已使用
    mark_chars($sentense);
    push(@sentense_queue, $sentense);
    
    if ($depth != scalar(@sentense_queue)) {
        print "$sentense: $depth\n";
        dump_result();
        exit;
    }
    
    if ($depth > $max_depth) {
        $max_depth = $depth;
        print "new depth: $depth\n";
        dump_result();
    }
    
    # 找到覆盖1000个字的方案（实际上100句职能覆盖500多个字）
    if ($char_count >= 1000) {
        if ($char_count >= 1000) {
            print "found one: $depth\n";
            dump_result();
            exit;
        }
    }
    
    if ($depth >= 100) {
        if ($char_count > $max_char_count) {
            $max_char_count = $char_count;
            print "new record: $max_char_count\n";
            dump_result();
        }
        #print "too deep: $char_count\n";
        pop(@sentense_queue);
        return;
    }
        
    my $last_char = substr($sentense, length($sentense) - 1, 1);
    
    #print $last_char;
    # 找到闭环
    if ($last_char eq substr($sentense_queue[0], 0, 1)) {
        if ($depth > $max_loop_depth) {
            $max_loop_depth = $depth;
            print "new loop: $depth\n";
            dump_result();
        }
        pop(@sentense_queue);
        return;
    }
    
    if ($char_sentenses{$last_char}) {
        my %child_sentenses;
        foreach my $s (@{$char_sentenses{$last_char}}) {
            if (!$used_sentenses{$s}) {
                my $count = count_weight($s);
                if ($count > 0) {
                    $child_sentenses{$s} = $count;
                }
            }
        }
      
        my @sorted_children = sort { $child_sentenses{$b} <=> $child_sentenses{$a} } keys %child_sentenses;
        my $i = 0;
        my $len = scalar(@sorted_children);
        foreach my $s (@sorted_children) {
            $i++;
            my $first_char = substr($s, 0, 1);
            if (!$used_first_chars{$first_char}) {
                $used_first_chars{$first_char} = 1;
                traverse($s, $depth, $progress * ($len - $i + 1));
                delete($used_sentenses{$s});
                delete($used_first_chars{$first_char});
                recount_chars();
            }
        }
        pop(@sentense_queue);
    } else {
        pop(@sentense_queue);
        #print "dead at $sentense: $depth; last char:$last_char\n";
    }
}

# 挑选第一个句子
my %child_sentenses;
foreach my $s (keys(%sentenses)) {
    my $count = count_weight($s);
    if ($count > 0) {
        $child_sentenses{$s} = $count;
    }
}
my @sorted_children = sort { $child_sentenses{$b} <=> $child_sentenses{$a} } keys %child_sentenses;
my $i = 0;
my $len = scalar(@sorted_children);
foreach my $s (@sorted_children) {
    $i++;
    my $first_char = substr($s, 0, 1);
    $used_first_chars{$first_char} = 1;
    traverse($s, 0, $len - $i + 1);
    delete($used_sentenses{$s});
    delete($used_first_chars{$first_char});
    recount_chars();
    print "depth: 0, count: $char_count, total progress: " . ($len - $i + 1) .
        ", progress: $i/$len, weight: " . $child_sentenses{$s} . "\n";
}

sub dump_chars() {
    my $i = 0;
    foreach my $char (@sorted_chars) {
        $i++;
        print "$i\t $char\t $chars{$char}\n";
    }
}

sub dump_sentenses() {
    my $i = 0;
    foreach my $sentense (@sorted_sentenses) {
        $i++;
        print "$i\t $sentense\t $sentenses{$sentense}\n";
    }
}

sub dump_index() {
    foreach my $char (keys(%char_sentenses)) {
        print "$char: " . scalar(@{$char_sentenses{$char}}) . "\n";
    }
}

sub mark_chars {
    my $sentense = shift;
    for (my $i = 0, my $len = length($sentense); $i < $len; $i++) {
        my $char = substr($sentense, $i, 1);
        if ($top_chars{$char} && !$used_chars{$char}) {
          $char_count++;
        }
        $used_chars{$char} = 1;
    }
}

sub count_weight {
    my $sentense = shift;
    my $weight = 0;
    for (my $i = 0, my $len = length($sentense); $i < $len; $i++) {
        my $char = substr($sentense, $i, 1);
        if ($top_chars{$char} && !$used_chars{$char}) {
            if ($char_in_sentense{$char}) {
                $weight += 1 / $char_in_sentense{$char};
            }
        }
    }
    return $weight;
}

sub count_unsed_chars {
    my $sentense = shift;
    my $count = 0;
    for (my $i = 0, my $len = length($sentense); $i < $len; $i++) {
        my $char = substr($sentense, $i, 1);
        if ($top_chars{$char} && !$used_chars{$char}) {
          $count++;
        }
    }
    return $count;
}

sub recount_chars {
    # 清空计算器
    undef %used_chars;
    $char_count = 0;
    
    foreach my $sentense (keys(%used_sentenses)) {
        mark_chars($sentense);
    }
} 

sub dump_result {
    print "chars: " . keys(%used_chars) . "\n";
    print "char_count: $char_count\n";
    my $i = 0;
    foreach my $sentense (@sentense_queue) {
        $i++;
        print "$i: $sentense\n";
    }
}
代码片段