#! /usr/bin/perl

#================================================================
# htmldoctotsv
# Generate TSV data from HTML documents
#================================================================
#
# Usage: htmldoctotsv [-kv] [-x] [path]
#
#   path  directory tree to scan for *.htm / *.html files (default: '.')
#   -kv   emit "key<TAB>title body" records instead of labelled columns
#   -x    print the record key as upper-case hexadecimal instead of decimal
#
# One record per document is written to STDOUT; a progress line per saved
# document is written to STDERR.  Options starting with '-' that are not
# recognised are silently ignored.
#
# NOTE(review): the copy of this script that was reviewed had every
# HTML-looking span inside its regexes stripped out.  The entity, tag and
# charset patterns below are reconstructed from what remained -- please
# confirm them against a pristine copy.

use strict;
use warnings;
use Encode;
use Cwd 'realpath';

# Run the crawl only when executed as a script, so that the helper subs can
# be loaded (require/do) without side effects.
main() unless caller;


# Reduce an HTML fragment to plain text.
#
# Takes one string and returns a new string in which every tag has been
# replaced by a space, the five entities handled here have been decoded,
# every whitespace run has been collapsed to one space, and leading and
# trailing spaces have been removed.
sub trimhtml {
    my $text = shift;
    $text =~ s/<[^>]*>/ /g;
    $text =~ s/&lt;/</g;
    $text =~ s/&gt;/>/g;
    $text =~ s/&quot;/"/g;
    $text =~ s/&nbsp;/ /g;
    # &amp; is decoded last so that "&amp;lt;" yields "&lt;", not "<".
    $text =~ s/&amp;/\&/g;
    $text =~ s/\s+/ /g;
    $text =~ s/^ *//;
    $text =~ s/ *$//;
    return $text;
}


# Split a raw HTML document into (title, body), both as trimmed plain text
# encoded in UTF-8.
#
# Takes the document's raw bytes.  Returns a two-element list; either
# element may be the empty string.
sub _parse_html {
    my ($text) = @_;

    # NOTE(review): charset sniffing is reconstructed (the original pattern
    # was lost); it takes the first charset=... found inside a <meta> tag.
    my $encname = "UTF-8";
    if ($text =~ /<meta[^>]*charset\s*=\s*["']?([-\w.:]+)/i) {
        $encname = $1;
    }
    if ($encname !~ /\Autf-?8\z/i) {
        # decode() croaks on an unknown encoding name, which would abort the
        # whole run because of one bad document; leave the bytes untouched
        # when the declared charset is not known to Encode.
        my $enc = find_encoding($encname);
        $text = encode("UTF-8", decode($enc->name, $text)) if defined $enc;
    }

    # /g: strip every comment, not only the first one.
    $text =~ s/<!--.*?-->//gs;

    # When several <title> elements exist, the last one is used.
    my $title = "";
    my @titles = $text =~ /<title[^>]*>([^<]*)<\/title>/gi;
    $title = trimhtml($titles[-1]) if @titles;

    # Keep only what lies between <body ...> and </body>; a document without
    # those tags is kept whole.
    $text =~ s/.*<body[^>]*>(.*)<\/body>.*/$1/is;

    # /g: drop every style/script element so their source never reaches the
    # extracted text.
    $text =~ s/<style[^>]*>.*?<\/style>//gis;
    $text =~ s/<script[^>]*>.*?<\/script>//gis;

    return ($title, trimhtml($text));
}


# Parse the command line, locate the HTML files under the chosen path and
# print one TSV record per document that has a title or a body.
sub main {
    my $root = '.';
    my $mode_kv = 0;
    my $mode_hex = 0;
    for my $arg (@ARGV) {
        if ($arg =~ /^-/) {
            if ($arg eq '-kv') {
                $mode_kv = 1;
            }
            elsif ($arg eq '-x') {
                $mode_hex = 1;
            }
        }
        else {
            $root = $arg;
        }
    }

    $ENV{LANG} = "C";
    $ENV{LC_ALL} = "C";

    # List-form pipe open: no shell is involved, so a path containing spaces
    # or shell metacharacters is passed to find verbatim, and the regex
    # reaches find with its backslash intact.
    open(my $lfh, '-|', 'find', $root, '-type', 'f',
         '-iregex', '.*\.html?', '-print')
        or die("could not open");
    my @found = <$lfh>;
    close($lfh);
    chomp(@found);

    my $id = 0;
    # Perl's default sort compares byte-wise, like sort(1) under LC_ALL=C.
    for my $found (sort @found) {
        my $path = realpath($found);
        next if !defined($path);
        my @stat = stat($path);
        next if scalar(@stat) < 10;
        my $mtime = $stat[9];

        # Unreadable files are skipped, not fatal.
        open(my $ifh, '<', $path) or next;
        my $raw = do { local $/; <$ifh> };
        close($ifh);
        $raw = '' if !defined($raw);

        my ($title, $text) = _parse_html($raw);
        next if length($title) < 1 && length($text) < 1;

        $id++;
        my $key = $mode_hex ? sprintf("%X", $id) : $id;
        printf STDERR ("%d: saving: %s\n", $id, $path);
        if ($mode_kv) {
            $text = $title . " " . $text if length($title) > 0;
            printf("%s\t%s\n", $key, $text);
        }
        else {
            printf("%s", $key);
            printf("\turl\t%s", $path);
            printf("\tsize\t%s", length($text));
            printf("\tmtime\t%s", $mtime);
            printf("\ttitle\t%s", $title) if length($title) > 0;
            printf("\tbody\t%s", $text);
            printf("\n");
        }
    }
    return;
}

1;

# END OF FILE