package TextMining; our $VERSION = 1.0; use Exporter; our @ISA = ('Exporter'); our @EXPORT = 'process_data_file'; use strict; use locale; our $_DEBUG = 1; sub process_data_file { # processes the file with texts and produces their vector representations according parameters my ($config, $filename) = @_; # processing the configuration file and getting the algorithm parameters process_config_file($config); open F, "<:utf8", $filename or die $!; # deleting the dictionary (e.g. during processing multiple inputs) undef %TextMining::dictionary; if ($TextMining::dictionary_file) { # use only words from given dictionary TextMining::read_dictionary(); } # for storing names of classes my %classes; my @texts; my @classes; debug("input:\n"); while () { # line contains CLASS \t TEXT \n chomp; my ($class, $text) = split "\t", uc $_; debug("\tline $.\n") unless $. % 1000; next if @TextMining::processed_classes and not grep {$_ eq $class} @TextMining::processed_classes; TextMining::remove_tags_and_entities($text); TextMining::remove_characters($text); TextMining::remove_short_words($text, $TextMining::min_word_length) if $TextMining::min_word_length; # removing leading, trailing and multiple spaces $text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g; my @words = split / /, $text; unless ($TextMining::dictionary_file) { # leave all words and create the dictionary $TextMining::dictionary{$_}++ for @words; } else { # remove words that are not in the dictionary # and count allowed words # temporarily contains only allowed words from the text my @allowed_words; for (@words) { if (exists $TextMining::dictionary{$_}) { push @allowed_words, $_; $TextMining::dictionary{$_}++; } } @words = @allowed_words; # skipping texts with no (allowed) words next unless @words; } # storing the text and associated class push @texts, [@words]; push @classes, $class; # storing information about existing class $classes{$class}++; } debug("\tlast line: $.\n"); close F; debug("data file is read\n"); # remove words with low frequency from the dictionary if ($TextMining::min_word_frequency and $TextMining::min_word_frequency > 1) { # deleting words with low frequency for (keys %TextMining::dictionary) { delete $TextMining::dictionary{$_} if $TextMining::dictionary{$_} < $TextMining::min_word_frequency; } # deleting words with low frequency from texts for my $text (@texts) { for my $i (reverse 0..$#$text) { splice @$text, $i, 1 unless exists $TextMining::dictionary{$text->[$i]}; } } # deleting texts (and their associated classes) with no words for my $i (reverse 0..$#texts) { unless (@{$texts[$i]}) { splice @texts, $i, 1; $classes{$classes[$i]}--; # decresging the number of texts in the class of removed text splice @classes, $i, 1; } } debug("words with low frequency deleted\n"); } # write the dictionary if necessary unless ($TextMining::dictionary_file) { TextMining::write_dictionary(); debug("dictionary written\n"); } # printing the statistics open S, ">>stat.text" or die $!; print S "class: $_ texts: $classes{$_}\n" for sort keys %classes; print S scalar keys %TextMining::dictionary, " words\n"; print S "\n"; close S; # if only the dictionary should be created return if $TextMining::dictionary_only; # transforming the dictionary into aplhabetically ordered list my @dictionary = sort keys %TextMining::dictionary; my $number_of_words = scalar @dictionary; debug("number words of in dictionary: $number_of_words\n"); # order of words in the vector my %word_order; for my $i (0..$#dictionary) { $word_order{$dictionary[$i]} = $i } if ($TextMining::frequency_type eq 'TF-IDF') { # calculating document frequencies for words for (@texts) { # getting the list of unique words in the text my %words_in_text; map {$words_in_text{$_} = 1} @$_; # increasing document frequency for all words $TextMining::document_frequencies{$_}++ for keys %words_in_text; # increasing the number of texts $TextMining::number_of_documents++; } debug("document frequencies for words calculated\n"); } if ($TextMining::output_format eq 'ARFF') { # @RELATION name # @ATTRIBUTE name TYPE # @DATA # v1,v2,v3...,vn open O, ">:utf8", "vectors.arff" or die $!; print O "\@RELATION data\n\n"; print O "\@ATTRIBUTE $_ NUMERIC\n" for @dictionary; print O "\@ATTRIBUTE _CLASS_ {", join(',', keys %classes), "}\n\n\@DATA\n"; } elsif ($TextMining::output_format eq 'C5') { open O, ">:utf8", "vectors.names"; print O "\|classes\n", join(',', keys %classes), "\n\n"; print O "$_: continuous.\n" for @dictionary; close O; open O, ">", "vectors.data"; } elsif ($TextMining::output_format eq 'GENERIC') { open O, ">:utf8", "vectors.text" or die $!; print O join("$TextMining::delimiter", @dictionary, "_CLASS_"); print O "$TextMining::delimiter",'_NZ_' if $TextMining::print_number_of_nonzero_items; print O "\n"; } debug("output:\n"); # output in more files my $file_number = 1; for my $i (0..$#texts) { # transforming words into a hash (word/term frequency) my %words; $words{$_}++ for @{$texts[$i]}; # the vector representing the text contains only zeroes at the beginning my @_out = (0) x $number_of_words; # number of non zero items in the vector my $nonzero = scalar keys %words; # each existing word changes zero to non zero value for my $word (keys %words) { if ($TextMining::frequency_type eq 'TP') { $_out[$word_order{$word}] = 1; } elsif ($TextMining::frequency_type eq 'TF') { $_out[$word_order{$word}] = $words{$word}; } elsif ($TextMining::frequency_type eq 'TF-IDF') { $_out[$word_order{$word}] = sprintf '%.5f', ($words{$word} / eval join '+', values %words) # term frequency / number of words in a text * log($TextMining::number_of_documents / $TextMining::document_frequencies{$word}); # log (number of texts / document frequency of the word) } } if ($TextMining::output_format eq 'ARFF' or $TextMining::output_format eq 'C5') { print O join(",", @_out, $classes[$i]); print O "\n"; } elsif ($TextMining::output_format eq 'GENERIC') { print O join("$TextMining::delimiter", @_out, $classes[$i]); print O "$TextMining::delimiter$nonzero" if $TextMining::print_number_of_nonzero_items; print O "\n"; } if ($TextMining::max_lines_in_output_file and not (($i+1) % $TextMining::max_lines_in_output_file)) { # maximal number of lines in output file was reached, new file is opened open O, ">", "vector$file_number.data"; $file_number++; } print "\tline $i\n" unless $i % 1000; } close O; } sub remove_tags_and_entities { # removes tags $_[0] =~ s/<[^>]*>/ /gs; # removes entities $_[0] =~ s/\&[^;]*;/ /gs; } sub remove_characters { # removes given characters $_[0] =~ s/[\P{L}0-9_]/ /g; # removes all non-letters } sub remove_short_words { # removes words with low number of characters my (undef, $min) = @_; $min--; $_[0] =~ s/\b\w{1,$min}\b/ /g if $min > 0; } sub write_dictionary { # creating a file with one line, containig all words open(D, ">:utf8", "dictionary.text") or die $!; debug("dictionary size: ". scalar(keys %TextMining::dictionary)."\n"); print D join ",", sort keys %TextMining::dictionary; close D; # creating a file containig all words and their frequencies, sorted by frequencies open(D, ">:utf8", "dictionary_frequencies.text") or die $!; print D join "\n", map {"$_\t$TextMining::dictionary{$_}"} sort {$TextMining::dictionary{$b} <=> $TextMining::dictionary{$a}} keys %TextMining::dictionary; } sub read_dictionary { # reads the file with a dictionary and stores the allowed words open D, "<:utf8", $TextMining::dictionary_file or die "Can't open dictionary file\n"; $_ = ; chomp; %TextMining::dictionary = map {$_ => 0} split /,/; } sub debug { warn $_[0] if $TextMining::_DEBUG; } sub process_config_file { # processing the configuration file my $file = shift; open F, $file or die "Can't open configuration file $file\n"; while () { chomp; next if /^#/ or /^\s*$/; s/#.*//; if (/^frequency_type\s*(\S+)/ ) { $TextMining::frequency_type = uc $1; }; if (/^dictionary_only\s*(\S+)/ ) { $TextMining::dictionary_only = 1 if uc $1 eq 'YES'; }; if (/^min_word_length\s*(\S+)/ ) { $TextMining::min_word_length = $1; }; if (/^min_word_frequency\s*(\S+)/ ) { $TextMining::min_word_frequency = $1; }; if (/^processed_classes\s*(.+)/ ) { @TextMining::processed_classes = split /\s+/, $1; }; if (/^print_number_of_nonzero_items\s*(\S+)/ ) { $TextMining::print_number_of_nonzero_items = 1 if uc $1 eq 'YES'; } if (/^delimiter\s*(\S+)/ ) { $TextMining::delimiter = $1; ; $TextMining::delimiter =~ s/\\t/\t/; } if (/^output_format\s*(\S+)/ ) { $TextMining::output_format = uc $1; }; if (/^max_lines_in_output_file\s*(\S+)/) { $TextMining::max_lines_in_output_file = $1; } if (/^dictionary_file\s*(\S+)/ ) { $TextMining::dictionary_file = $1; }; } close F; # TODO: checking mandatory parameters and the correctness of the parameters } 1;