tok2WikiFormat.pl
A tok2WikiFormat.pl is a Perl-based Token-to-Wiki program.
- See: Perl Program, Tokenization System.
References
#!/usr/bin/perl use strict ; my $PROGRAM = "_tokToWikiFormat.pl" ; my $CONTACT = '_tokToWikiFormat@at@gabormelli.com' ; my $VERSION = '1.1' ; my $VERSIONDATE = "2009.11.11" ; ######################################################################################### # == SYNOPSIS: # * Convert tokenFile format to wikiFile format. # # == USAGE # * Input: tokenFile # * Input: SynonymToConceptId (hardcoded) # # == TODO # * support the reading of a header (with column names) ######################################################################################### #use warnings ; # PACKAGES use lib '..' ; use lib './lib' ; use lib '../lib' ; use TABularDataFileMgmt ; use TGQueryDataMgmt ; use Getopt::Long; ######################################################################################### # GLOBAL SCALARS my $debug = 2 ; $TABularDataFileMgmt::debug = $debug ; TABularDataFileMgmt::settimestamp() ; my $runningSeconds_=TABularDataFileMgmt::getrunningseconds() ; my $SynonymToConceptIdFile="SynonymToConceptId.dat" ; my $removeDocumentId ; # GLOBAL ARRAYS my @Token ; my @Document ; my @Sentence ; # Some standard column configurations my @TokFileAttrsType ; $TokFileAttrsType[1] ="Token POS Parse ConceptType ConceptId OntologyId BaseVerb SRL1 SRL2 SRL3 SRL4 SRL5 SRL6 SRL7 SRL8" ; $TokFileAttrsType[2] ="DocumentExtId Token POS Parse ConceptType ConceptId OntologyId BaseVerb SRL1 SRL2 SRL3 SRL4 SRL5 SRL6 SRL7 SRL8" ; $TokFileAttrsType[3] ="Token BaseWord POS Parse ConceptType ConceptId OntologyId BaseVerb SRL1 SRL2 SRL3 SRL4 SRL5 SRL6 SRL7 SRL8" ; $TokFileAttrsType[4] ="DocumentExtId Token BaseWord POS Parse ConceptType ConceptId OntologyId BaseVerb SRL1 SRL2 SRL3 SRL4 SRL5 SRL6 SRL7 SRL8" ; $TokFileAttrsType[5] ="Token POS Parse BIO ConceptType ConceptId" ; $TokFileAttrsType[7] ="Token POS Parse Oe1 Oe2 Og2 ChrCntCat CapsCnt NumCnt SpeChrCnt BIO" ; $TokFileAttrsType[8] ="Token POS Parse Oe1 Oe2 Og2 ChrCntCat CapsCnt NumCnt SpeChrCnt BIO ConceptType ConceptId" ; $TokFileAttrsType[9] ="Token POS Oe1 Oe2 Og2 ChrCntCat CapsCnt NumCnt SpeChrCnt BIO BIOp" ; $TokFileAttrsType[10]="Token POS Oe1 Oe2 Og2 ChrCntCat CapsCnt NumCnt SpeChrCnt BIO ConceptType ConceptId" ; # Token input file description my $inputTokFileAttrs ; # will eventually hold the attribute names for the (token) input file my @InputTokFileAttrs ; my $inputTokFileAttrsTypeId = 10 ; # the default configuration my $inputTokFile = "-" ; my $outputFile = "-" ; my $verbose = 0 ; my $help = 0 ; GetOptions ( 'f|inputTokFile=s' => \$inputTokFile, 'o|outputFile:s' => \$outputFile, 'removeDocumentId=s' => \$removeDocumentId, 'inputTokFileAttrsTypeId:s' => \$inputTokFileAttrsTypeId , 'inputTokFileAttrs:s' => \$inputTokFileAttrs , 'd|debug=i' => \$debug, 'v|verbose+' => \$verbose, 'h|help' => \$help ); my $USAGE ; $USAGE .= "USAGE:\n" ; $USAGE .= " Program info: Version: $VERSION($VERSIONDATE) Contact: $CONTACT\n" ; $USAGE .= " Example: ./$PROGRAM -r=svml -f=All -d=1 -p train\n" ; $USAGE .= " PARAMS:\n" ; $USAGE .= " inputTokFile:s def[$inputTokFile]\n" ; $USAGE .= " inputTokFileAttrsTypeId:s def[$inputTokFileAttrsTypeId]\n" ; $USAGE .= " inputTokFileAttrs:s def[$inputTokFileAttrs]\n" ; $USAGE .= " outputFile:s def[$outputFile]\n" ; $USAGE .= " d|debug:i def[$debug]\n" ; $USAGE .= " v|verbose+\n" ; $USAGE .= " h|help\n" ; if ($help) { print $USAGE ; exit ; } ######################################################################################### # Process inputed attributes $debug=$debug+$verbose ; if ($debug>0) { print "DEBUG: Debugging enabled and set to level: $debug.\n" ; } # Test the inputTokFile parameter if (not defined($inputTokFile) ) { print $USAGE ; die ("ERROR a0djla: requires that --inputTokFile parameter be provided.\n\n") ; } if (-d $inputTokFile ) { die("\nERROR aolkdj: The InputFile parameter must be a file not a directory [$inputTokFile].\n") ; } if (not open (INPUTFILE, “< $inputTokFile") ) { die("\nERROR oxlkkd: could not open input file [$inputTokFile].\n") ; } close(INPUTFILE) ; # leave the opening to the subroutine # Test the outputFile parameter my $outputFileHandle ; if (defined($outputFile) and not ($outputFile =~ /\-/)) { if (-d $outputFile ) { die("\nERROR lasjii: The OutputFile parameter must be a file not a directory [$outputFile].\n") ; } if (not open ($outputFileHandle, ">", $outputFile) ) { die("\nERROR aanxks: could not open output file [$outputFile].\n") ; } } if (not defined $inputTokFileAttrs) { $inputTokFileAttrs = $TokFileAttrsType[$inputTokFileAttrsTypeId] ; } @InputTokFileAttrs = split ('\s+', $inputTokFileAttrs) ; # array of attribute names print "DEBUG: inputTokFileAttrs=[@InputTokFileAttrs]\n" if $debug>=3 ; ############## # not using TAB module ... yet print "DEBUG: Read in the SynonymToConceptIdFile[$SynonymToConceptIdFile]\n" if $debug>=1 ; my %ConceptIdToPreferredName ; if (-e $SynonymToConceptIdFile ) { open FILEH, “<$SynonymToConceptIdFile" or die "ERROR: Could not open [$SynonymToConceptIdFile] !*\n" ; while (<FILEH>) { print "." if $debug>=4 ; my @R_ = split /\t/; ##ConceptName ConceptId Preferred #- 5302 0 #1-Nearest Neighbor Range Search Task 4080 1 next if $R_[2] != "1" ; # We only need the preferred name my $conceptId_ = $R_[1] ; my $conceptPrefName_ = $R_[0] ; $ConceptIdToPreferredName{$conceptId_} = $conceptPrefName_ ; print "DEBUG: ConceptIdToPreferredName{$conceptId_} = $conceptPrefName_\n" if $debug>=4 ; } } else { print "DEBUG: not found [$SynonymToConceptIdFile]\n" if $debug>=0 ; } print "\n" if $debug>=4 ; ################################################################ ################################################################ # Read-in the token input file and populate some prelim structures $runningSeconds_=TABularDataFileMgmt::getrunningseconds() ; print "DEBUG: [$runningSeconds_]s READ IN OUR DOCUMENT [$inputTokFile] with [" if $debug>=1 ; TABularDataFileMgmt::ReadTabularFileIntoArrayOfHashes($inputTokFile, \@Token, \@InputTokFileAttrs) ; print $#Token . “] records.\n" if $debug>=1 ; if ($debug>=3) { print "\nTokens :\n" ; TABularDataFileMgmt::WriteArrayOfHashesToTabularFile("-", \@Token, \@InputTokFileAttrs ); } # Populate the @Sentence structure $runningSeconds_=TABularDataFileMgmt::getrunningseconds() ; print "DEBUG: [$runningSeconds_]secs EXTRACT THE DOCUMENT AND SENTENCE INFORMATION\n" if $debug>=2 ; # This is an overkill. TGQueryDataMgmt::PopulateDocumentSentence(\@Token, \@Document, \@Sentence) ; if ($debug>=2) { print "DEBUG: COMPLETED: Documents=[" ; print 1+$#Document; print "] Sentences=[" ; print 1+$#Sentence; print "]\n" ; } ############## my $rawRestiched = "" ; # will contain the initial raw restiched results. my $documentId ; # currently just for debugging my $docSectionFlag=0 ; # special handling when a documentId is encountered. $docSectionFlag=3 if $removeDocumentId ; my $prevBioTag='O' ; my $prevConceptId ; my $cMAnchorText = "" ; my $rawTokenId_ = 0 ; print "DEBUG: PROCESS the inputTokFile\n" if $debug>=1 ; for my $tokenId_ (0 .. $#Token) { $rawTokenId_++ ; my $token_ = $Token[$tokenId_]{Token} ; my $bioTag_ = $Token[$tokenId_]{BIO} ; my $conceptIdTxt_ = $Token[$tokenId_]{ConceptId} ; my $conceptId_ = $conceptIdTxt_ ; $conceptId_ =~ s/[\(\)\s\-]//g ; undef $conceptId_ if ($conceptId_ =~ m/UNLINKED/) ; print "DEBUG: token[$token_] bioTag[$bioTag_] conceptId[$conceptId_] cMAnchorText[$cMAnchorText] prevConceptId[$prevConceptId] docSectionFlag[$docSectionFlag]\n" if $debug>=2 ; my $nextBioTag_ = $Token[$tokenId_+1]{BIO} ; ######## HANDLE DOCUMENT IDENTIFICATION SECTION ######### if ($token_ =~ m/^(\d\d\d\d_[^\s]+)/ or $token_ =~ m/^(10.\d\d\d\d[^\s]+)/) { # the pattern to detect a new document and extract the document identifier $rawRestiched .= "\n$token_\t"; $documentId = $1 ; $documentId =~ s/\//_/g ; print "DEBUG: Found document start [$documentId]\n" if $debug>=3; $docSectionFlag=2 ; next ; } # test whether we need to skip the document section. if ($removeDocumentId and $docSectionFlag) { if ($docSectionFlag > 1) { $docSectionFlag = 1 if ($token_ =~ /[\s]*/ and $token_ =~ /[^\w\d]/) ; next ; } if ($token_ =~ /[^\s]/) { # Look for the first content token. $docSectionFlag=0 ; print "DEBUG: In document content section\n" if $debug>=3; } else { next ; } } ############################################################ # Mark the end of a concept mention (on the previous token). if ($bioTag_ =~ m/[^I]/ and $prevBioTag =~ m/[^O]/) { # if we are not in a continued span (bioTag==I) and the previous one is not in a span (prevBioTag==O). print "DEBUG: End of a concept mention\n" if $debug>=3 ; my $conceptName_ ; if ($prevConceptId) { $conceptName_ = $ConceptIdToPreferredName{$prevConceptId} ; print "DEBUG: prevConceptId[$prevConceptId] => prevConceptName[$conceptName_]\n" if $debug>=2 ; die "ERROR: unrecog concept name for conceptId[$prevConceptId]\n" if (not defined $conceptName_) ; undef $prevConceptId ; } if (defined $conceptName_) { $rawRestiched .= $conceptName_ . “|" . $cMAnchorText } else { $rawRestiched .= $cMAnchorText } $rawRestiched .= "]]” undef $cMAnchorText ; } # Can now commit to a conceptId in the current token (if present). $prevConceptId = $conceptId_ if ($conceptId_ =~ m/\d/ and not $conceptId_ =~ m/\w/) ; if ($bioTag_ =~ m/B/) { # Mark the start of a concept mention $rawRestiched .= " [[" if $bioTag_ =~ m/B/ ; $prevConceptId = $conceptId_ if ($conceptId_ =~ m/\d/) ; $cMAnchorText = $token_ ; print "DEBUG: mark the start of a concept mention (token[$token_]/bioTag=[$bioTag_])\n" if $debug>=4 ; } elsif ($bioTag_ =~ m/O/) { # Commit the token (insert appropriate spacing) # if (not $cMAnchorText ) { print "DEBUG: commit the token [$token_] (not [$cMAnchorText])\n" if $debug>=3 ; my $space_ = " " ; $rawRestiched .= "$space_$token_" } elsif ($bioTag_ =~ m/I/) { $cMAnchorText .= " $token_" if defined $cMAnchorText ; my $space_ = " " ; $rawRestiched .= $space_ ; print "DEBUG: insert the space between [$cMAnchorText])\n" if $debug>=3 ; } elsif (not defined $bioTag_) { # Mark an end of sentence. $rawRestiched .= " <\/s> " ; print "DEBUG: mark the end of a sentence (token[$token_]/bioTag=[$bioTag_])\n" if $debug>=3 ; } else { die "ERROR(laks0hs): bioTag[$bioTag_]\n" ; } $prevBioTag=$bioTag_ ; } #################################### $_ = $rawRestiched ; # recreate the typographical look&feel (e.g. periods, commas, ...) s/ +/ /g ; s/ \. <\/s>/. <\/s>/g ; s/[ ]+([\,:;"\)])/$1/g ; s/(\() /$1/g ; s/\[\ [[]*?<([\/]*?)i>/<$1i>[[/gi ; s/<([\/])*?i>[ ]*?\]\]/]]<$1i>/gi ; s/\[\ [[]+?/[[/g ; s/[ ]+?\]\]/]]/g ; s/\[\ [[]*?\]\]//g ; # reproduce the items that were reformated for convinience (e.g. _BR_ Yahoo_) s/_BR_/<BR>/g ; s/_sq_/``/g ; s/_eq_/''/g ; s/_#_/ # /g ; s/_7_(.+?)_7_/&#$1;/g ; s/Yahoo_/Yahoo!/g ; # fixup some simple mistakes # remove brackets around zero or one characters s/\[\[(.{0,1})\]\]/$1/g ; # ?? (stale?) # append a <tab> after s/2008_(\d+?)\. [\s]+?<\/s>[\s]*?/\n2008_$1\t/g ; if (defined $outputFileHandle) { print $outputFileHandle $_ ; } else { print STDOUT $_ ; }