0byt3m1n1

Path: /data/applications/aps/owl/1.0-0/standard/htdocs/admin/tools/ [ Home ]
File: bigindex.pl
#!/usr/bin/perl
# ************************************************
# Script to index existing pdf and txt files
#
# Set $bEmptyIndexTables (in the options below) to 0 if you do not want
#  the wordidx and searchidx tables emptied before indexing
#  
# Run this only once at the beginning to upgrade an
# existing database.
#
# Change your database name and host name as needed
# in my $data_source = "DBI:mysql:intranet;host=localhost";
#
# Set the owl_fileDir and 
#  pdftotxt_location
#  rtftotxt_location
#  wordtotxt_location
#  cadtotxt_location
#  xlstotext_location TBD
#
# For Windows:
# Check shebang path to Perl:
# #!c:/perl/bin/perl.exe
# 
#
# Check path to pdftotext (C:\windows\system32 or C:\Winnt\system32 or anywhere in
# your path) as in OWL's Admin page
# (I normally have a c:\tools folder and put them all in there and add c:\tools to the
# system path)
# 
# ActivePerl doesn't always install all required DBI/DBD modules for MySQL.
# Use Active Perl's PPM to check your installed packages.
# 
# IMPORTANT: If PDF file's security settings disallow copying text then the contents
# of the file will not be indexed.
# This is not an OWL or pdftotext bug but a drawback caused by the way PDF files are
# secured.
# ************************************************
# Changed by Damiaan Peeters 2005-04-02
# VFJ - http://www.vfj.be
# * Error with spaces in filenames on linux.
# * Added lines to automatically empty the wordidx and searchidx tables
# ************************************************

use DBI;
use strict;
use File::Copy;
use File::Basename;

# EDIT - your database info
#  owldms - the owl mysql database name (part of the DBI data source string)
#  localhost - location of mysql server
#  username/password - the owl mysql user/pass
#  These three values are handed unchanged to DBI->connect below.
#
my $data_source = "DBI:mysql:owldms;host=localhost";
my $username = "owldms"; 
my $password = "your_mysql_owl_password";

#EDIT - point this to the dir in which the root Documents lives.
# Every file path is built as "$owl_FileDir/<folder path>/<filename>",
# so do not add a trailing slash.
#my $owl_FileDir = "c:\\program files\\apache group\\apache\\htdocs\\intranet";
my $owl_FileDir = "/home/owldms";

#EDIT - your text parsers, if you have them.
# Each one is run by IndexAFile() for the matching file type:
#   pdftotxt  -> .pdf          wordtotxt -> .doc
#   rtftotxt  -> .rtf          cad2text  -> .dxf / .dwg
# (Windows example; note the doubled backslash inside double quotes)
#my $pdftotxt_location = "c:\\pdftotext.exe";
my $pdftotxt_location = "/usr/bin/pdftotext";
my $wordtotxt_location = "/usr/bin/antiword";
my $rtftotxt_location = "/usr/bin/unrtf";
my $cad2text_location = "/usr/local/bin/cad2text";

#EDIT - options
# $bEmptyIndexTables: when true, every row of wordidx and searchidx is
#   deleted before indexing starts.
# $deletefileafter: when 1, IndexAFile() unlinks its temporary
#   /tmp/<file id>.text file once the file has been indexed.
my $bEmptyIndexTables = 0;	# empty the index tables before we start
my $deletefileafter=1;		# delete tmp txt files created by the parsers

#
# NO MORE EDITS
#
# Open the database connection used by the whole script.
#   RaiseError => 1 : any later DBI failure dies with the driver's message
#   AutoCommit => 0 : nothing is written permanently until $dbh->commit
my $dbh = DBI->connect( $data_source, $username, $password,
	{RaiseError=>1,AutoCommit=>0})
   or die "Can't connect to $data_source: $DBI::errstr\n";

my $owlfileid;

# empty the index tables if requested
if ($bEmptyIndexTables)
{
	# Both tables are handled the same way, so loop instead of repeating
	# the prepare/execute pair (and re-declaring the same lexicals).
	for my $table ('wordidx', 'searchidx')
	{
		print ("EMPTY table $table\n");
		# do() returns "0E0" (true) when zero rows were deleted, so the
		# "or die" only fires on a real error if RaiseError is ever disabled.
		$dbh->do("delete from $table")
			or die "Cant delete everything from $table: " . $dbh->errstr . "\n";
	}
}

#first, lets read in the word index data, we reuse this over and over
#  %wordindex     : word => wordid for every word already in wordidx
#  $nextwordindex : the wordid that will be given to the next new word
my %words=();
my %wordindex=();
my $nextwordindex=1;

my $getwrdidx = $dbh->prepare(q{select wordid,word from wordidx}) or 
  die "Cant get word index from Owl database to start off with\n";

my $rc = $getwrdidx->execute or die "Cant execute word grab from db\n";

my $wordid;
my $word;

while(($wordid,$word) = $getwrdidx->fetchrow_array)
{
  $wordindex{$word} = $wordid;

  # The next free id has to be strictly greater than every id already in
  # the table; storing the maximum itself would hand an existing wordid to
  # the first new word.
  if ($wordid >= $nextwordindex)
  {
    $nextwordindex=$wordid+1;
  }
}

# Main loop: read every row of the files table, then index each file that
#  - has a parent folder id,
#  - has an extension IndexAFile() can handle, and
#  - has no rows in searchidx yet.
my $readallfileinfo = $dbh->prepare(q{select parent,id,filename from files});
$readallfileinfo->execute;
my $allfiles = $readallfileinfo->fetchall_arrayref;   # rows of [parent, id, filename]
my $pidcount = scalar @$allfiles;

print "pidcount = $pidcount\n";

# Prepared once and reused with a placeholder.  count(*) is fetched
# explicitly because DBI does not guarantee a meaningful rows() value for
# SELECT statements.
my $chkdbl = $dbh->prepare(q{select count(*) from searchidx where owlfileid = ?});

# Index files we have parsers for.
# Add other types here and a parser in IndexAFile().
# The extension is anchored to the end of the name so that ".c" does not
# also accept ".csv", ".class", or a ".c" somewhere inside a folder name.
# (dwg and dxf are deliberately left out, as before.)
my $indexable = qr/\.(?:txt|log|pl|html?|c|pdf|doc|rtf)\z/i;

for my $row (@$allfiles)
{
  my ($fileid, $realfileid, $efname) = @$row;

  next unless $fileid;                                  # no parent folder
  next unless defined $efname && $efname =~ $indexable; # no parser for it

  #Don't index a owlfileid if its already in the index
  $chkdbl->execute($realfileid);
  my ($already) = $chkdbl->fetchrow_array;
  $chkdbl->finish;
  next if $already;

  # The folder path is only looked up for files that really get indexed.
  my $filepath=$owl_FileDir."/".get_dirpathfs($fileid)."/".$efname;
  IndexAFile($filepath,$realfileid);
}

$dbh->commit;
$dbh->disconnect;



#-----------------------------------------------------------------------------
# IndexAFile($filename, $fileidnum)
#   $filename  - full path of the document on disk
#   $fileidnum - owl file id, stored in searchidx.owlfileid
#
# Produces a plain text version of the document in /tmp/<fileidnum>.text
# (by running the matching converter, or by copying files that already are
# text) and then adds one searchidx row per distinct word found in that
# text.  Words not yet known are added to wordidx and to %wordindex first.
# The temporary file is removed afterwards when $deletefileafter is 1.
# Nothing is returned; files that cannot be converted are skipped with a
# WARNING line.

sub IndexAFile 
{
	my $filename=$_[0];
	my $fileidnum=$_[1];
	my $tmp="/tmp/$fileidnum.text";

	print ("INDEX: $filename\n");

	# Converters that print the extracted text on standard output.
	my %stdout_parser_for = (
		doc => $wordtotxt_location,
		rtf => $rtftotxt_location,
		dxf => $cad2text_location,   # AutoCAD
		dwg => $cad2text_location,   # AutoCAD
	);

	# Only the real extension (text after the last dot) decides the parser.
	my ($ext) = $filename =~ /\.([^.\/\\]+)\z/;
	$ext = defined $ext ? lc($ext) : '';

	# Remove any text file left behind by an earlier run, so that a failed
	# conversion can never lead to stale content being indexed.
	unlink($tmp);

	# The converters are started without a shell (list form), therefore
	# spaces, quotes or $ characters in file names are harmless.  Failures
	# are detected below, when $tmp cannot be opened.
	if ($ext eq 'pdf')
	{
		# pdftotext writes the output file itself
		system($pdftotxt_location, $filename, $tmp);
	}
	elsif (exists $stdout_parser_for{$ext})
	{
		_parser_output_to_file($stdout_parser_for{$ext}, $filename, $tmp);
	}
	elsif ($ext =~ /\A(?:txt|log|pl|html?|c)\z/)
	{
		copy($filename, $tmp); #cp to tmp
	}
	else
	{
		print "WARNING: Cannot index unknown file type: $filename\n";
		return;
	}

	my $in;
	unless (open($in, '<', $tmp))
	{
		print "WARNING: No text could be extracted from $filename: $!\n";
		return;
	}

	# Prepared once per file instead of once per word.
	my $addsrchidx = $dbh->prepare(q{insert into searchidx(wordid,owlfileid) VALUES(?,?)});
	my $addwrdidx  = $dbh->prepare(q{insert into wordidx(wordid,word) VALUES (?,?)});

	my %words=();   # words already recorded for this file

	print "Working ";
	while (my $line = <$in>)
	{
		print ".";
		# chomp, not chop: a last line without newline must keep its
		# final character
		chomp($line);
		while ($line =~ /([a-zA-Z][A-Za-z\']*)/g)
		{
			my $w=lc($1);
			next if $words{$w}++;   # one searchidx row per word and file

			unless ($wordindex{$w})
			{
				# new word: give it the next free id and store it
				$wordindex{$w}=$nextwordindex++;
				$addwrdidx->execute($wordindex{$w},$w);
			}
			$addsrchidx->execute($wordindex{$w},$fileidnum);
		}
	}# end of while
	print "finished\n";
	close($in);

	# $tmp is always a scratch copy, so the option alone decides whether it
	# is kept; the option itself is never modified here.
	if ($deletefileafter==1)
	{
		unlink($tmp);  #delete
	}
	return;
} #end of indexafile subroutine

# Runs $program with $source as its only argument (no shell involved) and
# writes everything the program prints on standard output to $dest.
# Returns true when the program could be started and $dest was written.
sub _parser_output_to_file
{
	my ($program, $source, $dest) = @_;

	open(my $from, '-|', $program, $source) or return 0;

	my $to;
	unless (open($to, '>', $dest))
	{
		close($from);
		return 0;
	}

	while (my $chunk = <$from>)
	{
		print {$to} $chunk;
	}
	close($from);
	return close($to);
}


#-----------------------------------------------------------------------------
# fid_to_name($parent)
#   $parent - id of a row in the folders table
# Returns the name column of that folder, or undef when no folder has
# this id.
sub fid_to_name #($parent)
{
	my $parent=$_[0];

	# A placeholder keeps the statement valid even when $parent is undef or
	# empty (it then simply matches no row).
	my $sth = $dbh->prepare(q{select name from folders where id = ?});
	$sth->execute($parent);

	# Only the first matching row is of interest; finish() releases the
	# handle without fetching the rest.
	my ($name) = $sth->fetchrow_array;
	$sth->finish;

	return $name;
}



#-----------------------------------------------------------------------------
#get_dirpathfs : Get Directory Path Forward Slash
# get_dirpathfs($parent)
#   $parent - id of the folder a file lives in
# Returns the folder names from the top of the tree down to $parent, joined
# with "/" and without leading or trailing slash.  Climbing stops at folder
# id 1, at a folder whose parent is 0 or missing, or when a folder id shows
# up a second time.
sub get_dirpathfs
{
  my $parent = $_[0];
  my $name = fid_to_name($parent);
  my $navbar = defined $name ? "$name" : "";
  my $new = $parent;

  # Prepared once, executed once per level with a placeholder.
  my $dp = $dbh->prepare(q{select parent from folders where id = ?});
  my %seen;   # folder ids already visited on the way up

  while ($new != 1)
  {
    # A loop in the parent links would otherwise never terminate.
    if ($seen{defined $new ? $new : ''}++)
    {
      warn "get_dirpathfs: parent loop detected at folder id $new\n";
      last;
    }

    $dp->execute($new);
    my ($newparentid) = $dp->fetchrow_array;
    $dp->finish;

    # no row, NULL or 0: there is nothing above this folder
    last unless $newparentid;

    $name = fid_to_name($newparentid);
    $navbar = (defined $name ? $name : "") . "/" . $navbar;
    $new = $newparentid;
  }
  return $navbar;
}