Times.renameText.pl
From Ball State University Libraries Wiki
Used for the Muncie Times collection.
OmniPage, which is used to OCR the TIFF images, is rather inflexible about how it names its output files. Since CONTENTdm needs text file names that match the TIFF file names, all of the text files have to be renamed after OCR. This script accomplishes that.
The Script
#!/usr/bin/perl
#
# For every folder directly under the working directory, pair each text file
# with a TIFF file by position and rename the text file to "<tiff base>.txt".
# A folder whose TIFF and text counts differ is reported and skipped.
use strict;
use warnings;

my $root = ".";
my @dirs = getFolderNames($root);
foreach my $dir (@dirs) {
    # readdir gives no ordering guarantee, so both lists are sorted before
    # they are paired by index; otherwise the n-th text file could be matched
    # with an arbitrary TIFF. An explicit comparison block is used because
    # "sort NAME(...)" can be parsed as "sort using NAME as the comparator".
    my @tifs = sort { $a cmp $b } getTifNames("$root/$dir");
    my @txts = sort { $a cmp $b } getTxtNames("$root/$dir");
    if (@tifs != @txts) {
        print("Number of files does not match in $dir.\n");
        next;
    }
    for my $i (0 .. $#tifs) {
        renameTxt($txts[$i], $tifs[$i], "$root/$dir");
    }
}
sub getFolderNames {#returns an array of the folders within the working folder
    # Lists the entries of $root that are directories and whose names do not
    # begin with a dot (so ".", ".." and dot-folders are left out).
    # Names are returned bare, without the "$root/" prefix, in the order
    # readdir produced them. Dies if $root cannot be opened.
    my ($root) = @_;

    opendir(my $handle, $root) or die "Unable to open directory: $!";
    my @entries = readdir($handle);
    closedir($handle);

    my @folderNames = grep { (-d "$root/$_") && $_ !~ /^\./ } @entries;
    return @folderNames;
}
sub getTifNames { #returns an array of TIFF file names, minus the file extension
    # $subdir - path of the folder to scan.
    # Returns the base names (extension removed) of the plain files in
    # $subdir whose names end in ".tif" or ".tiff", in readdir order.
    # Dies if the folder cannot be opened, or if a TIFF is found whose name
    # does not follow the MT_<digits>_page-<two digits> convention.
    my($subdir) = @_;
    my @tifNames;
    opendir(my $dh, $subdir) or die "Unable to open directory: $!";
    my @files = readdir($dh);
    closedir($dh);
    foreach my $file (@files) {
        next unless -f "$subdir/$file";
        # Anchored at the end so that "x.tif.txt" or "x.tif.bak" is not
        # counted as a TIFF, and case-insensitive so that "x.TIF" is not
        # silently skipped (the naming check below already ignores case).
        next unless $file =~ /\A(.*)\.tiff?\z/i;
        my $base = $1;    # copy before the next match resets $1
        if ($file !~ /MT_\d+_page\-\d\d\.tif/i) {
            die("Erroneous file name in $subdir: $file.");
        }
        push(@tifNames, $base);
    }
    return @tifNames;
}
sub getTxtNames { #returns an array of text file names, with the extension
    # $subdir - path of the folder to scan.
    # Returns the names (extension included) of the plain files in $subdir
    # whose names end in ".txt", in readdir order.
    # Dies if the folder cannot be opened.
    my($subdir) = @_;
    my @txtNames;
    opendir(my $dh, $subdir) or die "Unable to open directory: $!";
    my @files = readdir($dh);
    closedir($dh);
    foreach my $file (@files) {
        # Anchored at the end so that leftovers such as "page.txt.bak" or
        # "page.txt~" are not counted as text files and do not upset the
        # TIFF/text count comparison made by the caller.
        if (-f "$subdir/$file" && $file =~ /\.txt\z/) {
            push(@txtNames, $file);
        }
    }
    return @txtNames;
}
sub renameTxt {#renames the text file to have the same name as the TIFF file, with appropriate extension
    # $txt - current name of the text file (with extension), inside $dir
    # $tif - TIFF base name (no extension) that the text file should take
    # $dir - folder that holds the text file
    # Prints "old --> new" and returns 1 when the rename succeeded; warns and
    # returns 0 when the rename was refused or failed.
    my($txt, $tif, $dir) = @_;
    my $newName = "$tif.txt";
    my $source  = "$dir/$txt";
    my $target  = "$dir/$newName";

    # rename() may replace an existing target without complaint, which would
    # destroy another page's OCR text. Refuse unless the target is the file
    # itself; names differing only in case are let through so a case-only
    # rename still works on case-insensitive file systems.
    if (lc($txt) ne lc($newName) && -e $target) {
        warn("Not renaming $txt in $dir: $newName already exists.\n");
        return 0;
    }
    # Report success only when the rename actually happened.
    if (!rename($source, $target)) {
        warn("Unable to rename $source to $target: $!\n");
        return 0;
    }
    print("$txt\t-->\t$newName\n");
    return 1;
}
