#!/usr/local/bin/perl
#
# file: ex3480
# auth: Brad Burdick
# desc: Extract SEC EDGAR, U.S. PTO APS/Full-Text, and U.S. PTO Trademark
#       image and text data files from an IBM 3480 1/2" cartridge tape.
#
# usage: ex3480 [-E|-P|[-T image|text]] [-d data_dir] [-f data_file]
#               [-t tape_device] [-v]
#
#        E = Edgar data
#        P = Patent data
#        T = Trademark data
#
##########################################################################
# Copyright (c) 1994, 1995 Internet Multicasting Service
#
# The SEC EDGAR Level 1 Dissemination processing software ("software")
# was developed by the Internet Multicasting Service and may
# be used for academic, research, government, and internal business
# purposes without charge.  You may not resell this code or include it
# in a product that you are selling without prior permission of the
# Internet Multicasting Service.
#
# This software is provided ``as is'', without express or implied
# warranty, and with no support nor obligation to assist in its
# use, correction, modification or enhancement.  We assume no liability
# with respect to the infringement of copyrights, trade secrets, or any
# patents, and are not responsible for consequential damages.  Proper
# use of the software is entirely the responsibility of the user.
##########################################################################
#
eval 'exec /usr/bin/perl -s $0 ${1+"$@"}'
    if 0;

# who am i?  (basename of the script, used as a prefix on every message)
($prog = $0) =~ s#.*/##;

# where we find our local libraries
push(@INC, '/usr/local/ims/lib');

# for processing command line options
require 'getopts.pl';

# for date manipulation routines
require 'edgar-date.pl';

# process command line options, if any
&Getopts('EPT:d:f:t:v');

# current date info
@today = &edgar_date;

# type of processing to perform
$do_edgar     = defined($opt_E);
$do_patent    = defined($opt_P);
$do_trademark = defined($opt_T);

# exactly one processing type must be selected; count them so that every
# combination (including -P together with -T) is rejected
$ntypes = ($do_edgar ? 1 : 0) + ($do_patent ? 1 : 0) + ($do_trademark ? 1 : 0);

if ($ntypes == 0) {
    die "$prog: no processing type specified\n";
}
if ($ntypes > 1) {
    die "$prog: only one type of processing allowed\n";
}

# verbose output?
$verbose = defined($opt_v);

# where to place data file
$datadir = defined($opt_d) ? $opt_d
         : $do_edgar       ? "/in/edgar"
         : $do_patent      ? "/in/patent"
         : $do_trademark   ? "/in/trademark"
         :                   ".";

# default to the current date as output data file name.
# NOTE(review): assumes &edgar_date returns a localtime-style list (year in
# element 5, zero-based month in element 4) -- confirm against
# edgar-date.pl.  The "% 100" keeps the year at two digits whatever form
# it arrives in; values 0-99 are unaffected.
$file = defined($opt_f) ? $opt_f
      : sprintf("%02d%02d%02d", $today[5] % 100, $today[4] + 1, $today[3]);

# patent data files may come on multiple tapes: append a two-digit sequence
# number and advance it until an unused name is found
if ($do_patent) {
    $file .= '01';
    while (-e "$datadir/$file") {
        $file++;
    }
}

# tape device to use - default is the first no-rewind tape device
$tape = defined($opt_t) ? $opt_t : "/dev/rmt/0n";

# tape utilities
$dd = "/bin/dd";
$mt = "/bin/mt";

# data file from tape
$data = "$datadir/$file";

# make sure this data file name does not already exist
# NOTE(review): trademark extraction derives its own output name from the
# tape header, so this check does not protect that output file.
if (-e $data) {
    print "$prog: $data already exists and will not be overwritten!\n";
    exit 1;
}

# which program am i?
if ($do_edgar) {
    &extract_edgar;
}
elsif ($do_patent) {
    &extract_patent;
}
elsif ($do_trademark) {
    &extract_trademark($opt_T);
}

# make the extracted file group-writable.  chmod takes the MODE first and
# the file list second.  Trademark extraction writes "$data.raw" (with
# $data reset from the tape header), so that is the file to adjust.
$written = $do_trademark ? "$data.raw" : $data;
chmod(0664, $written) || warn "$prog: cannot chmod $written: $!\n";

exit 0;

#
# run a shell command and warn (without aborting) if it reports failure
#
# arg:     the command line, passed to the shell as a single string
# returns: the status value returned by system()
#
sub run_cmd {
    local($cmd) = @_;
    local($status) = system($cmd);

    warn "$prog: command failed (status $status): $cmd\n" if $status != 0;
    $status;
}

#
# extract SEC EDGAR data from 3480 tape
#
# Writes the data file to $data and the audit file to
# "$datadir/audit/$file.audit", then rewinds the tape and takes it offline.
#
sub extract_edgar {
    # audit file from tape
    local($audit) = "$datadir/audit/$file.audit";

    # EDGAR data records have a length of 8196 bytes
    #local($ddflags) = "ibs=8196";
    # new format effective 01-03-95
    local($ddflags) = "ibs=32760";

    # make sure an audit directory exists
    if (! -d "$datadir/audit") {
        print "$prog: creating audit directory ..." if $verbose;
        mkdir("$datadir/audit", 0775)
            || warn "$prog: cannot create $datadir/audit: $!\n";
    }

    # be sure we're at the start of tape
    &run_cmd("$mt -f $tape rewind");

    # skip EBCDIC header
    &run_cmd("$mt -f $tape fsf 1");

    # grab the data file
    &run_cmd("$dd if=$tape of=$data $ddflags 2>/dev/null");

    # skip EBCDIC end of data mark and next EBCDIC header
    &run_cmd("$mt -f $tape fsf 2");

    # grab the audit file
    &run_cmd("$dd if=$tape of=$audit $ddflags 2>/dev/null");

    # rewind tape and take it offline (in the background)
    &run_cmd("$mt -f $tape rewoff &");
}

#
# extract Patent data from 3480 tape
#
# Writes the unblocked data file to $data, then rewinds the tape and takes
# it offline.
#
sub extract_patent {
    # Patent data records have a length of 2000 bytes and are 80-byte fixed
    # length records.
    local($ddflags) = "ibs=2000 cbs=80 conv=unblock";

    # be sure we're at the start of tape
    &run_cmd("$mt -f $tape rewind");

    # skip ASCII header
    &run_cmd("$mt -f $tape fsf 1");

    # grab the data file
    &run_cmd("$dd if=$tape of=$data $ddflags 2>/dev/null");

    # rewind tape and take it offline (in the background)
    &run_cmd("$mt -f $tape rewoff &");
}

#
# extract Trademark full-text data
#
# arg: 'text' or 'image' -- selects the dd blocking flags and the output
#      subdirectory ("Text" or "Image") under $datadir.
#
# Resets the global $data to "$datadir/<subdir>/<name>", where <name> is
# the 6 characters at offset 4 of the tape header, and writes the tape
# contents to "$data.raw".  Exits with status 1 on an unknown type and
# dies if the header cannot be read.
#
sub extract_trademark {
    # image or text data?
    local($type) = shift;
    local($ddflags, $subdir);

    if ($type eq 'text') {
        # Trademark text data records have a length of 327 bytes and are
        # blocked 48 records per block (15696)
        $ddflags = "ibs=15696 cbs=327 conv=unblock";
        $subdir  = "Text";
    }
    elsif ($type eq 'image') {
        $ddflags = "ibs=20000";
        $subdir  = "Image";
    }
    else {
        print "$prog: unknown trademark data type ($type)\n";
        exit 1;
    }

    # be sure we're at the start of tape
    &run_cmd("$mt -f $tape rewind");

    # read EBCDIC header to get data file name
    @hdr = `$dd if=$tape ibs=80 conv=ascii 2>/dev/null`;

    # the name occupies header bytes 4..9; without them the output path
    # would degenerate to "$datadir/$subdir/.raw", so refuse to continue
    if (length($hdr[0]) < 10) {
        die "$prog: unable to read header from $tape\n";
    }
    $data = "$datadir/$subdir/" . substr($hdr[0], 4, 6);

    # grab the data file
    &run_cmd("$dd if=$tape of=$data.raw $ddflags 2>/dev/null");

    # rewind tape and take it offline (in the background)
    &run_cmd("$mt -f $tape rewoff &");
}