#!/usr/local/bin/perl
#
# file: create-index
# auth: Brad Burdick
# desc: create SEC EDGAR SGML index file
#
# usage: create-index [-D YYMMDD] [-M mod_time] [-m] [-d datadir]
#                     [input_file]
#
##########################################################################
# Copyright (c) 1994, 1995 Internet Multicasting Service
#
# The SEC EDGAR Level 1 Dissemination processing software ("software")
# was developed by the Internet Multicasting Service and may
# be used for academic, research, government, and internal business
# purposes without charge. You may not resell this code or include it
# in a product that you are selling without prior permission of the
# Internet Multicasting Service.
#
# This software is provided ``as is'', without express or implied
# warranty, and with no support nor obligation to assist in its
# use, correction, modification or enhancement. We assume no liability
# with respect to the infringement of copyrights, trade secrets, or any
# patents, and are not responsible for consequential damages. Proper
# use of the software is entirely the responsibility of the user.
##########################################################################

eval 'exec /usr/bin/perl -s $0 ${1+"$@"}'
    if 0;

#
# Overview
#   -m given  : build the master index ($datadir/full-index/master.idx) by
#               scanning the header files found under $datadir/data, then
#               pack it with compress, sit and zip.
#   otherwise : build a daily index
#               ($datadir/daily-index/master.YYMMDD.idx) by filtering the
#               existing master index for recently modified files.
#
# NOTE(review): the usage line mentions [input_file], but nothing in this
# script reads a file name argument.
#

# who am i?
($prog = $0) =~ s#.*/##;

# where we find our local libraries
push(@INC, "/usr/local/ims/lib");

# for processing command line options
require 'getopts.pl';

# header values for index files
require 'index-hdr.pl';

# Edgar date manipulation routines
require 'edgar-date.pl';

# miscellaneous support routines
require 'edgar-util.pl';

# process command line options, if any
&Getopts('D:M:d:m');

# true if master index, otherwise assume daily index
$do_master = defined($opt_m);

# date stamp to use in file name
$date = defined($opt_D) ? "$opt_D" : "";

# last modification time of files to search on
# - compared against -M, which measures age in days, so the default
#   of .25 means roughly the last 6 hours
$mod_time = defined($opt_M) ? "$opt_M" : ".25";

# where to place index files
$datadir = defined($opt_d) ? "$opt_d" : "/ftp/edgar";
&makepath($datadir, 0755);

# where to place daily index file(s)
$daily = "daily-index";

# where to place full index file(s)
$full = "full-index";

# message of the day
$motd = "\n\n" .
    "ATTENTION: Second quarter index data has been archived to the\n" .
    " following subdirectory: edgar/full-index/1995/QTR2/.\n\n" .
    " First quarter index data has been archived to the\n" .
    " following subdirectory: edgar/full-index/1995/QTR1/.\n";

# days from beginning of year to end of last qtr (will change w/ each qtr)
# -- 90 is for 1st qtr (Mar 31)
# -- 181 is for 2nd qtr (Jun 30)
# -- 275 is for 3rd qtr (Sep 30)
# NOTE(review): the value in use (185) matches none of the figures listed
# above, and $start_day goes negative while fewer than 185 days of the
# year have elapsed -- confirm the intended quarter boundary.
@today = localtime;
$start_day = $today[7] - 185;

# command used to create file list for index
$find_args = "data -depth -type f -name '*.sgml' -print";

# archive command lines
$compress = "/bin/compress -c";
$sit = "/usr/local/bin/sit -u -C edgar -o";
$zip = "/usr/local/bin/zip -b /tmp -jlq -9";

# don't allow access to files until we're done
$oldumask = umask(077);

# daily or master index info
@index = ();

# data file's date
# (assumes &edgar_date returns a localtime-style list -- TODO confirm;
# the year is reduced modulo 100 so the stamp stays six digits once the
# years-since-1900 field reaches 100)
@today = &edgar_date;
$date = sprintf("%02d%02d%02d", $today[5] % 100, $today[4]+1, $today[3])
    unless $date;

if ($do_master) {
    $outfile = "$datadir/$full/master.idx";
    @header = @master_hdr;
    # "cd ... &&" rather than "chdir ... ;": cd is the portable shell
    # builtin, and && keeps find from running in the wrong directory
    open(FIND, "(cd $datadir && /bin/find $find_args) |")
        || die "$prog: error getting file list: $!\n";
}
else {
    $outfile = "$datadir/$daily/master.$date.idx";
    @header = @daily_hdr;
    # the daily index is derived from the records of the master index
    open(FIND, "$datadir/$full/master.idx")
        || die "$prog: error getting file list: $!\n";
}

# process index file header
&process_hdr(*header);

#
# main processing loop
#
FILE:
while ($path = <FIND>) {
    chop($path);

    if (! $do_master) {
        # index records start with a CIK; anything else is header text
        next FILE unless ($path =~ /^[0-9]/);

        local($cik,$cname,$form,$date,$file) = split(/\|/, $path);

        # in case we left some files in the install directories
        next FILE if ($file =~ m#/private/#o);

        # assume daily files modified within last $mod_time days
        # (approximately); -M yields undef for a missing file, which
        # would otherwise compare as 0 and be accepted
        # NOTE(review): the "/ftp/" prefix is fixed and does not follow
        # the -d option -- confirm that is intended.
        local($age) = -M "/ftp/$file";
        next FILE unless defined($age) && $age <= $mod_time;

        push(@index, $path);
    }
    else {
        local($current_cik, $name, $type, $filedate);

        # master files are searched from beginning of QTR
        next FILE unless -M "$datadir/$path" <= $start_day;

        # $path should have data?/cik/file.hdr.sgml format at this point
        # (the pieces are not used further by this script)
        ($base, $cik, $file) = split("/", $path);

        # open file to get submission information; warn and next are
        # separate statements so the warning is printed before the jump
        unless (open(IN, "$datadir/$path")) {
            warn "$prog: unable to open $datadir/$path: $!\n";
            next FILE;
        }

        # NOTE(review): the four patterns below (and the prefixes of the
        # matching substitutions) are empty in the copy this was restored
        # from; they presumably named the header tags for company name,
        # CIK, form type and filing date.  Restore them from a pristine
        # copy before relying on master-index output.
        LINE:
        while ($line = <IN>) {
            chop($line);

            if ($line =~ //) {
                ($name = $line) =~ s/(.*)/\1/;
            }
            elsif ($line =~ //) {
                ($current_cik = $line) =~ s/0*(.*)/\1/;
                # handle all zero (0) CIKs
                if (! $current_cik) {
                    $current_cik = "0" x 10;
                }
            }
            elsif ($line =~ //) {
                ($type = $line) =~ s/(.*)/\1/;
            }
            elsif ($line =~ //) {
                ($filedate = $line) =~ s/(.*)/\1/;
            }
            else {
                next LINE;
            }

            # an entry needs all four values
            next LINE unless ($name && $type && $filedate && $current_cik);

            # index file points to *.txt file
            $path =~ s/hdr\.sgml/txt/o;

            # save index entry
            push(@index, sprintf("%.10s|%.60s|%.10s|%.8s|edgar/%.55s",
                                 $current_cik, $name, $type, $filedate,
                                 $path));

            # forget this company so a later name/CIK pair in the same
            # header starts clean; undef takes a single operand, so each
            # variable needs its own call
            undef $name;
            undef $current_cik;
        }
        close(IN);
    }
}
close(FIND);

# master entries come from find in no particular order and must be sorted;
# daily entries keep the order of the master index they were read from
if ($do_master) {
    @sortedindex = sort(@index);
}
else {
    @sortedindex = @index;
}
&dedup(*sortedindex);

open(INDEX, ">$outfile") || die "$prog: unable to open $outfile: $!\n";
print INDEX join("\n", @header), "\n";

# print message of the day if available
print INDEX $motd if ($do_master && $motd);

print INDEX sprintf("\n%-10.10s|%-20.20s|%-10.10s|%-10.10s|%-s\n",
                    'CIK', 'Company Name', 'Form Type', 'Date Filed',
                    'File Name');
print INDEX "-" x 80, "\n";
print INDEX join("\n", @sortedindex), "\n";

# buffered write errors only surface at close
close(INDEX) || die "$prog: error writing $outfile: $!\n";

# file is ready
chmod(0644, $outfile);

# restore umask
umask($oldumask);

# pack index files
if ($do_master) {
    local($cmd);
    foreach $cmd ("$compress master.idx > master.Z",
                  "$sit master.sit master.idx",
                  "$zip master.zip master.idx") {
        # system returns 0 on success; report a failed step but keep going
        system("cd $datadir/$full && $cmd") == 0
            || warn "$prog: packing command failed: $cmd\n";
    }
}

exit 0;

#
# process index header
#   expects fixed format from header - see lib/index-hdr.pl
#
#   argument: typeglob of the header array, edited in place
#   reads:    $date (YYMMDD), @Months, $do_master, $full, $daily
#
sub process_hdr {
    local(*header) = shift;

    # two-digit years below 70 are taken as 20YY, the rest as 19YY
    local($yy) = int(substr($date, 0, 2));
    local($year) = ($yy < 70 ? 2000 : 1900) + $yy;

    # NOTE(review): assumes @Months (defined outside this file) is indexed
    # by month number 1..12 -- confirm.
    local($recv) = sprintf("%s %02d, %04d",
                           $Months[int(substr($date, 2, 2))],
                           substr($date, 4, 2),
                           $year);

    # NOTE(review): the replacement text here is empty; it may have been
    # lost together with the tag patterns in the main loop -- confirm.
    $header[0] =~ s|%s||;

    # last data recv'd date
    $header[1] =~ s/%s/$recv/;

    if ($do_master) {
        $header[6] =~ s|%s|$full/master.idx|;
    }
    else {
        $header[6] =~ s|%s|$daily/master.$date.idx|;
    }
}

#
# delete duplicate record info
#   - assumes they are already sorted
#
#   argument: typeglob of the record list, rewritten in place
#   Two adjacent records are duplicates when company name, form type,
#   date and file name (fourth component of the path field) all match;
#   the CIK field is not part of the comparison.
#
sub dedup {
    local(*list) = shift;
    local($current, $previous) = ('', '');
    local(@tmp) = ();

    for (@list) {
        local($cik,$name,$form,$date,$path) = split(/\|/);
        local($file) = (split(/\//, $path))[3];

        $current = join("#", $name, $form, $date, $file);
        if ($current ne $previous) {
            push(@tmp, $_);
        }
        $previous = $current;
    }

    @list = @tmp;
}