#!/usr/bin/perl
#
# Looks up the sector and industry of every index constituent on an archived
# (web.archive.org) copy of its Yahoo! Finance page and writes the results to
# sectors.csv as "ticker,sector,industry" rows.
#
# Usage: see the -help text below.  With -makeup only the names listed in
# makeup.dat are processed; otherwise R3000-2004.csv is used.
#
# Progress is reported on STDERR: one line per ticker, built from "." for each
# successful step and the two-character flags listed in the error key.

use strict;
use warnings;

use LWP::Simple;
use Getopt::Long;

# Run the collection only when this file is executed as a script, so that the
# helper subs can be loaded (e.g. with require) without any side effects.
unless (caller) {
    main();
    exit;
}

# Top-level driver: parses options, reads the constituent list, fetches one
# archived page per ticker and records sector/industry or a problem flag.
sub main {
    # Both option variables must be declared in one parenthesised list;
    # "my $makeup, $help;" would declare only $makeup.
    my ($makeup, $help);
    GetOptions(makeup => \$makeup, help => \$help);
    die "usage: $0 [-makeup]\nwhere -makeup runs for only the names in makeup.dat\n"
        if $help;

    my ($dt_start, $tm_start) = GetCurrentDateTime();
    print STDERR "Starting up at $dt_start $tm_start.\n";
    print STDERR "Error key:\n";
    print STDERR " w! = could not retrieve web page\n";
    print STDERR " n! = could not get company name\n";
    print STDERR " N! = name did not match closely\n";
    print STDERR " s! = could not get sector\n";
    print STDERR " i! = could not get industry\n";

    # Input and output files are addressed relative to this directory.  A
    # failed chdir is reported but not fatal, so the script can still be run
    # from a directory that already holds the input files.
    my $invest_dir = "/Volumes/LEXAR/diss/Signing Volume";
    chdir $invest_dir
        or warn "Cannot change to directory '$invest_dir': $!\n";

    # Map ticker => company name from the selected constituent file.
    my $index_file = $makeup ? "makeup.dat" : "R3000-2004.csv";
    my ($index_syms, $index_names) = ReadInIndexConstituents($index_file);
    my %names;
    @names{@$index_syms} = @$index_names;
    # delete $names{ICGED};

    # NOTE(review): the original retry count was lost when this file was
    # mangled (only "... > 0) {" of the loop header survived); 3 is a guess.
    my $max_retries = 3;

    my @problem_names;
    open(my $sec_fh, '>', "sectors.csv")
        or die "Cannot open file to save sectors and industries!";

    foreach my $ticker (sort keys %names) {
        print STDERR "$ticker.";

        # get archive page
        my $yahoo_url       = "http://finance.yahoo.com/q/in?s=$ticker";
        my $archlist_url    = "http://web.archive.org/web/*/$yahoo_url";
        my $archlist_string = get($archlist_url);
        my $oldpage_url     = FindArchivedPageUrl($archlist_string, $yahoo_url);

        # No usable snapshot (or the listing itself could not be fetched):
        # record the ticker as a problem instead of dropping it silently.
        my $data_string = defined $oldpage_url
            ? FetchWithRetries($oldpage_url, $max_retries)
            : undef;

        if (defined $data_string) {
            print STDERR ".";

            # Name check is currently disabled:
            # my($name) = ($data_string =~ /.+ - (.+) - /);
            # my $name_flag =
            #     (!defined $name) ? "n!" :
            #     (!NameMatches($name, $names{$ticker})) ? "N!" : ".";
            # print(STDERR $name_flag);
            # if ($name_flag eq "N!") {
            #     print STDERR "'$name' vs '$names{$ticker}'";
            # }

            # A single pattern captures both values, so they are either both
            # defined (match) or both undefined (no match).
            my ($sector, $industry) = ($data_string =~ /Sector:<\/td><td [^>]+><a href=[^>]+>(.+)<\/a><\/td><\/tr><tr><td .+>Industry:<\/td><td [^>]+><a href=[^>]+>([^<]+)<\/a><\/td>/);

            my $sector_flag = (defined $sector) ? "." : "s!";
            print(STDERR $sector_flag);
            my $industry_flag = (defined $industry) ? "." : "i!";
            print(STDERR $industry_flag);

            if ($sector_flag eq "s!" and $industry_flag eq "i!") {
                # "r!" is not listed in the error key printed at start-up.
                print STDERR "r!";
                push @problem_names, $ticker;
            }
            else {
                print {$sec_fh} "$ticker,$sector,$industry\n";
            }
        }
        else {
            # couldn't get data
            print STDERR "w!";
            push @problem_names, $ticker;
        }

        print STDERR "\n";
    }

    # Buffered write errors only surface at close time.
    close($sec_fh)
        or warn "Problem closing sectors.csv: $!\n";

    # printf, not print: the count has to be interpolated into the %d.
    printf STDERR "%d Problem names:\n", scalar(@problem_names);
    print STDERR join(" ", @problem_names);
    print STDERR "\n";

    my ($dt_end, $tm_end) = GetCurrentDateTime();
    print STDERR "Stopping at $dt_end $tm_end.\n";
    return;
}

# Picks the URL of an archived snapshot of $yahoo_url out of the archive
# listing page text.  Returns undef (empty list in list context) when the
# listing is undefined or contains no recognisable snapshot link.
#
# FIXME(review): the original link pattern and the rule used to choose among
# several snapshots were lost when this file was mangled; only the start of
# the match, "my($date, $time) = ($wb_line =~ /", survived.  The pattern
# below assumes links of the form .../web/YYYYMMDDhhmmss/... and takes the
# first one found -- confirm against the real listing markup and the intended
# snapshot date before relying on the output.
sub FindArchivedPageUrl {
    my ($archlist_string, $yahoo_url) = @_;
    return unless defined $archlist_string;

    foreach my $wb_line (split /\n/, $archlist_string) {
        my ($date, $time) = ($wb_line =~ m{<a href="[^"]*/web/(\d{8})(\d{6})/});
        next unless defined $date;
        return "http://web.archive.org/web/$date$time/$yahoo_url";
    }
    return;
}

# Fetches $url with LWP::Simple::get, trying up to $retries times and
# sleeping one second after each failed attempt.  Returns the page content,
# or undef if every attempt failed.
sub FetchWithRetries {
    my ($url, $retries) = @_;
    while ($retries > 0) {
        my $content = get($url);
        return $content if defined $content;
        $retries--;
        sleep 1;
    }
    return;
}

# Reads a comma-separated constituent file and returns two array references:
# the first field of every line (tickers) and the second field (names), in
# file order.  Blank lines are skipped and a trailing carriage return is
# removed.  Dies if the file cannot be opened.
sub ReadInIndexConstituents {
    my ($filename) = @_;
    open(my $idx_fh, '<', $filename)
        or die "Cannot read in index file '$filename'!\n";

    my (@tickers, @names);
    while (my $line = <$idx_fh>) {
        chomp $line;
        $line =~ s/\r\z//;              # tolerate CRLF line endings
        next unless $line =~ /\S/;      # a blank line is not a constituent
        my ($tkr, $nm) = split /,/, $line;
        push @tickers, $tkr;
        push @names,   $nm;
    }
    close $idx_fh;

    return (\@tickers, \@names);
}

# Returns the current local date as "YYYYMMDD" and time as "HH:MM:SS".
sub GetCurrentDateTime {
    my ($ss, $mm, $hh, $dy, $mo, $ccyy) = localtime();
    my $date = sprintf("%04d%02d%02d", $ccyy+1900, $mo+1, $dy);
    my $time = sprintf("%02d:%02d:%02d", $hh, $mm, $ss);
    return ($date, $time);
}

# Reduces a company name to a comparison key: lower-cased, with the first
# "the ", the first occurrence of each common corporate suffix, the
# punctuation characters , . ' - ! and trailing whitespace removed.
sub NormalizeCompanyName {
    my ($name) = @_;
    $name = lc($name);
    $name =~ s/\bthe //;    # \b keeps words such as "bathe " intact
    $name =~ s/\b(limited|ltd)\b//;
    $name =~ s/\b(international|intl|int'l)\b//;
    $name =~ s/\b(group|grp)\b//;
    $name =~ s/\b(incorporated|inc)\b//;
    $name =~ s/\b(corporation|corp)\b//;
    $name =~ s/\b(company|comp|co)\b//;
    $name =~ s/[,.'\-!]//g;
    $name =~ s/\s+$//;
    return $name;
}

# Returns true when the two company names are equal after normalisation.
# Both names go through the same normalisation, so the result does not depend
# on argument order.  An undefined name never matches.
sub NameMatches {
    my ($name1, $name2) = @_;
    return 0 unless defined $name1 and defined $name2;
    return (NormalizeCompanyName($name1) eq NormalizeCompanyName($name2));
}