#!/usr/bin/perl ####################################################### # # Runs scripts to fit trade direction GLMM. # Fitting requires alternating between: # 1) Estimating the prevailing bids and asks given the # quote delay distribution parameters; # 2) Merging those estimates with the other covariates # needed for fitting the GLMM; # 3) Fitting the GLMM; # 4) Using the fit output to update the best guess of # the parameters and the inverse Hessian (i.e. the # parameter covariance matrix); and, # 5) Returning to step 1 or stopping if the parameter # estimates are "stable enough". # # For performance reasons, step 1 is done in a # distributed fashion. The optimization of parameters # which are not differentiable (delay parameters and # tau) is done by a homemade DFO versions of the BFGS # optimization method. # # Dale W.R. Rosenthal # $Id: RunTradeDirectionGLMMFitting.pl,v 1.5 2008/06/03 15:51:10 dale Exp dale $ ####################################################### use strict; no strict "subs"; use Math::MatrixReal; my $trades_dir = "/home2/dale/trds"; my $streamnbbo_dir = "/home2/dale/streamnbbo"; my $expected_quotes_dir = "/home2/dale/expbidask"; my $glmm_data_dir = "/home2/dale/r"; my $universe_file = "/home2/dale/sectors.full.csv"; $SIG{CHLD} = "IGNORE"; # so forked processes go away when they exit Log("Starting $0"); LogLineStart("Loading sectors..."); my $sectors = ReadInUniverseAndSectors($universe_file); LogLineAppend("done."); LogLineEnd(); LogLineStart("Loading all trades in our universe..."); #my $all_trades; my $all_trades = LoadAllTrades($trades_dir, $sectors); LogLineAppend("done."); LogLineEnd(); # Params: nu, mu, k~3, k~4, tau, betas (intercept, midpoint test, # EMO-ish test w/tau bandwidth, tick test), phis (midpoint, EMO-ish, # tick), and random effect parameters sigma^2_c and sigma^2_d # Initial params are for NYSE, AMEX, Nasdaq, and then global tau my @history; my @initial_param_vals = (2.06,0.5,0,0, 1.5,0.8,0,0, 2.6,0.8,0,0, 0.001); #, (0) x 21); my $params = Math::MatrixReal->new_from_cols([[@initial_param_vals]]); my $numparams = scalar @initial_param_vals; my $I = Math::MatrixReal->new_diag([(1) x $numparams]); my $direction_set = $I->clone(); my $direction = $params->shadow(); my $trial_points = Math::MatrixReal->new($numparams+1, $numparams); my $inv_hessian = Math::MatrixReal->new_diag([(0.1) x $numparams]); my $grad = $params->shadow(); my $old_grad = $grad->shadow(); my $old_grad_negloglik; my $grad_norm = 1; # set large so we enter while loop # initial parameters evaluation my $iter_ct = 1; my @tmp_paramz; # temp variable #my($est_quotes, $glmm_dataset, $base_negloglik, @glmm_params); $params->each(sub { push @tmp_paramz, $_[0] }); my $est_quotes = EstimateQuotes($all_trades, $expected_quotes_dir, $streamnbbo_dir, $iter_ct, @tmp_paramz); my $glmm_dataset = CreateGLMMDataset($glmm_data_dir, $iter_ct, $all_trades, $est_quotes, $sectors, $tmp_paramz[$numparams-1]); my($base_negloglik, @glmm_params) = FitGLMMR($iter_ct, $glmm_data_dir, $glmm_dataset); push @tmp_paramz, @glmm_params; $history[$iter_ct-1] = [$base_negloglik, @tmp_paramz]; Log("-"x30); $iter_ct++; # right step parameters evaluation undef(@tmp_paramz); my($rightparams) = NewRightParams($iter_ct, $numparams-1, $params, $direction_set); $rightparams->each(sub { push @tmp_paramz, $_[0] }); $est_quotes = EstimateQuotes($all_trades, $expected_quotes_dir, $streamnbbo_dir, $iter_ct, @tmp_paramz); $glmm_dataset = CreateGLMMDataset($glmm_data_dir, $iter_ct, $all_trades, $est_quotes, $sectors, $tmp_paramz[$numparams-1]); my $right_negloglik; ($right_negloglik, @glmm_params) = FitGLMMR($iter_ct, $glmm_data_dir, $glmm_dataset); push @tmp_paramz, @glmm_params; $history[$iter_ct-1] = [$right_negloglik, @tmp_paramz]; Log("-"x30); $iter_ct++; # left step parameters evaluation undef(@tmp_paramz); my($leftparams) = NewLeftParams($iter_ct, $numparams-1, $params, $direction_set); $leftparams->each(sub { push @tmp_paramz, $_[0] }); $est_quotes = EstimateQuotes($all_trades, $expected_quotes_dir, $streamnbbo_dir, $iter_ct, @tmp_paramz); $glmm_dataset = CreateGLMMDataset($glmm_data_dir, $iter_ct, $all_trades, $est_quotes, $sectors, $tmp_paramz[$numparams-1]); my $left_negloglik; ($left_negloglik, @glmm_params) = FitGLMMR($iter_ct, $glmm_data_dir, $glmm_dataset); push @tmp_paramz, @glmm_params; $history[$iter_ct-1] = [$left_negloglik, @tmp_paramz]; Log("-"x30); $iter_ct++; # interpolated step parameters evaluation undef(@tmp_paramz); ($params) = NewInterpolatedParams($iter_ct, $numparams-1, $base_negloglik, $left_negloglik, $right_negloglik, $params, $direction_set); $params->each(sub { push @tmp_paramz, $_[0] }); $est_quotes = EstimateQuotes($all_trades, $expected_quotes_dir, $streamnbbo_dir, $iter_ct, @tmp_paramz); $glmm_dataset = CreateGLMMDataset($glmm_data_dir, $iter_ct, $all_trades, $est_quotes, $sectors, $tmp_paramz[$numparams-1]); ($base_negloglik, @glmm_params) = FitGLMMR($iter_ct, $glmm_data_dir, $glmm_dataset); push @tmp_paramz, @glmm_params; $history[$iter_ct-1] = [$base_negloglik, @tmp_paramz]; Log("-"x30); $iter_ct++; while ($grad_norm > 1e-8) { $trial_points->assign_row(1, ~$params); $old_grad_negloglik = $base_negloglik; DIRECTION: for (my $k = 0; $k < $numparams; $k++) { # initial parameters evaluation already done my $params = ~$trial_points->row($k+1); if (($direction_set->element($k+1, 3) == 1 and $params->element(1,1) <= 4) or ($direction_set->element($k+1, 4) == 1 and $params->element(1,1) <= 7) or ($direction_set->element($k+1, 7) == 1 and $params->element(5,1) <= 4) or ($direction_set->element($k+1, 8) == 1 and $params->element(5,1) <= 7) or ($direction_set->element($k+1, 11) == 1 and $params->element(9,1) <= 4) or ($direction_set->element($k+1, 12) == 1 and $params->element(9,1) <= 7)) { $trial_points->assign_row($k+2, ~$params); next DIRECTION; } # right step parameters evaluation undef(@tmp_paramz); my($rightparams, $rightstep) = NewRightParams($iter_ct, $k, $params, $direction_set); $rightparams->each(sub { push @tmp_paramz, $_[0] }); $est_quotes = EstimateQuotes($all_trades, $expected_quotes_dir, $streamnbbo_dir, $iter_ct, @tmp_paramz); $glmm_dataset = CreateGLMMDataset($glmm_data_dir, $iter_ct, $all_trades, $est_quotes, $sectors, $tmp_paramz[$numparams-1]); my $right_negloglik; ($right_negloglik, @glmm_params) = FitGLMMR($iter_ct, $glmm_data_dir, $glmm_dataset); push @tmp_paramz, @glmm_params; $history[$iter_ct-1] = [$right_negloglik, @tmp_paramz]; Log("-"x30); $iter_ct++; # left step parameters evaluation undef(@tmp_paramz); my($leftparams, $leftstep) = NewLeftParams($iter_ct, $k, $params, $direction_set); $leftparams->each(sub { push @tmp_paramz, $_[0] }); $est_quotes = EstimateQuotes($all_trades, $expected_quotes_dir, $streamnbbo_dir, $iter_ct, @tmp_paramz); $glmm_dataset = CreateGLMMDataset($glmm_data_dir, $iter_ct, $all_trades, $est_quotes, $sectors, $tmp_paramz[$numparams-1]); my $left_negloglik; ($left_negloglik, @glmm_params) = FitGLMMR($iter_ct, $glmm_data_dir, $glmm_dataset); push @tmp_paramz, @glmm_params; $history[$iter_ct-1] = [$left_negloglik, @tmp_paramz]; Log("-"x30); $iter_ct++; # interpolated step parameters evaluation undef(@tmp_paramz); my($interpparams, $interpstep) = NewInterpolatedParams($iter_ct, $k, $base_negloglik, $left_negloglik, $right_negloglik, $params, $direction_set); $interpparams->each(sub { push @tmp_paramz, $_[0] }); $est_quotes = EstimateQuotes($all_trades, $expected_quotes_dir, $streamnbbo_dir, $iter_ct, @tmp_paramz); $glmm_dataset = CreateGLMMDataset($glmm_data_dir, $iter_ct, $all_trades, $est_quotes, $sectors, $tmp_paramz[$numparams-1]); my $interp_negloglik; ($interp_negloglik, @glmm_params) = FitGLMMR($iter_ct, $glmm_data_dir, $glmm_dataset); push @tmp_paramz, @glmm_params; $history[$iter_ct-1] = [$interp_negloglik, @tmp_paramz]; Log("-"x30); $iter_ct++; # Require a parameter change to be a descent direction my $min_negloglik = min($base_negloglik, $left_negloglik, $right_negloglik, $interp_negloglik); if ($base_negloglik == $min_negloglik) { $base_negloglik = $interp_negloglik; # $params is already set } elsif ($interp_negloglik = $min_negloglik) { $base_negloglik = $interp_negloglik; $params = $interpparams->clone(); } elsif ($left_negloglik == $min_negloglik) { $base_negloglik = $left_negloglik; $params = $leftparams->clone(); } elsif ($right_negloglik == $min_negloglik) { $base_negloglik = $right_negloglik; $params = $rightparams->clone(); } # do Broyden rank 2 update of inverse hessian if the # likelihood is concave in this neighborhood. # if ($right_negloglik + $left_negloglik > 2*$base_negloglik and # $interpstep->norm_p(1) > 0) { # Log("Updating inverse Hessian approximation."); # # gradient change is only due to one CD move # # scale change for distance moved and apportion # # to scaled step size # my $grad_chg = ($interp_negloglik - $base_negloglik)* # $interpstep/($interpstep->norm_p(1))**2; # my $paramchange = $interpparams - $params; # my $dgdp_prod = $grad_chg->scalar_product($paramchange); # if ($dgdp_prod != 0) { # my $rho = 1/$dgdp_prod; # $inv_hessian = ($I - $rho*$paramchange*~$grad_chg)* # $inv_hessian*($I - $rho*$grad_chg*~$paramchange) + # $rho*$paramchange*~$paramchange; # } # } $trial_points->assign_row($k+2, ~$params); } # update conjugate directions my $new_conj_direction = $trial_points->row($numparams+1) - $trial_points->row(1); if ($new_conj_direction->norm_p(1) > 0) { Log("Updating conjugate directions."); for (my $k = 1; $k < $numparams; $k++) { $direction_set->assign_row($k, $direction_set->row($k+1)); } $direction_set->assign_row($numparams, $new_conj_direction); } else { Log("Skipping update of conjugate directions."); } # Now move along new conjugate direction ### # initial parameters evaluation already done $params = ~$trial_points->row($numparams+1); # right step parameters evaluation undef(@tmp_paramz); my($rightparams, $rightstep) = NewRightParams($iter_ct, $numparams-1, $params, $direction_set); $rightparams->each(sub { push @tmp_paramz, $_[0] }); $est_quotes = EstimateQuotes($all_trades, $expected_quotes_dir, $streamnbbo_dir, $iter_ct, @tmp_paramz); $glmm_dataset = CreateGLMMDataset($glmm_data_dir, $iter_ct, $all_trades, $est_quotes, $sectors, $tmp_paramz[$numparams-1]); my $right_negloglik; ($right_negloglik, @glmm_params) = FitGLMMR($iter_ct, $glmm_data_dir, $glmm_dataset); push @tmp_paramz, @glmm_params; $history[$iter_ct-1] = [$right_negloglik, @tmp_paramz]; Log("-"x30); $iter_ct++; # left step parameters evaluation undef(@tmp_paramz); my($leftparams, $leftstep) = NewLeftParams($iter_ct, $numparams-1, $params, $direction_set); $leftparams->each(sub { push @tmp_paramz, $_[0] }); $est_quotes = EstimateQuotes($all_trades, $expected_quotes_dir, $streamnbbo_dir, $iter_ct, @tmp_paramz); $glmm_dataset = CreateGLMMDataset($glmm_data_dir, $iter_ct, $all_trades, $est_quotes, $sectors, $tmp_paramz[$numparams-1]); my $left_negloglik; ($left_negloglik, @glmm_params) = FitGLMMR($iter_ct, $glmm_data_dir, $glmm_dataset); push @tmp_paramz, @glmm_params; $history[$iter_ct-1] = [$left_negloglik, @tmp_paramz]; Log("-"x30); $iter_ct++; # interpolated step parameters evaluation undef(@tmp_paramz); my($interpparams, $interpstep) = NewInterpolatedParams($iter_ct, $numparams-1, $base_negloglik, $left_negloglik, $right_negloglik, $params, $direction_set); $interpparams->each(sub { push @tmp_paramz, $_[0] }); $est_quotes = EstimateQuotes($all_trades, $expected_quotes_dir, $streamnbbo_dir, $iter_ct, @tmp_paramz); $glmm_dataset = CreateGLMMDataset($glmm_data_dir, $iter_ct, $all_trades, $est_quotes, $sectors, $tmp_paramz[$numparams-1]); my $interp_negloglik; ($interp_negloglik, @glmm_params) = FitGLMMR($iter_ct, $glmm_data_dir, $glmm_dataset); push @tmp_paramz, @glmm_params; $history[$iter_ct-1] = [$interp_negloglik, @tmp_paramz]; Log("-"x30); $iter_ct++; my $paramchange; # since last update of conjugate directions my $min_negloglik = min($base_negloglik, $left_negloglik, $right_negloglik, $interp_negloglik); if ($base_negloglik == $min_negloglik) { $base_negloglik = $interp_negloglik; $paramchange = ~$new_conj_direction; # $params is already set } elsif ($interp_negloglik = $min_negloglik) { $base_negloglik = $interp_negloglik; $params = $interpparams->clone(); $paramchange = ~$new_conj_direction + $interpstep; } elsif ($left_negloglik == $min_negloglik) { $base_negloglik = $left_negloglik; $params = $leftparams->clone(); $paramchange = ~$new_conj_direction + $leftstep; } elsif ($right_negloglik == $min_negloglik) { $base_negloglik = $right_negloglik; $params = $rightparams->clone(); $paramchange = ~$new_conj_direction + $rightstep; } # approximate gradient Log("Approximating gradient."); my(undef, $grad_apportion) = $inv_hessian->decompose_LR()->solve_LR($paramchange); if ($grad_apportion->norm_p(1) == 0 and $grad_apportion->norm_p("Inf") == 0) { $grad_apportion = $grad_apportion->shadow()->each(sub {(shift)+1}); } $old_grad = $grad->clone(); $grad = ($old_grad_negloglik - $min_negloglik)* $grad_apportion/$grad_apportion->norm_p(1); Log("Gradient $grad"); # do Broyden rank 2 update of inverse hessian Log("Updating inverse Hessian approximation."); my $grad_chg = $grad - $old_grad; my $rho = 1/($grad_chg->scalar_product($paramchange)); $inv_hessian = ($I - $rho*$paramchange*~$grad_chg)*$inv_hessian* ($I - $rho*$grad_chg*~$paramchange) + $rho*$paramchange*~$paramchange; $grad_norm = $grad->norm_p(2); Log("Inverse Hessian $inv_hessian"); } Log("-"x30); $iter_ct--; Log("Convergence achieved in $iter_ct iterations."); Log("Step negloglik N.nu N.mu N.eta3 N.eta4 A.nu ". " A.mu A.eta3 A.eta4 Q.nu Q.mu Q.eta3 ". " Q.eta4 tau (int) N.midpt N.tick N.emo ". " N.midpt- N.tick- N.emo- A.midpt A.tick A.emo ". " A.midpt- A.tick- A.emo- Q.midpt Q.tick Q.emo ". " Q.midpt- Q.tick- Q.emo- sigma^2.c sigma^2.d"); Log("---- --------- --------- ".join(" ", ("---------")x34)); my $i = 1; foreach my $hist_entry (@history) { my $outline = sprintf("%4d %4.3g ".join(" ", ("%0.7f")x34), $i, @$hist_entry); Log($outline); $i++; } Log("Inverse of Hessian:\n$inv_hessian"); close STDOUT; Log("Stopping $0"); exit(); sub Log { my($message) = @_; my($ss, $mm, $hh, $dd, $mo, $yr) = localtime(); my $timestamp = sprintf("%4d%02d%02d %02d:%02d:%02d", $yr+1900, $mo+1, $dd, $hh, $mm, $ss); print(STDERR "$timestamp: $message\n"); } sub LogLineStart { my($message) = @_; my($ss, $mm, $hh, $dd, $mo, $yr) = localtime(); my $timestamp = sprintf("%4d%02d%02d %02d:%02d:%02d", $yr+1900, $mo+1, $dd, $hh, $mm, $ss); print(STDERR "$timestamp: $message"); } sub LogLineAppend { my($message) = @_; print(STDERR "$message"); } sub LogLineEnd { print(STDERR "\n"); } sub LoadAllTrades { my($trades_dir, $universe) = @_; my $all_trades = {}; opendir(TRADESDIR, $trades_dir) or die "Cannot open trades directory '$trades_dir'!"; my @entries = readdir(TRADESDIR); closedir(TRADESDIR); chomp @entries; # remove trailing newlines ### ### ### FOR TESTIING: only load two days of trades ### ### my $num_days = 0; foreach my $entry (sort @entries) { ### ### HERE IS WHERE WE LIMIT TO JUST 2 DAYS FOR TESTING ### last if $num_days >= 2; if ($entry =~ /^ArcaTrade_\d{8}\.csv$/) { my($date) = ($entry =~ /^ArcaTrade_(\d{8})/); my $these_trades = GetTrades("$trades_dir/$entry", $universe, $date); $all_trades->{$date} = $these_trades; ### ### HERE IS INCREMENT FOR A LIMITED NUMBER OF TESTING DAYS ### $num_days++; } } return $all_trades; } # This gets all trades in our universe from a file # CRUCIAL ASSUMPTION: All trades in the file are from # one trade date, not multiple dates! sub GetTrades { my($trade_file, $universe, $id4logging) = @_; my $trade_list = {}; open(TRADEFILE, $trade_file) or die "Cannot read in trades!"; my $headerline = ; chomp $headerline; my(@tfieldnames) = split /,/, $headerline; my(@tradelines) = ; close TRADEFILE; chomp @tradelines; LogLineAppend("done."); LogLineEnd(); LogLineStart("Creating trades data structure ($id4logging)..."); my @resolution_queue = (); foreach my $tl (@tradelines) { my @tfields = split /,/, $tl; my($hh, $mm, $ss) = split /:/, $tfields[3]; my $t = []; # time, symbol, volume, price, liq provider buy/sell ## date, @$t = ($hh*60*60 + $mm*60 + $ss, @tfields[6, 7, 8, 10]); # 2, next if !exists $universe->{$t->[TRADE_SYMBOL()]}; if (@resolution_queue > 0 and $t->[TRADE_SYMBOL()] eq $resolution_queue[0]->[TRADE_SYMBOL()] and $t->[TRADE_TIME()] == $resolution_queue[0]->[TRADE_TIME()]) { push @resolution_queue, $t; next; } # now handle same-second trades my $ntrades = scalar @resolution_queue; my $i = 1; if (@resolution_queue > 0) { while (my $sst = shift(@resolution_queue)) { $sst->[TRADE_TIME()] += $i/($ntrades+1); push @{$trade_list->{$sst->[TRADE_SYMBOL()]}}, $sst; $i++; } } push @resolution_queue, $t; } # clean out the queue my $ntrades = scalar @resolution_queue; my $i = 1; while (my $sst = shift @resolution_queue) { $sst->[TRADE_TIME()] += $i/($ntrades+1); push @{$trade_list->{$sst->[TRADE_SYMBOL()]}}, $sst; $i++; } return $trade_list; } sub ReadInUniverseAndSectors { my($universe_filename) = @_; my $universe_map = {}; open(UNIVERSEFILE, $universe_filename) or die "Cannot read in universe (from '$universe_filename')!"; my $headerline = ; chomp $headerline; my(@universe_lines) = ; close UNIVERSEFILE; chomp @universe_lines; foreach my $ul (@universe_lines) { my($symbol, $sector, $industry) = split /,/, $ul; $universe_map->{$symbol} = $sector; } return $universe_map; } # Take partial step in a conjugate direction. (See Nocedal # and Wright, Ch. 9.) sub NewRightParams { my($iter_ct, $k, $params, $direction_set) = @_; LogLineStart("Determining right step parameters..."); my $step_dir = ~$direction_set->row($k+1); my $alpha = 0.8/$step_dir->norm_p(2); while (OutOfBounds($alpha, $step_dir, $params) or OutOfBounds(-$alpha, $step_dir, $params)) { # handle steps which would go outside the parameter space # in either direction (to make parabola fitting easier) $alpha *= 0.8; } $params += $alpha*$step_dir; LogLineAppend("done."); LogLineEnd(); my @parmz; $params->each(sub { push @parmz, $_[0] }); Log(sprintf("Param set #%d: (nu,mu,k~3,k~4)\n NYSE: (%f,%f,%f,%f),\n AMEX:". " (%f,%f,%f,%f),\n Nasdaq: (%f,%f,%f,%f), tau=%f", $iter_ct, @parmz)); return ($params, $alpha*$step_dir); } sub NewLeftParams { my($iter_ct, $k, $params, $direction_set) = @_; LogLineStart("Determining left step parameters..."); my $step_dir = ~$direction_set->row($k+1); my $alpha = 0.6/$step_dir->norm_p(2); while (OutOfBounds($alpha, $step_dir, $params) or OutOfBounds(-$alpha, $step_dir, $params)) { # handle steps which would go outside the parameter space # in either direction (to make parabola fitting easier) $alpha *= 0.8; } $params += -1*$alpha*$step_dir; LogLineAppend("done."); LogLineEnd(); my @parmz; $params->each(sub { push @parmz, $_[0] }); Log(sprintf("Param set #%d: (nu,mu,k~3,k~4)\n NYSE: (%f,%f,%f,%f),\n AMEX:". " (%f,%f,%f,%f),\n Nasdaq: (%f,%f,%f,%f), tau=%f", $iter_ct, @parmz)); return ($params, -1*$alpha*$step_dir); } sub NewInterpolatedParams { my($iter_ct, $k, $f_base, $f_left, $f_right, $params, $direction_set) = @_; LogLineStart("Determining interpolated parameters..."); my $step_dir = ~$direction_set->row($k+1); my $alpha = 0.6/$step_dir->norm_p(2); while (OutOfBounds($alpha, $step_dir, $params) or OutOfBounds(-$alpha, $step_dir, $params)) { # handle steps which would go outside the parameter space # in either direction (to make parabola fitting easier) $alpha *= 0.8; } my $denom = 2*($f_right + $f_left - 2*$f_base); my $scaling = 0; if ($denom > 0) { # function looks convex $scaling = ($f_left - $f_right)/$denom; } elsif ($denom < 0) { $scaling = ($f_left < $f_right) ? -1 : +1; } else { $scaling = 0; } # check we didn't go out of bounds while (OutOfBounds($scaling*$alpha, $step_dir, $params)) { # handle steps which would go outside the parameter space $alpha *= 0.8; } $params += $scaling*$alpha*$step_dir; LogLineAppend("done."); LogLineEnd(); my @parmz; $params->each(sub { push @parmz, $_[0] }); Log(sprintf("Param set #%d: (nu,mu,k~3,k~4)\n NYSE: (%f,%f,%f,%f),\n AMEX:". " (%f,%f,%f,%f),\n Nasdaq: (%f,%f,%f,%f), tau=%f", $iter_ct, @parmz)); return ($params, $scaling*$alpha*$step_dir); } sub OutOfBounds { my($alpha, $step_dir, $params) = @_; my $proposed_params = $alpha*$step_dir + $params; my $problems = 0; my $nu_nyse = $proposed_params->element(1, 1); my $mu_nyse = $proposed_params->element(2, 1); my $nu_amex = $proposed_params->element(5, 1); my $mu_amex = $proposed_params->element(6, 1); my $nu_nasdaq = $proposed_params->element(9, 1); my $mu_nasdaq = $proposed_params->element(10, 1); my $bidask_nbhd = $proposed_params->element(13, 1); $problems++ if $nu_nyse <= 1e-2; $problems++ if $mu_nyse <= 1e-2; $problems++ if $nu_amex <= 1e-2; $problems++ if $mu_amex <= 1e-2; $problems++ if $nu_nasdaq <= 1e-2; $problems++ if $mu_nasdaq <= 1e-2; $problems++ if $bidask_nbhd <= 1e-4; #1bp spread $problems++ if $bidask_nbhd >= 0.05; #5% spread # Constrain mean and std dev implied by delay distribution to # be reasonable -- less than 10 seconds each # This is to fit a version which doesn't use short-term alpha if ($mu_nyse == 0) { $problems++; } else { $problems++ if $nu_nyse/$mu_nyse > 5; $problems++ if $nu_nyse > 0 and sqrt($nu_nyse)/$mu_nyse > 5; } if ($mu_amex == 0) { $problems++; } else { $problems++ if $nu_amex/$mu_amex > 5; $problems++ if $nu_amex > 0 and sqrt($nu_amex)/$mu_amex > 5; } if ($mu_nasdaq == 0) { $problems++; } else { $problems++ if $nu_nasdaq/$mu_nasdaq > 5; $problems++ if $nu_nasdaq > 0 and sqrt($nu_nasdaq)/$mu_nasdaq > 5; } # conservative estimates of lost probability mass due to # truncation at 90 seconds; not bad for nu values of 2 or less # (which is most of what I am dealing with) my $estimated_lost_prob_nyse = 1-(1-exp(-90*$mu_nyse))**$nu_nyse; my $estimated_lost_prob_amex = 1-(1-exp(-90*$mu_amex))**$nu_amex; my $estimated_lost_prob_nasdaq = 1-(1-exp(-90*$mu_nasdaq))**$nu_nasdaq; $problems++ if $estimated_lost_prob_nyse >= 0.01; $problems++ if $estimated_lost_prob_amex >= 0.01; $problems++ if $estimated_lost_prob_nasdaq >= 0.01; return $problems; } sub EstimateQuotes { my($all_trades, $estimated_quotes_dir, $streamnbbo_dir, $iter_ct, @delay_parameters) = @_; my $estimated_bidasks = {}; LogLineStart("Estimating prevailing bids and asks..."); my($nu_nyse, $mu_nyse, $eta3_nyse, $eta4_nyse, $nu_amex, $mu_amex, $eta3_amex, $eta4_amex, $nu_nasdaq, $mu_nasdaq, $eta3_nasdaq, $eta4_nasdaq) = @delay_parameters; if ($iter_ct >= SKIP_TO()) { opendir(STREAMLINEDQUOTESDIR, $streamnbbo_dir) or die "Cannot open directory of streamlined quote files!"; my @streamnbbofiles = readdir(STREAMLINEDQUOTESDIR); closedir(STREAMLINEDQUOTESDIR); chomp @streamnbbofiles; # use fork() to distribute (across multiple CPUs on one machine) # the computation of estimates of the prevailing bids and asks. # technically, the call to system() invokes a second fork; but, # I need to do some setup and teardown for the system call. my $parent_pid = $$; my @problemdates = (); foreach my $streamfile (sort @streamnbbofiles) { next if $streamfile !~ /^quotevectors.\d{8}.csv$/; my($date) = ($streamfile =~ /^quotevectors.(\d{8}).csv$/); if (!exists $all_trades->{$date}) { LogLineAppend(" x${date}x"); next; } LogLineAppend(" $date"); my $slot = WaitForProcessingSlot(); ReserveProcessingSlot($slot); my $process_id = fork(); if ($process_id == 0) { # this is the spawned child my $calculate_command = sprintf("/home2/dale/calc_exp_bidask %f %f %f %f %f %f %f %f". " %f %f %f %f %s 2> %s > %s", $nu_nyse, $mu_nyse, $eta3_nyse, $eta4_nyse, $nu_amex, $mu_amex, $eta3_amex, $eta4_amex, $nu_nasdaq, $mu_nasdaq, $eta3_nasdaq, $eta4_nasdaq, "$streamnbbo_dir/$streamfile", "$estimated_quotes_dir/log.$parent_pid.$iter_ct.$date.log", "$estimated_quotes_dir/estbidask.$iter_ct.$date.csv"); my $returncode = system($calculate_command); $returncode &= 0xffff; # get rid of upper bits; not needed by us push @problemdates, [$date, $returncode] if $returncode == 0; FreeProcessingSlot($slot); exit 0; } } WaitForAllProcessingSlots(); LogLineAppend("processing done."); LogLineEnd(); if (@problemdates > 0) { LogLineStart("Problems:"); foreach my $prob (@problemdates) { # give date and problematic return code LogLineAppend(" $prob->[0] ($prob->[1])"); } LogLineEnd(); } # Read in the output files and store the estimated quotes LogLineStart("Reading in estimated bids and asks..."); opendir(ESTIMATEDQUOTESDIR, $estimated_quotes_dir) or die "Cannot open directory of estimated quote files!"; my @estbidaskfiles = readdir(ESTIMATEDQUOTESDIR); closedir(ESTIMATEDQUOTESDIR); chomp @estbidaskfiles; foreach my $eba_file (@estbidaskfiles) { next if $eba_file !~ /^estbidask.$iter_ct.\d{8}.csv$/; open(ESTBIDASKFILE, "$estimated_quotes_dir/$eba_file") or die "Could not open estimated bid ask file '$eba_file'!"; my @eba_lines = ; close ESTBIDASKFILE; chomp @eba_lines; foreach my $line (@eba_lines) { my($sym, $dt, $tsec, $estbid, $estask) = split /,/, $line; $estbid = undef if $estbid eq "nan"; $estask = undef if $estask eq "nan"; # since we made time in seconds unique per symbol and date # there should be no hash clashes $estimated_bidasks->{$sym}{$dt}{$tsec} = [$estbid, $estask]; } } } LogLineAppend("done."); LogLineEnd(); return $estimated_bidasks; } # create GLMM dataset by merging trades with estimates of # prevailing quotes and EMO-ish test values. # EMO test would yield +1 for trades at prevailing ask # and -1 for trades at prevailing bid. # EMO-ish test gives scaled gaussian to catch trades near # the estimated prevailing bid and ask. Actual value of # EMO-ish test is: # exp(-[(trdprc - estask)/tau]^2) - exp(-[(trdprc - estbid)/tau]^2) sub CreateGLMMDataset { my($dir, $iter, $trades, $est_quotes, $quotes, $tau) = @_; my $logmsg = sprintf("Creating GLMM dataset #%d...", $iter_ct); LogLineStart($logmsg); # The buy_sell field is for the *liquidity providing* # (LP) trade, not for the "initiating" trade. # ('Initiator' as used by Odders-White.) my %lpside_map = (B => 0, # LP buy => Init. sell C => undef, # cross trade X => 1, # LP sell short => Init. buy S => 1); # LP sell long => Init. buy my %sectormnem_map = ("Basic Materials" => "Mat", "Capital Goods" => "CapGd", "Conglomerates" => "Congl", "Consumer Cyclical" => "Cycl", "Consumer Non-Cyclical" => "NCyc", "Energy" => "Enrg", "Financial" => "Finc", "Healthcare" => "Hlth", "Industrial Goods" => "IndGd", "Services" => "Serv", "Technology" => "Tech", "Transportation" => "Trans", "Utilities" => "Util", "" => "NA"); my $missing = "."; my $glmm_datafile = "$dir/glmmdataset.$iter.csv"; if ($iter >= SKIP_TO()) { open(DATAFILE, ">$glmm_datafile") or die "Cannot open file for GLMM dataset, iteration $iter!"; foreach my $dt (sort {$a <=> $b} keys %$trades) { foreach my $sym (sort keys %{$trades->{$dt}}) { my $market = PrimaryExchange($sym); my $sector = $sectormnem_map{$sectors->{$sym}}; # crucial assumption for the trades loop: trades # are time-ordered with earlier trades first my($prev_t, $prev_nztick); # previous trade, non-zero tick # previous classification metric (nee tests) my($prev_midpt_metric, $prev_tick_metric, $prev_emoish_metric); TRADE: foreach my $t (@{$trades->{$dt}{$sym}}) { $prev_nztick = $prev_t if $t->[TRADE_PRICE()] != $prev_t->[TRADE_PRICE()]; my $tick_metric = ($prev_nztick->[TRADE_PRICE()] > 0) ? log($t->[TRADE_PRICE()]) - log($prev_nztick->[TRADE_PRICE()]) : 0; my $tsec = sprintf("%0.7f", $t->[TRADE_TIME()]); if (!exists $est_quotes->{$sym} or !exists $est_quotes->{$sym}{$dt} or !exists $est_quotes->{$sym}{$dt}{$tsec}) { # big trouble in little china; squawk and skip Log("Failed to match trade ($sym, $dt, $tsec)". " and estimated quotes!"); $prev_midpt_metric = 0; $prev_tick_metric = 0; $prev_emoish_metric = 0; $prev_t = $t; next TRADE; } # allocate to a (ten-) five-minute bin my $binminutes = 10; my $bin = int(($tsec - 9*60*60 - 30*60)/($binminutes*60)); next TRADE if $bin < (30/$binminutes); # before 10:00am next TRADE if $bin >= 6*60/$binminutes; # on/after 3:30pm my $dtbin = $dt.":".$bin; my($estbid, $estask) = @{$est_quotes->{$sym}{$dt}{$tsec}}; my $estmid = ($estbid + $estask)/2; my $midpt_metric = ($estmid > 0 and defined $estbid and defined $estask) ? log($t->[TRADE_PRICE()]) - log($estmid) : 0; my $emoask_metric = (!defined $estask or $estask <= 0) ? 0 : exp(-((log($t->[TRADE_PRICE()]) - log($estask))/$tau)**2); my $emobid_metric = (!defined $estbid or $estbid <= 0) ? 0 : exp(-((log($t->[TRADE_PRICE()]) - log($estbid))/$tau)**2); my $emoish_metric = ($emoask_metric - $emobid_metric); my $bs = $lpside_map{$t->[TRADE_LPSIDE()]} if exists $lpside_map{$t->[TRADE_LPSIDE()]}; my $outline = join(",", $sym || $missing, $dt || $missing, $market || $missing, $sector || $missing, $tsec || $missing, $dtbin || $missing, $t->[TRADE_VOLUME()] || $missing, $t->[TRADE_PRICE()] || $missing, $bs, sprintf("%0.17f", $midpt_metric), sprintf("%0.17f", $tick_metric), sprintf("%0.17f", $emoish_metric), sprintf("%0.17f", $prev_midpt_metric), sprintf("%0.17f", $prev_tick_metric), sprintf("%0.17f", $prev_emoish_metric)); print(DATAFILE "$outline\n"); $prev_midpt_metric = $midpt_metric; $prev_tick_metric = $tick_metric; $prev_emoish_metric = $emoish_metric; $prev_t = $t; } } } close DATAFILE; } LogLineAppend("done."); LogLineEnd(); return $glmm_datafile; } sub FitGLMMStata { my($iter_ct, $datafile) = @_; my $stata_cmd = "/usr/local/stata/stata"; my $stata_dir = "/home2/dale/stata"; # set up call and stata code, redirect to file, # then read in file and return results #insheet str7 symbol date str1 market str5 sector timesecs str11 bin /// # volume trdprice isbuy midpttest ticktest emotest prevmidpttest /// # prevticktest prevemotest using %s, comma open(DOFILE, ">$stata_dir/glmmfit.$$.$iter_ct.do") or die "Cannot open stata command file, iter $iter_ct!"; printf(DOFILE <<'SPLUNGE', $datafile); set memory 2g insheet symbol date market sector timesecs bin volume trdprice /// isbuy midpttest ticktest emotest prevmidpttest prevticktest /// prevemotest using %s, comma xi: xtmelogit isbuy i.market*midpttest i.market*ticktest i.market*emotest /// i.market*prevmidpttest i.market*prevticktest i.market*prevemotest /// || bin:R.sector, intpoints(7) display e(ll) display e(ll_c) matrix list e(b) matrix list e(V) exit, clear SPLUNGE close(DOFILE); my $command2run = "$stata_cmd -b do $stata_dir/glmmfit.$$.$iter_ct.do"; my $returncode = system($command2run); $returncode &= 0xffff; # get rid of upper bits; not needed by us Log("Error in running stata: $returncode!") if $returncode == 0; open(OUTFILE, "$stata_dir/glmfit.$$.$iter_ct.log") or die "Cannot open stata log file!"; my @loglines = ; close OUTFILE; chomp @loglines; my $i = 0; while ($loglines[$i++] !~ /^\. display e\(ll\)/) { } my $negloglik = -$loglines[$i++]; while ($loglines[$i++] !~ /^\. matrix list e\(b\)/) { } $i++; # skip blank line after matrix list command $i++; # skip "e(b)" my(@namez, @fieldz); while ($loglines[$i] !~ /\./) { my @tmp_namez = split /\s+/, $loglines[$i++]; shift @tmp_namez; push @namez, @tmp_namez; my @tmp_fieldz = split /\s+/, $loglines[$i++]; shift @tmp_fieldz; # remove the leading "y1" push @fieldz, @tmp_fieldz; $i++; # skip blank line } my %param_hash; @param_hash{@namez} = @fieldz; my @field_order = qw(); my @glmm_params = @param_hash{@field_order}; return ($negloglik, @glmm_params); } sub FitGLMMR { my($iter_ct, $r_dir, $datafile) = @_; Log("Fitting GLMM for dataset #$iter_ct"); # use R with threaded GotoBLAS my $r_cmd = "/usr/local/R_new/bin/R"; # not /usr/local/bin/R if ($iter_ct >= SKIP_TO()) { # set up call and R code, redirect to file, # then read in file and return results # NOTE: PQL is problematic for models which have total # or near total separation: i.e. if fitted values are # effectivelyy 0 or 1 for many/all observations open(CMDFILE, ">$r_dir/glmmfit.$iter_ct.r") or die "Cannot open R command file, iter $iter_ct!"; printf(CMDFILE <<'SPLUNGE', $datafile); trds <- read.csv("%s", header=FALSE) names(trds) <- c("symbol", "date", "market", "sector", "timesec", "bin", "shares", "tradeprice", "isbuy", "midpt", "tick", "emo", "prevmidpt", "prevtick", "prevemo") mktA <- rep(0, length(trds$market)) mktN <- rep(0, length(trds$market)) mktT <- rep(0, length(trds$market)) mktA[trds$market == "A"] <- 1 mktN[trds$market == "N"] <- 1 mktT[trds$market == "T"] <- 1 library(lme4) g1 <- lmer(isbuy ~ mktA:(emo+prevemo) + mktN:(midpt+tick+emo+prevemo) + mktT:(midpt+tick+emo+prevemo) + (1 | bin/sector), method="PQL", # "Laplace" data = trds, family=binomial) #quasibinomial summary(g1) vcov(g1) logLik(g1) quit(save="no") SPLUNGE close(CMDFILE); # There's more than one way to skin a cat. Unfortunately, # some of these ways can suck. # # The lmer() function, when fit via PQL, doesn't note that # the warnings about separability ([some] fitted values are # numerically 0 or 1) are SERIOUS. The random effects are # often then returned with std devs of 5e-10 -- which is # garbage. If you're not familiar with PQL's problems with # separable models, this is your only clue that something # is wrong. Cubs! Woo! Ernie! Woo! Banks! Woo! lme4! Woo? # Hey, at least this CAN work -- unlike the other methods. # And the Laplace method seems correct -- albeit S-L-O-W. # #library(lme4) #g1 <- lmer(isbuy ~ mktA:(tick+emo+prevemo) + # mktN:(midpt+tick+emo+prevemo+prevmidpt) + # mktT:(midpt+tick+emo+prevemo+prevmidpt) + # (1 | bin/sector), method="$method", # data = trds, family=binomial) #quasibinomial # The PQL code from the ubiquitous MASS library is more obvious # in its failure. However, when it works, it still does not # return even an approximate log-likelihood. This makes model # selection impossible -- and renders this code little more than # a curiosity for those unwilling to develop their own measures # of model adequacy. I doubt Ronnie Woo-Woo would be a fan of # this at all. # #library(MASS) #g1 <- glmmPQL(isbuy ~ mktA:(emo+prevemo) + # mktN:(midpt+tick+emo+prevemo) + # mktT:(midpt+tick+emo+prevemo), # random = ~ 1 | bin/sector, # data = trds, family=binomial) #quasibinomial # Most appealing would be to just do an unbalanced analysis of # deviance. Searle, Casella, and McCulloch (1992) has a whole # chapter on just this; and, it would be perfect. Except that # the way glm() is implemented, it is a memory pig and uses # low-level BLAS calls. This means linking in to a threaded # linear algebra library does NOT speed it up. It also means # that you may be able to fit the model and then unable to # create an ANODE table. So this is a total no-go. No chance # of winning. Not even the Cubs are that hopeless. # #gc() #g1 <- glm(isbuy ~ bin/sector + # mktA:(tick+emo+prevemo) + # mktN:(midpt+tick+emo+prevemo+prevmidpt) + # mktT:(midpt+tick+emo+prevemo+prevmidpt), # model=FALSE, data = trds, family=binomial) #quasibinomial #gc() # #anova.glm(g1) my $command2run = "$r_cmd CMD BATCH $r_dir/glmmfit.$iter_ct.r $r_dir/glmmfit.$iter_ct.out"; my $returncode = system($command2run); $returncode &= 0xffff; # only examine last 32 bits $returncode >>= 8; # shift right 8 bits Log("Error in running R: $returncode!") if $returncode == 0; } unlink($datafile); open(OUTFILE, "$r_dir/glmmfit.$iter_ct.out") or die "Cannot open R log file!"; my @loglines = ; close OUTFILE; chomp @loglines; my $i = 0; while ($loglines[$i++] !~ /logLik/) { } $loglines[$i] =~ s/^\s+//; my(@tmp_fieldz); push @tmp_fieldz, split(/\s+/, $loglines[$i++]); my $negloglik = -$tmp_fieldz[2]; my(@fieldz); # collect random effects estimates while ($loglines[$i++] !~ /Random effects:/) { } $i++; # skip header line before random effects in summary() command while ($loglines[$i] !~ /number of obs:/) { $loglines[$i] =~ s/^\s+//; my @tmp_fieldz = split /\s+/, $loglines[$i++]; push @fieldz, $tmp_fieldz[2]; } # collect fixed effects estimates while ($loglines[$i++] !~ /Fixed effects:/) { } $i++; # skip header line after fixed effects in summary() command while ($loglines[$i] !~ /^---/ and $loglines[$i] !~ /Correlation:/ and $loglines[$i] !~ /Warning/) { $loglines[$i] =~ s/^\s+//; my @tmp_fieldz = split /\s+/, $loglines[$i++]; push @fieldz, $tmp_fieldz[1]; } # we don't actually collect the covariance matrix -- # although I do use the final version in my analysis # my @field_order = (2, 6..11, 3..5, 12..17, 0, 1); my @field_order = (3, 7..11,0, 4..6,0,0,0, 12..16,0, 1, 2); my @glmm_params = (0)x21; @glmm_params = @fieldz[@field_order]; Log("Neg. Log-lik: $negloglik"); # penalize negative log-likelihood to eliminate identifiability # problems of midpoint test versus EMO-ish test with \tau \to \infty # $negloglik += 10*($paramarray[12]/0.001)**2; # Log("Penalized Neg. Log-lik: $negloglik"); Log("GLMM Params: ".join(", ", @glmm_params)); Log("Done with GLMM fitting for dataset #$iter_ct"); return ($negloglik, @glmm_params); } sub RememberParams { my($params) = @_; my $param_list = []; $params->each(sub { push @$param_list, $_[0] }); push @::tried_params, $param_list; } sub TriedPreviously { my($params) = @_; my $times_tried = 0; PARAM_SET: foreach my $param_list (@::tried_params) { for (my $i = 0; $i < @$param_list; $i++) { next PARAM_SET if $param_list->[$i] != $params->element($i+1, 1); } $times_tried++; } return $times_tried; } # convenience/abstraction functions # sub WaitForProcessingSlot { my $num_slots = 8; # one estimation process per CPU (could do more) my $slot2use; # I hate that this might never terminate -- even though the # style is classic old-school C. Could add a $max_iter variable # but... meh, who cares? Then I'd have to error check the slot # code I return and handle that when I'd probably just kill the # script... which is what I would do for a runaway loop as well. # So the answer is: moot. SLOT_WAIT: while (1) { foreach my $i (1..$num_slots) { # exit while loop if a slot is available if (ProcessingSlotAvailable($i)) { $slot2use = $i; last SLOT_WAIT; } } sleep(10); # wait 10 seconds between checks } return $slot2use; } sub WaitForAllProcessingSlots { my $num_slots = 8; # one estimation process per CPU (could do more) while (1) { my $all_slots_open = 1; foreach my $i (1..$num_slots) { $all_slots_open &= ProcessingSlotAvailable($i) } last if $all_slots_open; sleep(10); # wait 10 seconds between checks } } sub ProcessingSlotAvailable { my($slotnumber) = @_; return !-e ".slot.$slotnumber"; } sub ReserveProcessingSlot { my($slotnumber) = @_; system("touch .slot.$slotnumber"); } sub FreeProcessingSlot { my($slotnumber) = @_; unlink(".slot.$slotnumber"); } sub Seconds2Time { my($seconds) = @_; my $hh = int($seconds / (60*60)); $seconds -= $hh*60*60; my $mm = int($seconds / 60); $seconds -= $mm*60; return sprintf("$hh:%02d:%02.4f", $mm, $seconds); } sub PrimaryExchange { my($symbol) = @_; my @amex_names = qw(AFP AVD AVN AX BCP BHL BIO BL BMI CAC CAS CHC COI CPD CTO CUB DAR DFC DHB DHC END FIZ GRC GSX GTE GW HH HT HTC IMA IVX KFX LB LGN LNG MIX MLP MSS MWP NBY NHC NHR NVR OHB OMR PDC PGC PRK PRZ PSB RIV SEB SJW STB TBV TDS TIV TKO TMP TPY TWW USM WFD WSC); my @nyse_multiclass = qw(BF.A BF.B CRD.A CRD.B FCE.A FCE.B FSL FSL.B JW.A JW.B KV.A KV.B MOG.A MOG.B NWS NWSA VIA VIA.B SQA.A SQA.B TRX TRX.B TRY TRY.B); my $primary_exchange; if (grep /^$symbol$/, @amex_names) { $primary_exchange = "A"; } elsif (grep /^$symbol$/, @nyse_multiclass) { $primary_exchange = "N"; } elsif (length($symbol) >= 4) { $primary_exchange = "T"; } else { # only non-AMEX 1, 2, and 3-letter symbols left $primary_exchange = "N"; } return $primary_exchange; } sub min { return undef if @_ == 0; my $i = 0; my $min = $_[$i++]; while ($i < @_) { $min = $_[$i] if $_[$i] < $min; $i++; } return $min; } sub TRADE_DATE { undef } # field 2 in file sub TRADE_TIME { 0 } # field 3 in file sub TRADE_SYMBOL { 1 } # field 6 in file sub TRADE_VOLUME { 2 } # field 7 in file sub TRADE_PRICE { 3 } # field 8 in file sub TRADE_LPSIDE { 4 } # field 10 in file # INSANELY useful for debugging. Let's you skip ahead to a certain # iteration without recreating data files and redoing fitting. Can # also recover from a crashed process -- so long as output files # are still in place. sub SKIP_TO { 172 } # can use to skip initial fit and perturbances: # 3x4 delay params, 1 EMO-ish param