Revision 38e3e36f2162e2840d25d86d66b2685fcbffb936 authored by Benjamin R. Hillman on 13 January 2022, 19:12:45 UTC, committed by Benjamin R. Hillman on 13 January 2022, 19:12:45 UTC
1 parent 71c1ae5
Raw File
syslog.cori-haswell
#!/bin/csh -f
# cori-haswell syslog script: 
#  mach_syslog <sampling interval (in seconds)> <job identifier> <timestamp> <run directory> <timing directory> <output directory> 

set sample_interval = $1
set jid = $2
set lid = $3
set run = $4
set timing = $5
set dir = $6

# Wait until job task-to-node mapping information is output before saving output file.
# Target length was determined empirically (maximum number of lines before job mapping 
#  information starts + number of nodes), and it may need to be adjusted in the future.
# (Note that calling script 'touch'es the e3sm log file before spawning this script, so that 'wc' does not fail.)
set nnodes = `scontrol show jobid $jid | grep -F NumNodes | sed 's/ *NumNodes=\([0-9]*\) .*/\1/' `
@ target_lines = 150 + $nnodes
sleep 10
set outlth = `wc \-l $run/e3sm.log.$lid | sed 's/ *\([0-9]*\) *.*/\1/' `
while ($outlth < $target_lines)
  sleep 60
  set outlth = `wc \-l $run/e3sm.log.$lid | sed 's/ *\([0-9]*\) *.*/\1/' `
end

set TimeLimit   = `scontrol show jobid $jid | grep -F TimeLimit | sed 's/^ *RunTime=.*TimeLimit=\([0-9]*:[0-9]*:[0-9]*\) .*/\1/' `
set limit_hours = `echo $TimeLimit | sed 's/^0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\1/' `
set limit_mins  = `echo $TimeLimit | sed 's/^0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\2/' `
set limit_secs  = `echo $TimeLimit | sed 's/^0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\3/' `
if ("X$limit_hours" == "X") set limit_hours = 0
if ("X$limit_mins" == "X")  set limit_mins  = 0
if ("X$limit_secs" == "X")  set limit_secs  = 0
@ limit = 3600 * $limit_hours + 60 * $limit_mins + $limit_secs

set RunTime    = `scontrol show jobid $jid | grep -F RunTime | sed 's/^ *RunTime=\([0-9]*:[0-9]*:[0-9]*\) .*/\1/' `
set runt_hours = `echo $RunTime | sed 's/^0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\1/' `
set runt_mins  = `echo $RunTime | sed 's/^0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\2/' `
set runt_secs  = `echo $RunTime | sed 's/^0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\3/' `
if ("X$runt_hours" == "X") set runt_hours = 0
if ("X$runt_mins" == "X")  set runt_mins  = 0
if ("X$runt_secs" == "X")  set runt_secs  = 0
@ runt = 3600 * $runt_hours + 60 * $runt_mins + $runt_secs

@ remaining = $limit - $runt
cat > $run/Walltime.Remaining <<EOF1
$remaining $sample_interval
EOF1
/bin/cp --preserve=timestamps $run/e3sm.log.$lid $dir/e3sm.log.$lid.$remaining
if ($remaining <= 0) then
  squeue -t R -o  "%.10i %.15P %.20j %.10u %.7a %.2t %.6D %.8C %.10M %.10l" > $dir/squeuef.$lid.$remaining
  squeue -s | grep -v -F extern > $dir/squeues.$lid.$remaining
  # squeue -t R -o  "%.10i %R" > $dir/squeueR.$lid.$remaining
endif

while ($remaining > 0)
  echo "Wallclock time remaining: $remaining" >> $dir/atm.log.$lid.step
  grep -Fa -e "nstep" -e "model date" $run/*atm.log.$lid | tail -n 4 >> $dir/atm.log.$lid.step
  echo "Wallclock time remaining: $remaining" >> $dir/lnd.log.$lid.step
  grep -Fa -e "timestep" -e "model date" $run/*lnd.log.$lid | tail -n 4 >> $dir/lnd.log.$lid.step
  echo "Wallclock time remaining: $remaining" >> $dir/ocn.log.$lid.step
  grep -Fa -e "timestep" -e "Step number" -e "model date" $run/*ocn.log.$lid | tail -n 4 >> $dir/ocn.log.$lid.step
  echo "Wallclock time remaining: $remaining" >> $dir/ice.log.$lid.step
  grep -Fa -e "timestep" -e "istep" -e "model date" $run/*ice.log.$lid | tail -n 4 >> $dir/ice.log.$lid.step
  echo "Wallclock time remaining: $remaining" >> $dir/rof.log.$lid.step
  grep -Fa "model date" $run/*rof.log.$lid | tail -n 4 >> $dir/rof.log.$lid.step
  grep -Fa "model date" $run/*cpl.log.$lid  > $dir/cpl.log.$lid.step-all
  echo "Wallclock time remaining: $remaining" >> $dir/cpl.log.$lid.step
  tail -n 4 $dir/cpl.log.$lid.step-all >> $dir/cpl.log.$lid.step
  /bin/cp --preserve=timestamps -u $timing/* $dir
  # sqs -w -a | grep "^[0-9]* *R *"> $dir/sqswr.$lid.$remaining
  squeue -t R -o  "%.10i %.15P %.20j %.10u %.7a %.2t %.6D %.8C %.10M %.10l" > $dir/squeuef.$lid.$remaining
  squeue -s | grep -v -F extern > $dir/squeues.$lid.$remaining
  # squeue -t R -o  "%.10i %R" > $dir/squeueR.$lid.$remaining
  chmod a+r $dir/*
  # sleep $sample_interval
  set sleep_remaining = $sample_interval
  while ($sleep_remaining > 120)
   sleep 120
   @ sleep_remaining = $sleep_remaining - 120
  end
  sleep $sleep_remaining
  set RunTime    = `scontrol show jobid $jid | grep -F RunTime | sed 's/^ *RunTime=\([0-9]*:[0-9]*:[0-9]*\) .*/\1/' `
  set runt_hours = `echo $RunTime | sed 's/^0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\1/' `
  set runt_mins  = `echo $RunTime | sed 's/^0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\2/' `
  set runt_secs  = `echo $RunTime | sed 's/^0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\3/' `
  if ("X$runt_hours" == "X") set runt_hours = 0
  if ("X$runt_mins" == "X")  set runt_mins  = 0
  if ("X$runt_secs" == "X")  set runt_secs  = 0
  @ runt = 3600 * $runt_hours + 60 * $runt_mins + $runt_secs
  @ remaining = $limit - $runt
  cat > $run/Walltime.Remaining << EOF2
$remaining $sample_interval
EOF2

end
back to top