Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
RC Data Science
DMTCP Checkpointing Tutorial
Commits
f9a9edf5
Commit
f9a9edf5
authored
Mar 01, 2021
by
William Ellis Warriner
Browse files
initial commit
parent
ce0f98aa
Changes
4
Hide whitespace changes
Inline
Side-by-side
env.yml
0 → 100644
View file @
f9a9edf5
name
:
dmtcp-tutorial
channels
:
-
defaults
dependencies
:
-
python=3.7.9
-
mpi4py=3.0.3
job.sh
0 → 100644
View file @
f9a9edf5
#!/bin/bash
#SBATCH --job-name=job
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=1
#SBATCH --mem-per-cpu=4G
#SBATCH --partition=express
#SBATCH --time=0:10:00
#SBATCH --output=%x.log
#SBATCH --error=%x.log
module load DMTCP/2.5.0
module load Anaconda3
conda activate dmtcp-tutorial
dmtcp_launch
-i
5
"python -u task.py 100"
# -u means unbuffered, we get logging in real-time
restart_job.sh
0 → 100644
View file @
f9a9edf5
#!/bin/bash
#SBATCH --job-name=job-restart
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=1
#SBATCH --mem-per-cpu=4G
#SBATCH --partition=express
#SBATCH --time=0:10:00
#SBATCH --output=%x.log
#SBATCH --error=%x.log
export
DMTCP_COORD_HOST
=
localhost
module load DMTCP/2.5.0
module load Anaconda3
conda activate dmtcp-tutorial
./dmtcp_restart_script.sh
task.py
0 → 100644
View file @
f9a9edf5
from
typing
import
Dict
import
argparse
import
random
import
time
from
mpi4py
import
MPI
import
sys
VALUE
=
"value"
WAIT
=
"wait"
def
print_step
(
current_step
:
int
,
data
:
Dict
[
str
,
float
]):
print
(
f
"Step
{
current_step
+
1
:
>
4
}
:
{
data
[
VALUE
]:
>
6.2
f
}
(
{
data
[
WAIT
]:
>
6.2
f
}
s)"
)
sys
.
stdout
.
flush
()
def
step
()
->
Dict
[
str
,
float
]:
value
=
random
.
lognormvariate
(
mu
=
0.0
,
sigma
=
1.0
)
wait
=
abs
(
random
.
triangular
(
low
=
0.0
,
high
=
1
,
mode
=
0.5
))
time
.
sleep
(
wait
)
return
{
VALUE
:
value
,
WAIT
:
wait
}
def
task
(
step_count
:
int
)
->
None
:
for
current_step
in
range
(
step_count
):
data
=
step
()
print_step
(
current_step
,
data
)
def
task_mpi
(
step_count
:
int
)
->
None
:
# TODO import trace
comm
=
MPI
.
COMM_WORLD
rank
=
comm
.
Get_rank
()
size
=
comm
.
Get_size
()
print_rank
=
size
compute_size
=
size
-
1
DATA_TAG
=
1
for
current_step
in
range
(
step_count
):
print
(
f
"rank:
{
rank
}
, step:
{
current_step
}
"
)
if
rank
==
print_rank
:
print
(
f
"rank:
{
rank
}
, receiving..."
)
req
=
comm
.
irecv
(
source
=
comm
.
MPI_ANY_SOURCE
,
tag
=
DATA_TAG
)
print
(
f
"rank:
{
rank
}
, waiting for recv..."
)
data
=
req
.
wait
()
print
(
f
"rank:
{
rank
}
, received!"
)
print_step
(
**
data
)
else
:
if
current_step
%
compute_size
!=
rank
:
continue
print
(
f
"rank:
{
rank
}
, computing..."
)
data
=
step
()
print
(
f
"rank:
{
rank
}
, sending..."
)
req
=
comm
.
isend
(
{
"current_step"
:
current_step
,
"data"
:
data
},
dest
=
0
,
tag
=
DATA_TAG
)
print
(
f
"rank:
{
rank
}
, waiting for send..."
)
req
.
wait
()
print
(
f
"rank:
{
rank
}
, sent!"
)
def
interface
()
->
None
:
parser
=
argparse
.
ArgumentParser
(
description
=
"Run steps."
)
parser
.
add_argument
(
"steps"
,
metavar
=
"N"
,
nargs
=
1
,
type
=
int
,
help
=
"number of steps to run"
)
parser
.
add_argument
(
"--mpi"
,
metavar
=
"m"
,
action
=
"store_true"
,
help
=
"use mpi?"
)
args
=
parser
.
parse_args
()
task
(
args
.
steps
[
0
])
if
__name__
==
"__main__"
:
interface
()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment