UCT High Performance Computing Facility
Home        About us       Documentation        Support and Capabilities       Policies        Apply for account

Sun Apr 20 10:34:01 SAST 2025

Users logged in: mshjoh017
Head Node load: 10.28 22.41 20.17 (22.93)     Head Node RAM free: 94%
Currently computing: 0 hours     Jobs running: 0     Jobs queued: 0
Efficiency: 0%    System overview    Summary    Queue accounting    Graphs     Issues

JOBS RUNNING Sorted by priority
#    JOBID      PRTTN       NAME                USER         ACCOUNT        STATE     Q\R-TIME  CORES NODES  NODES\REASON           QOS     PRIORITY    CPUTIME
-----------------------------------------------------------------------------------------------------------------------------------------------------------------

CLUSTER STATUS
PARTITION AVAIL  TIMELIMIT  NODES  STATE NODELIST
 ada*         up 7-02:00:00      1  down* srvcnthpc108
 ada*         up 7-02:00:00      4   idle srvcnthpc[109-112]
 swan         up 1-00:00:00      1  down* srvcnthpc108
 swan         up 1-00:00:00      4   idle srvcnthpc[109-112]
 gpuo         up 41-16:00:0      3  down* srvcntgpu[001-002,004]
 curie        up 31-06:00:0      1 drain* srvcnthpc600

PartitionName=ada
    AllowGroups=ALL AllowAccounts=ALL AllowQos=ALL
    AllocNodes=ALL Default=YES QoS=N/A
    DefaultTime=NONE DisableRootJobs=NO ExclusiveUser=NO GraceTime=0 Hidden=NO
    MaxNodes=UNLIMITED MaxTime=7-02:00:00 MinNodes=0 LLN=NO MaxCPUsPerNode=UNLIMITED
    Nodes=srvcnthpc[108-112]
    PriorityJobFactor=20 PriorityTier=20 RootOnly=NO ReqResv=NO OverSubscribe=NO
    OverTimeLimit=NONE PreemptMode=OFF
    State=UP TotalCPUs=200 TotalNodes=5 SelectTypeParameters=NONE
    JobDefaults=(null)
    DefMemPerCPU=9000 MaxMemPerCPU=9000
    TRESBillingWeights=CPU=1.0,Mem=0.1G
 
 PartitionName=swan
    AllowGroups=ALL AllowAccounts=ALL AllowQos=ALL
    AllocNodes=ALL Default=NO QoS=N/A
    DefaultTime=NONE DisableRootJobs=NO ExclusiveUser=NO GraceTime=0 Hidden=NO
    MaxNodes=UNLIMITED MaxTime=1-00:00:00 MinNodes=0 LLN=NO MaxCPUsPerNode=UNLIMITED
    Nodes=srvcnthpc[108-112]
    PriorityJobFactor=30 PriorityTier=30 RootOnly=NO ReqResv=NO OverSubscribe=NO
    OverTimeLimit=NONE PreemptMode=OFF
    State=UP TotalCPUs=200 TotalNodes=5 SelectTypeParameters=NONE
    JobDefaults=(null)
    DefMemPerCPU=9000 MaxMemPerCPU=9000
    TRESBillingWeights=CPU=1.0,Mem=0.1G
 
 PartitionName=gpuo
    AllowGroups=ALL AllowAccounts=ALL AllowQos=ALL
    AllocNodes=ALL Default=NO QoS=N/A
    DefaultTime=NONE DisableRootJobs=NO ExclusiveUser=NO GraceTime=0 Hidden=NO
    MaxNodes=UNLIMITED MaxTime=41-16:00:00 MinNodes=0 LLN=NO MaxCPUsPerNode=UNLIMITED
    Nodes=srvcntgpu[001-002,004]
    PriorityJobFactor=20 PriorityTier=20 RootOnly=NO ReqResv=NO OverSubscribe=NO
    OverTimeLimit=NONE PreemptMode=OFF
    State=UP TotalCPUs=48 TotalNodes=3 SelectTypeParameters=NONE
    JobDefaults=(null)
    DefMemPerCPU=4000 MaxMemPerCPU=8000
    TRESBillingWeights=CPU=1.0,Mem=0.1G,GRES/gpu=1.0
 
 PartitionName=curie
    AllowGroups=ALL AllowAccounts=ALL AllowQos=ALL
    AllocNodes=ALL Default=NO QoS=N/A
    DefaultTime=NONE DisableRootJobs=NO ExclusiveUser=NO GraceTime=0 Hidden=NO
    MaxNodes=UNLIMITED MaxTime=31-06:00:00 MinNodes=0 LLN=NO MaxCPUsPerNode=UNLIMITED
    Nodes=srvcnthpc[600]
    PriorityJobFactor=20 PriorityTier=20 RootOnly=NO ReqResv=NO OverSubscribe=NO
    OverTimeLimit=NONE PreemptMode=OFF
    State=UP TotalCPUs=64 TotalNodes=1 SelectTypeParameters=NONE
    JobDefaults=(null)
    DefMemPerCPU=2000 MaxMemPerCPU=2000
    TRESBillingWeights=CPU=1.0,Mem=0.1G
 

WORKER NODE STATUS
NodeName=srvcnthpc108 CoresPerSocket=20 
   CPUAlloc=0 CPUTot=40 CPULoad=N/A
   AvailableFeatures=(null)
   ActiveFeatures=(null)
   Gres=(null)
   NodeAddr=srvcnthpc108 NodeHostName=srvcnthpc108 
   RealMemory=384000 AllocMem=0 FreeMem=N/A Sockets=2 Boards=1
   State=DOWN* ThreadsPerCore=1 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
   Partitions=ada,swan 
   BootTime=None SlurmdStartTime=None
   CfgTRES=cpu=40,mem=375G,billing=77
   AllocTRES=
   CapWatts=n/a
   CurrentWatts=0 LowestJoules=0 ConsumedJoules=0
   ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s
   Reason=Not responding [slurm@2025-03-17T13:45:55]

NodeName=srvcnthpc109 Arch=x86_64 CoresPerSocket=20 
   CPUAlloc=0 CPUTot=40 CPULoad=0.01
   AvailableFeatures=(null)
   ActiveFeatures=(null)
   Gres=(null)
   NodeAddr=srvcnthpc109 NodeHostName=srvcnthpc109 Version=18.08
   OS=Linux 3.10.0-1062.12.1.el7.x86_64 #1 SMP Tue Feb 4 23:02:59 UTC 2020 
   RealMemory=384000 AllocMem=0 FreeMem=379975 Sockets=2 Boards=1
   State=IDLE ThreadsPerCore=1 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
   Partitions=ada,swan 
   BootTime=2025-03-25T19:54:24 SlurmdStartTime=2025-03-25T19:56:29
   CfgTRES=cpu=40,mem=375G,billing=77
   AllocTRES=
   CapWatts=n/a
   CurrentWatts=0 LowestJoules=0 ConsumedJoules=0
   ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s

NodeName=srvcnthpc110 Arch=x86_64 CoresPerSocket=20 
   CPUAlloc=0 CPUTot=40 CPULoad=0.01
   AvailableFeatures=(null)
   ActiveFeatures=(null)
   Gres=(null)
   NodeAddr=srvcnthpc110 NodeHostName=srvcnthpc110 Version=18.08
   OS=Linux 3.10.0-1062.12.1.el7.x86_64 #1 SMP Tue Feb 4 23:02:59 UTC 2020 
   RealMemory=384000 AllocMem=0 FreeMem=380269 Sockets=2 Boards=1
   State=IDLE ThreadsPerCore=1 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
   Partitions=ada,swan 
   BootTime=2025-03-25T19:52:58 SlurmdStartTime=2025-03-25T19:56:29
   CfgTRES=cpu=40,mem=375G,billing=77
   AllocTRES=
   CapWatts=n/a
   CurrentWatts=0 LowestJoules=0 ConsumedJoules=0
   ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s

NodeName=srvcnthpc111 Arch=x86_64 CoresPerSocket=20 
   CPUAlloc=0 CPUTot=40 CPULoad=0.06
   AvailableFeatures=(null)
   ActiveFeatures=(null)
   Gres=(null)
   NodeAddr=srvcnthpc111 NodeHostName=srvcnthpc111 Version=18.08
   OS=Linux 3.10.0-1062.12.1.el7.x86_64 #1 SMP Tue Feb 4 23:02:59 UTC 2020 
   RealMemory=384000 AllocMem=0 FreeMem=379558 Sockets=2 Boards=1
   State=IDLE ThreadsPerCore=1 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
   Partitions=ada,swan 
   BootTime=2025-03-25T19:52:44 SlurmdStartTime=2025-03-25T19:56:29
   CfgTRES=cpu=40,mem=375G,billing=77
   AllocTRES=
   CapWatts=n/a
   CurrentWatts=0 LowestJoules=0 ConsumedJoules=0
   ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s

NodeName=srvcnthpc112 Arch=x86_64 CoresPerSocket=20 
   CPUAlloc=0 CPUTot=40 CPULoad=0.04
   AvailableFeatures=(null)
   ActiveFeatures=(null)
   Gres=(null)
   NodeAddr=srvcnthpc112 NodeHostName=srvcnthpc112 Version=18.08
   OS=Linux 3.10.0-1062.12.1.el7.x86_64 #1 SMP Tue Feb 4 23:02:59 UTC 2020 
   RealMemory=384000 AllocMem=0 FreeMem=379383 Sockets=2 Boards=1
   State=IDLE ThreadsPerCore=1 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
   Partitions=ada,swan 
   BootTime=2025-03-25T19:51:57 SlurmdStartTime=2025-03-25T19:56:29
   CfgTRES=cpu=40,mem=375G,billing=77
   AllocTRES=
   CapWatts=n/a
   CurrentWatts=0 LowestJoules=0 ConsumedJoules=0
   ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s

NodeName=srvcntgpu001 CoresPerSocket=8 
   CPUAlloc=0 CPUTot=16 CPULoad=N/A
   AvailableFeatures=(null)
   ActiveFeatures=(null)
   Gres=gpu:fermi:4
   NodeAddr=srvcntgpu001 NodeHostName=srvcntgpu001 
   RealMemory=64000 AllocMem=0 FreeMem=N/A Sockets=2 Boards=1
   State=DOWN* ThreadsPerCore=1 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
   Partitions=gpuo 
   BootTime=None SlurmdStartTime=None
   CfgTRES=cpu=16,mem=62.50G,billing=26,gres/gpu=4,gres/gpu:fermi=4
   AllocTRES=
   CapWatts=n/a
   CurrentWatts=0 LowestJoules=0 ConsumedJoules=0
   ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s
   Reason=Not responding [slurm@2025-03-17T13:50:55]

NodeName=srvcntgpu002 CoresPerSocket=6 
   CPUAlloc=0 CPUTot=12 CPULoad=N/A
   AvailableFeatures=(null)
   ActiveFeatures=(null)
   Gres=gpu:fermi:4
   NodeAddr=srvcntgpu002 NodeHostName=srvcntgpu002 
   RealMemory=64000 AllocMem=0 FreeMem=N/A Sockets=2 Boards=1
   State=DOWN* ThreadsPerCore=1 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
   Partitions=gpuo 
   BootTime=None SlurmdStartTime=None
   CfgTRES=cpu=12,mem=62.50G,billing=22,gres/gpu=4,gres/gpu:fermi=4
   AllocTRES=
   CapWatts=n/a
   CurrentWatts=0 LowestJoules=0 ConsumedJoules=0
   ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s
   Reason=Not responding [slurm@2025-03-17T13:50:55]

NodeName=srvcntgpu004 CoresPerSocket=10 
   CPUAlloc=0 CPUTot=20 CPULoad=N/A
   AvailableFeatures=(null)
   ActiveFeatures=(null)
   Gres=gpu:kepler:2
   NodeAddr=srvcntgpu004 NodeHostName=srvcntgpu004 
   RealMemory=128000 AllocMem=0 FreeMem=N/A Sockets=2 Boards=1
   State=DOWN* ThreadsPerCore=1 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
   Partitions=gpuo 
   BootTime=None SlurmdStartTime=None
   CfgTRES=cpu=20,mem=125G,billing=34,gres/gpu=2,gres/gpu:kepler=2
   AllocTRES=
   CapWatts=n/a
   CurrentWatts=0 LowestJoules=0 ConsumedJoules=0
   ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s
   Reason=Not responding [slurm@2025-03-17T13:50:55]

NodeName=srvcnthpc600 CoresPerSocket=16 
   CPUAlloc=0 CPUTot=64 CPULoad=N/A
   AvailableFeatures=(null)
   ActiveFeatures=(null)
   Gres=(null)
   NodeAddr=srvcnthpc600 NodeHostName=srvcnthpc600 
   RealMemory=128000 AllocMem=0 FreeMem=N/A Sockets=4 Boards=1
   State=DOWN*+DRAIN ThreadsPerCore=1 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
   Partitions=curie 
   BootTime=None SlurmdStartTime=None
   CfgTRES=cpu=64,mem=125G,billing=76
   AllocTRES=
   CapWatts=n/a
   CurrentWatts=0 LowestJoules=0 ConsumedJoules=0
   ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s
   Reason=Kill task failed [root@2024-12-02T21:57:37]