#
# Copyright (c) 2015 Martins Innus.
# All rights reserved.
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 2 of the License, or (at your
# option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
# for more details.
#

use strict;
use warnings;

use PCP::PMDA;
use Net::Domain qw( hostname );

use Slurm;
use Slurm::Hostlist;
use Slurm qw(:constant);

use threads;
use threads::shared;

use vars qw( $pmda $slurm );


# indoms
my ( $slurm_indom, $node_job_indom, $job_indom, $user_indom, $node_indom ) = ( 0, 1, 2, 3, 4);
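# Only $node_job_indom is registered below; the others are placeholders for
# the planned job/user/node/partition clusters (see the stubs further down).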

my $GEN_CLUSTER = 0;
my $NODE_JOB_CLUSTER = 1;
my $GLOBAL_JOB_CLUSTER = 2;
my $NODE_CLUSTER = 3;
my $USER_CLUSTER = 4;
my $PARTITION_CLUSTER = 5;

# Slurm parameters we actually care about.
# The order matches the metric item numbers to keep the fetch logic simple.
# Names match the job_info struct entries in slurm.h where they exist;
# anything we compute ourselves is prefixed with "DERIVED".

my @slurm_job_stats = ( "FILLER_FOR_NUM_JOBS", "job_id", "name", "job_state", "user_id", "batch_host", "submit_time", "start_time", "end_time", "features", "gres", "nodes", "num_nodes", "num_cpus", "ntasks_per_node", "work_dir", "DERIVED_job_alloc_cores");
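#
# For example, item 1 maps to "job_id" (exported as slurm.node.job.id below)
# and item 16 to "DERIVED_job_alloc_cores" (slurm.node.job.job_alloc_cores);
# item 0 belongs to the job-count metric, hence the filler entry.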

#
# Variables shared between the slurm gather thread and the main pmda thread.
#
# Callers must lock() a shared variable before using it.

our %nodejobs :shared = ();
our $numnodes :shared = 0;

#
# End Shared Variables
#
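
# A minimal sketch (not executed) of the locking idiom used throughout:
# lock() takes the shared variable itself, and the lock is released when the
# enclosing block exits - there is no explicit unlock.
#
#     {
#         lock(%nodejobs);
#         # ... read or update %nodejobs ...
#     } # lock released here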

#
# Subs for the slurm polling thread
#

# Who am I?  Used to check whether jobs are running on this host.
our $host = hostname;

# Don't need to share these.  Only used in the slurm thread
my $jobs_update_time = 0;
sub slurm_init {
    $slurm = Slurm::new();
}

# Update general cluster-wide state
sub slurm_update_cluster_gen {

    # This could take a while
    # Returns a hash ref that mirrors : node_info_msg_t from slurm.h
    my $nodemsg = $slurm->load_node();

    unless($nodemsg) {
        # This can fail if the slurm controller has not come up yet
        # No good way to figure out if this is just a delay or fatal error
        # So just log it and try again later
        warn "Failed to load node info: " . $slurm->strerror();
        return;
    }

    my @nodes = @{ $nodemsg->{node_array} };

    {
        lock($numnodes);
        $numnodes = @nodes;
    }

}

# Update job information for this node
sub slurm_update_cluster_jobs {

    # Use the time so we only update on changes
    #
    # This could take a while
    # Returns a hash ref where the main element we care about is the job_array array
    my $jobflags = Slurm::SHOW_DETAIL;
    my $jobmsg = $slurm->load_jobs($jobs_update_time, $jobflags);

    unless($jobmsg) {
        # This can fail if the slurm controller has not come up yet
        # Or if there is no state change from the previous try.
        # If there is no state change, the main thread will just use existing data.
        # No good way to figure this out
        # So just try again later
        return;
    }

    # Grab the update time to use for the next query
    $jobs_update_time = $jobmsg->{last_update};

    {
        lock(%nodejobs);

        # Clear the old state under the lock so the main thread never sees a
        # half-built job hash
        %nodejobs = ();

        for my $job ( @{ $jobmsg->{job_array} } ){

            my $jid = $job->{job_id};
            my $nodelist = $job->{nodes};

            if( $nodelist){

                my $hostlist = Slurm::Hostlist::create( $nodelist );
                my $num_hosts = $hostlist->count();
            
                if ( $num_hosts > 0){
                    my $mypos = $hostlist->find($host);
                    if( $mypos >= 0){
                        # We only want jobs that are on this host
                        if ( Slurm::IS_JOB_RUNNING($job) ){
                            # And Running
                       
                            # Ugh, shared perl hashes: nested hash refs must
                            # be explicitly share()d before use
                            $nodejobs{$jid} = &share( {} );
                            while ( my ($key, $value) = each(%$job)){
                                # Only grab the things we really want, to keep memory down
                                if ( grep{$_ eq $key} @slurm_job_stats ){
                                    $nodejobs{$jid}{$key} = $value;
                                }
                            }
                            #
                            # Grab the cores allocated per node
                            # Not exposed in the API so we need to parse the string representation of the job
                            my $slurmstr = $slurm->sprint_job_info($job,1);
                            my $nodecores = "";
                            # Should always exist, but just in case
                            # Could be multiple matches
                            # SLURM api groups hosts together that have the same cores allocated
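                            # For example, a section like
                            #   "Nodes=node[01-02] CPU_IDs=0-3"
                            # (hypothetical node names) expands below to
                            #   "node01:0-3 node02:0-3"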
                            while ( $slurmstr =~ /Nodes=(\S*) CPU_IDs=(\S*)/g ){
                                my $foundnodes = $1;
                                my $foundcores = $2;
                                my $corehosts = Slurm::Hostlist::create( $foundnodes );
                                while( my $corehost = $corehosts->shift()){
                                    $nodecores .= "$corehost:$foundcores ";
                                }
                            }
                            $nodecores =~ s/\s+$//;
                            $nodejobs{$jid}{"DERIVED_job_alloc_cores"} = "$nodecores";
                        }
                    }
                }
            }
        }
    } # unlock %nodejobs
}

# slurm fetch worker thread
sub poll_slurm {
    while (1){
        slurm_update_cluster_gen();
        slurm_update_cluster_jobs();
        sleep 9;
    }
}

#
# Subs for the main thread
#

#
# fetch is called once by PCP for each refresh; the fetch callback is then
# called to query each metric value individually
#
sub slurm_fetch {

    my @nodejobs_array;

    {
        lock(%nodejobs);

        # Build a flat (id, "name", id, "name", ...) list so we control the
        # instance ids ourselves
        while ( my ($key, $val) = each %nodejobs){
            push( @nodejobs_array, $key );
            push( @nodejobs_array, "$key" );
        }
    } # unlock %nodejobs

    $pmda->replace_indom($node_job_indom, \@nodejobs_array);
}
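
# For example, with jobs 101 and 202 (hypothetical ids) running on this node,
# the list handed to replace_indom above is ( 101, "101", 202, "202" ): each
# numeric instance id followed by its external instance name.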

sub slurm_fetch_node_job_callback {
    my ($item, $inst) = @_;

    if ( $item == 0 ){
        my $numjobs;
        {
            lock(%nodejobs);
            $numjobs = keys %nodejobs;
        }
        return ($numjobs, 1);
    }

    my $lookup;
    {
        lock(%nodejobs);
        # The lock is released when this scope exits, whether by return or
        # fall-through
        if( !exists $nodejobs{$inst} ){
            return (PM_ERR_INST, 0);
        }
        else {
            $lookup = $nodejobs{$inst};
        }
    }

    if ( $item == 3 ){
        # slurm.node.job.state
        # List all jobs as running since those are the only jobs we grabbed
        return ("running", 1);
    }
    elsif ( $item < scalar @slurm_job_stats ){
        my $rv = $lookup->{ $slurm_job_stats[$item] };
        $rv = "" if !defined $rv;
        return ( $rv, 1);
    }
    else{
        return (PM_ERR_PMID, 0);
    }
}

sub slurm_fetch_callback {
    my ($cluster, $item, $inst) = @_; 

    my $pmid_name = pmda_pmid_name($cluster, $item)
        or return (PM_ERR_PMID, 0);

    if( $cluster == $GEN_CLUSTER && $item == 0 ){
        # slurm.numnodes - copy the shared value out under the lock
        my $n;
        {
            lock($numnodes);
            $n = $numnodes;
        }
        return ($n, 1);
    }
    elsif( $cluster == $NODE_JOB_CLUSTER){
        return slurm_fetch_node_job_callback( $item, $inst );
    }
    else{
        return (PM_ERR_PMID, 0);
    }
}

# the PCP::PMDA->new line is parsed by the check_domain rule of the PMDA build
# process, so there are special requirements:  no comments, the domain has to
# be a bare number.
#
our $pmda = PCP::PMDA->new('slurm', 23);

# metrics go here, with full descriptions

# general - cluster 0
$pmda->add_metric(pmda_pmid($GEN_CLUSTER,0), PM_TYPE_U32, PM_INDOM_NULL,
    PM_SEM_INSTANT, pmda_units(0,0,0,0,0,0),
    'slurm.numnodes',
    'Number of slurm nodes',
    '');

# To add - cluster name, controller name, slurm version

# node job cluster 
$pmda->add_metric(pmda_pmid($NODE_JOB_CLUSTER,0), PM_TYPE_U32, PM_INDOM_NULL,
    PM_SEM_INSTANT, pmda_units(0,0,0,0,0,0),
    'slurm.node.job.numjobs',
    'Number of jobs running on this node',
    '');

$pmda->add_metric(pmda_pmid($NODE_JOB_CLUSTER,1), PM_TYPE_U32, $node_job_indom,
    PM_SEM_INSTANT, pmda_units(0,0,0,0,0,0),
    'slurm.node.job.id',
    'Slurm jobid',
    '');

$pmda->add_metric(pmda_pmid($NODE_JOB_CLUSTER,2), PM_TYPE_STRING, $node_job_indom,
    PM_SEM_INSTANT, pmda_units(0,0,0,0,0,0),
    'slurm.node.job.name',
    'Job name',
    '');

$pmda->add_metric(pmda_pmid($NODE_JOB_CLUSTER,3), PM_TYPE_STRING, $node_job_indom,
    PM_SEM_INSTANT, pmda_units(0,0,0,0,0,0),
    'slurm.node.job.state',
    'Job state',
    '');

$pmda->add_metric(pmda_pmid($NODE_JOB_CLUSTER,4), PM_TYPE_U32, $node_job_indom,
    PM_SEM_INSTANT, pmda_units(0,0,0,0,0,0),
    'slurm.node.job.user_id',
    'UID owning this job',
    '');

$pmda->add_metric(pmda_pmid($NODE_JOB_CLUSTER,5), PM_TYPE_STRING, $node_job_indom,
    PM_SEM_INSTANT, pmda_units(0,0,0,0,0,0),
    'slurm.node.job.batch_host',
    'First host in the allocation, the one running the batch script',
    '');

$pmda->add_metric(pmda_pmid($NODE_JOB_CLUSTER,6), PM_TYPE_STRING, $node_job_indom,
    PM_SEM_INSTANT, pmda_units(0,0,0,0,0,0),
    'slurm.node.job.submit_time',
    'Time the job was submitted',
    '');

$pmda->add_metric(pmda_pmid($NODE_JOB_CLUSTER,7), PM_TYPE_STRING, $node_job_indom,
    PM_SEM_INSTANT, pmda_units(0,0,0,0,0,0),
    'slurm.node.job.start_time',
    'Time the job started',
    '');

$pmda->add_metric(pmda_pmid($NODE_JOB_CLUSTER,8), PM_TYPE_STRING, $node_job_indom,
    PM_SEM_INSTANT, pmda_units(0,0,0,0,0,0),
    'slurm.node.job.end_time',
    'Time the job ended',
    '');

$pmda->add_metric(pmda_pmid($NODE_JOB_CLUSTER,9), PM_TYPE_STRING, $node_job_indom,
    PM_SEM_INSTANT, pmda_units(0,0,0,0,0,0),
    'slurm.node.job.features',
    'Features requested by the job',
    '');

$pmda->add_metric(pmda_pmid($NODE_JOB_CLUSTER,10), PM_TYPE_STRING, $node_job_indom,
    PM_SEM_INSTANT, pmda_units(0,0,0,0,0,0),
    'slurm.node.job.gres',
    'Generic resources (gres) requested by the job',
    '');

$pmda->add_metric(pmda_pmid($NODE_JOB_CLUSTER,11), PM_TYPE_STRING, $node_job_indom,
    PM_SEM_INSTANT, pmda_units(0,0,0,0,0,0),
    'slurm.node.job.nodes',
    'Nodes allocated to the job',
    '');

$pmda->add_metric(pmda_pmid($NODE_JOB_CLUSTER,12), PM_TYPE_U32, $node_job_indom,
    PM_SEM_INSTANT, pmda_units(0,0,0,0,0,0),
    'slurm.node.job.num_nodes',
    'Number of nodes requested by the job',
    '');

$pmda->add_metric(pmda_pmid($NODE_JOB_CLUSTER,13), PM_TYPE_U32, $node_job_indom,
    PM_SEM_INSTANT, pmda_units(0,0,0,0,0,0),
    'slurm.node.job.num_cpus',
    'Total CPUs requested by the job',
    '');

$pmda->add_metric(pmda_pmid($NODE_JOB_CLUSTER,14), PM_TYPE_U32, $node_job_indom,
    PM_SEM_INSTANT, pmda_units(0,0,0,0,0,0),
    'slurm.node.job.ntasks_per_node',
    'Number of tasks per node requested by the job',
    '');

$pmda->add_metric(pmda_pmid($NODE_JOB_CLUSTER,15), PM_TYPE_STRING, $node_job_indom,
    PM_SEM_INSTANT, pmda_units(0,0,0,0,0,0),
    'slurm.node.job.work_dir',
    'Working directory of the job',
    '');

$pmda->add_metric(pmda_pmid($NODE_JOB_CLUSTER,16), PM_TYPE_STRING, $node_job_indom,
    PM_SEM_INSTANT, pmda_units(0,0,0,0,0,0),
    'slurm.node.job.job_alloc_cores',
    'Cores allocated to the job',
    '');

# To add - batch script, alloc_node, account, partition ?

# global job cluster
#
# (priorities for queued jobs; running jobs keep their last priority)
# Need to add a priority function to api/job_info.c (taken from sprio.c),
# then wrap it for perl; need to use the list functions.

# user cluster

# node cluster

# per-cluster cluster - may mirror the general metrics above?

&slurm_init;

# Thread that queries the slurm state.
#
# Slurm calls can block for a while, so use a separate thread so that pmcd
# doesn't kill us if fetches take too long.

my $slurm_thread = threads->create(\&poll_slurm);
$slurm_thread->detach();

# Register the (initially empty) node-job instance domain; add_indom returns
# the full indom identifier, which replaces our numeric placeholder.
$node_job_indom = $pmda->add_indom($node_job_indom, [], '', '');

$pmda->set_fetch(\&slurm_fetch);
$pmda->set_fetch_callback(\&slurm_fetch_callback);

$pmda->run;
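
# Not part of the PMDA logic - a usage sketch: once the agent is installed,
# values can be inspected with the standard PCP tools, e.g.
#
#     $ pminfo -f slurm.numnodes slurm.node.job
#
# or, assuming the slurm PMNS is loaded, exercised stand-alone with dbpmda:
#
#     $ dbpmda
#     dbpmda> open pipe /usr/bin/perl pmdaslurm.pl
#     dbpmda> fetch slurm.numnodes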
