2019-12-27 15:55:18 +01:00
/* SPDX-License-Identifier: GPL-2.0-only */
# ifndef _NET_ETHTOOL_NETLINK_H
# define _NET_ETHTOOL_NETLINK_H
# include <linux/ethtool_netlink.h>
# include <linux/netdevice.h>
# include <net/genetlink.h>
2019-12-27 15:55:23 +01:00
# include <net/sock.h>
struct ethnl_req_info ;
2020-03-12 21:07:38 +01:00
/*
 * Parse the common request header nest @nest into @req_info.
 *
 * NOTE(review): the implementation is not part of this header. Judging by the
 * name and by ethnl_parse_header_dev_put() below, a successful call presumably
 * leaves a tracked device reference in @req_info that the caller must release
 * with ethnl_parse_header_dev_put(); @require_dev presumably makes a missing
 * device identification an error. Confirm against the definition.
 */
int ethnl_parse_header_dev_get ( struct ethnl_req_info * req_info ,
const struct nlattr * nest , struct net * net ,
struct netlink_ext_ack * extack ,
bool require_dev ) ;
2019-12-27 15:55:23 +01:00
/*
 * Put the common reply header for @dev into @skb. According to the
 * ethnl_reply_header_size() documentation in this header, device ifindex and
 * name are the only attributes it emits.
 * NOTE(review): @attrtype is presumably the type of the nest wrapping the
 * header - confirm against the definition.
 */
int ethnl_fill_reply_header ( struct sk_buff * skb , struct net_device * dev ,
u16 attrtype ) ;
/*
 * NOTE(review): defined outside this header. Presumably allocates a reply
 * message with room for @payload, starts message @cmd and stores the message
 * header pointer in @ehdrp - confirm against the definition.
 */
struct sk_buff * ethnl_reply_init ( size_t payload , struct net_device * dev , u8 cmd ,
u16 hdr_attrtype , struct genl_info * info ,
void * * ehdrp ) ;
2020-07-09 17:42:47 -07:00
/*
 * NOTE(review): defined outside this header. Presumably starts message @cmd
 * in @skb for the dump described by @cb and returns the message header, or
 * NULL on failure - confirm against the definition.
 */
void * ethnl_dump_put ( struct sk_buff * skb , struct netlink_callback * cb , u8 cmd ) ;
2020-05-10 21:12:35 +02:00
/*
 * NOTE(review): defined outside this header. Presumably starts a broadcast
 * (notification) message @cmd in @skb and returns the message header, or NULL
 * on failure - confirm against the definition.
 */
void * ethnl_bcastmsg_put ( struct sk_buff * skb , u8 cmd ) ;
/*
 * NOTE(review): defined outside this header. Presumably sends @skb to the
 * ethtool multicast group on behalf of @dev and returns 0 or a negative error
 * code - confirm against the definition.
 */
int ethnl_multicast ( struct sk_buff * skb , struct net_device * dev ) ;
2019-12-27 15:55:23 +01:00
/**
 * ethnl_strz_size() - attribute length needed for a fixed size string
 * @s: buffer of ETH_GSTRING_LEN bytes (need not be null terminated)
 *
 * Return: total length of an attribute carrying the contents of @s as a null
 * terminated string
 */
static inline int ethnl_strz_size(const char *s)
{
	/* one extra byte for the terminator the attribute always carries */
	return nla_total_size(1 + strnlen(s, ETH_GSTRING_LEN));
}
/**
 * ethnl_put_strz() - emit a fixed size string as a string attribute
 * @skb:      message being composed
 * @attrtype: type of the attribute to add
 * @s:        buffer of ETH_GSTRING_LEN bytes (need not be null terminated)
 *
 * Appends an attribute of type @attrtype whose payload is the string found in
 * @s, always followed by a terminating null byte.
 *
 * Return: 0 on success, -EMSGSIZE if the attribute could not be reserved
 */
static inline int ethnl_put_strz(struct sk_buff *skb, u16 attrtype,
				 const char *s)
{
	unsigned int slen = strnlen(s, ETH_GSTRING_LEN);
	struct nlattr *nla = nla_reserve(skb, attrtype, slen + 1);
	char *payload;

	if (!nla)
		return -EMSGSIZE;

	payload = nla_data(nla);
	memcpy(payload, s, slen);
	/* @s may lack a terminator, so add one explicitly */
	payload[slen] = '\0';

	return 0;
}
/**
 * ethnl_update_u32() - apply an NLA_U32 attribute to a u32 variable
 * @dst:  variable to update
 * @attr: attribute holding the new value; may be null
 * @mod:  modification tracking flag
 *
 * A null @attr is ignored. Otherwise the u32 payload of @attr is stored in
 * *@dst. *@mod is set to true only when the stored value differs from the
 * previous one; it is never cleared here.
 */
static inline void ethnl_update_u32(u32 *dst, const struct nlattr *attr,
				    bool *mod)
{
	u32 requested;

	if (attr) {
		requested = nla_get_u32(attr);
		if (requested != *dst) {
			*dst = requested;
			*mod = true;
		}
	}
}
/**
 * ethnl_update_u8() - apply an NLA_U8 attribute to a u8 variable
 * @dst:  variable to update
 * @attr: attribute holding the new value; may be null
 * @mod:  modification tracking flag
 *
 * A null @attr is ignored. Otherwise the u8 payload of @attr is stored in
 * *@dst. *@mod is set to true only when the stored value differs from the
 * previous one; it is never cleared here.
 */
static inline void ethnl_update_u8(u8 *dst, const struct nlattr *attr,
				   bool *mod)
{
	u8 requested;

	if (attr) {
		requested = nla_get_u8(attr);
		if (requested != *dst) {
			*dst = requested;
			*mod = true;
		}
	}
}
/**
 * ethnl_update_bool32() - apply an NLA_U8 attribute to a u32 used as a bool
 * @dst:  variable to update
 * @attr: attribute holding the new value; may be null
 * @mod:  modification tracking flag
 *
 * A null @attr is ignored. Otherwise the u8 payload of @attr is reduced to
 * 0 (zero) or 1 (non-zero) and written to *@dst, but only if the logical
 * value of *@dst changes; in that case *@mod is set to true. *@mod is never
 * cleared here.
 */
static inline void ethnl_update_bool32(u32 *dst, const struct nlattr *attr,
				       bool *mod)
{
	u8 requested;

	if (attr) {
		requested = !!nla_get_u8(attr);
		/* compare logical values: any non-zero *dst counts as true */
		if (!!*dst != requested) {
			*dst = requested;
			*mod = true;
		}
	}
}
2023-01-23 13:57:39 +00:00
/**
 * ethnl_update_bool() - apply an NLA_U8 attribute to a bool variable
 * @dst:  variable to update
 * @attr: attribute holding the new value; may be null
 * @mod:  modification tracking flag
 *
 * A null @attr is ignored. Otherwise the u8 payload of @attr is reduced to
 * 0 (zero) or 1 (non-zero) and written to *@dst, but only if the logical
 * value of *@dst changes; in that case *@mod is set to true. *@mod is never
 * cleared here.
 */
static inline void ethnl_update_bool(bool *dst, const struct nlattr *attr,
				     bool *mod)
{
	u8 requested;

	if (attr) {
		requested = !!nla_get_u8(attr);
		if (!!*dst != requested) {
			*dst = requested;
			*mod = true;
		}
	}
}
2019-12-27 15:55:23 +01:00
/**
 * ethnl_update_binary() - apply an NLA_BINARY attribute to a data block
 * @dst:  data block to update
 * @len:  length of the destination buffer
 * @attr: attribute holding the new contents; may be null
 * @mod:  modification tracking flag
 *
 * A null @attr is ignored. Otherwise the payload of @attr overwrites the
 * start of the block at @dst. At most @len bytes are used; if the payload is
 * shorter than @len, only as many bytes as the payload holds are compared and
 * copied and the rest of @dst is left untouched. *@mod is set to true when
 * the copied range differed from the previous contents; it is never cleared
 * here.
 */
static inline void ethnl_update_binary(void *dst, unsigned int len,
				       const struct nlattr *attr, bool *mod)
{
	const void *payload;

	if (!attr)
		return;

	/* never read beyond the end of the attribute payload */
	if (nla_len(attr) < len)
		len = nla_len(attr);

	payload = nla_data(attr);
	if (memcmp(dst, payload, len) != 0) {
		memcpy(dst, payload, len);
		*mod = true;
	}
}
/**
 * ethnl_update_bitfield32() - apply an NLA_BITFIELD32 attribute to a u32
 * @dst:  variable to update
 * @attr: attribute holding value and selector; may be null
 * @mod:  modification tracking flag
 *
 * Bits of *@dst chosen by the selector (mask) of @attr take the corresponding
 * bits of the attribute's value; all other bits are preserved. Nothing
 * happens if @attr is null or the result equals the current value; otherwise
 * *@mod is set to true.
 */
static inline void ethnl_update_bitfield32(u32 *dst, const struct nlattr *attr,
					   bool *mod)
{
	struct nla_bitfield32 bf;
	u32 merged;

	if (!attr)
		return;

	bf = nla_get_bitfield32(attr);
	merged = (*dst & ~bf.selector) | (bf.value & bf.selector);
	if (merged != *dst) {
		*dst = merged;
		*mod = true;
	}
}
/**
 * ethnl_reply_header_size() - total size of the common reply header
 *
 * The result is an upper bound rather than an exact size, so that the RTNL
 * lock does not have to be held longer than necessary (which would otherwise
 * be needed to prevent a rename between estimating the size and composing
 * the message). Only device ifindex and name are counted, as those are the
 * only attributes ethnl_fill_reply_header() puts into the reply header.
 *
 * Return: size of a nest holding an ifindex and a full length name attribute
 */
static inline unsigned int ethnl_reply_header_size(void)
{
	int ifindex_len = nla_total_size(sizeof(u32));
	int ifname_len = nla_total_size(IFNAMSIZ);

	return nla_total_size(ifindex_len + ifname_len);
}
2019-12-27 15:55:38 +01:00
/* GET request handling */
/* Unified processing of GET requests uses two data structures: request info
* and reply data . Request info holds information parsed from client request
* and it stays constant through all request processing . Reply data holds data
* retrieved from ethtool_ops callbacks or other internal sources which is used
* to compose the reply . When processing a dump request , request info is filled
* only once ( when the request message is parsed ) but reply data is filled for
* each reply message .
*
* Both structures consist of part common for all request types ( struct
* ethnl_req_info and struct ethnl_reply_data defined below ) and optional
* parts specific for each request type . Common part always starts at offset 0.
*/
2019-12-27 15:55:23 +01:00
/**
* struct ethnl_req_info - base type of request information for GET requests
* @ dev : network device the request is for ( may be null )
2021-12-06 17:30:37 -08:00
* @ dev_tracker : refcount tracker for @ dev reference
2019-12-27 15:55:23 +01:00
* @ flags : request flags common for all request types
*
2019-12-27 15:55:38 +01:00
* This is a common base for request specific structures holding data from
* parsed userspace request . These always embed struct ethnl_req_info at
* zero offset .
2019-12-27 15:55:23 +01:00
*/
struct ethnl_req_info {
struct net_device * dev ;
2021-12-06 17:30:37 -08:00
netdevice_tracker dev_tracker ;
2019-12-27 15:55:23 +01:00
u32 flags ;
} ;
2019-12-27 15:55:18 +01:00
2021-12-14 00:42:30 -08:00
static inline void ethnl_parse_header_dev_put ( struct ethnl_req_info * req_info )
{
2022-06-07 21:39:55 -07:00
netdev_put ( req_info - > dev , & req_info - > dev_tracker ) ;
2021-12-14 00:42:30 -08:00
}
2019-12-27 15:55:38 +01:00
/**
 * struct ethnl_reply_data - common part of reply data for GET requests
 * @dev: device the current reply message is about; for single shot requests
 *       this equals &ethnl_req_info.dev, for dumps it differs from one reply
 *       message to the next
 *
 * Request specific structures holding data for a kernel reply message are
 * built on top of this one: they always embed struct ethnl_reply_data at
 * offset zero.
 */
struct ethnl_reply_data {
	struct net_device	*dev;
};
2021-08-01 12:37:39 +02:00
/*
 * NOTE(review): both helpers are defined outside this header. Judging by the
 * names they presumably bracket a sequence of ethtool_ops calls on @dev:
 * ethnl_ops_begin() before the first call (returning 0 or a negative error
 * code) and ethnl_ops_complete() after the last one - confirm against the
 * definitions.
 */
int ethnl_ops_begin ( struct net_device * dev ) ;
void ethnl_ops_complete ( struct net_device * dev ) ;
2019-12-27 15:55:38 +01:00
/**
 * struct ethnl_request_ops - unified handling of GET requests
 * @request_cmd:     command id of the request (GET)
 * @reply_cmd:       command id of the reply (GET_REPLY)
 * @hdr_attr:        attribute type of the request header
 * @req_info_size:   size of request info
 * @reply_data_size: size of reply data
 * @allow_nodev_do:  allow non-dump request with no device identification
 * @parse_request:
 *	Parse the request except for the common header (struct
 *	ethnl_req_info). On entry the common header is already filled and the
 *	type specific remainder of the request info is zero initialized. The
 *	callback should only modify the type specific request info, based on
 *	attributes parsed from the request message.
 * @prepare_data:
 *	Retrieve and prepare the data needed to compose a reply message. Calls
 *	to ethtool_ops handlers are limited to this callback. On entry the
 *	common reply data (struct ethnl_reply_data) is filled and the type
 *	specific part following it is zero initialized; the callback should
 *	only modify that type specific part. Use the device identification
 *	from struct ethnl_reply_data: for dump requests it iterates through
 *	network devices, while the dev member of struct ethnl_req_info keeps
 *	pointing to the device from the client request.
 * @reply_size:
 *	Estimate the reply message size. The returned value must be sufficient
 *	for the message payload without the common reply header. The callback
 *	may return an estimate higher than the actual message size if an exact
 *	calculation would not be worth the saved memory space.
 * @fill_reply:
 *	Fill the reply message payload (except for the common header) from
 *	reply data. The callback must not generate more payload than the
 *	preceding ->reply_size() call estimated.
 * @cleanup_data:
 *	Optional cleanup called when reply data is no longer needed. Can be
 *	used e.g. to free additional data structures outside the main
 *	structure which were allocated by ->prepare_data(). When processing
 *	dump requests, ->cleanup_data() is called for each message.
 *
 * Describes the variable parts of GET request handling for request types
 * using the unified infrastructure. To use it, add a pointer to an instance
 * of this structure to the &ethnl_default_requests array and use the generic
 * handlers ethnl_default_doit(), ethnl_default_dumpit(),
 * ethnl_default_start() and ethnl_default_done() in @ethtool_genl_ops;
 * ethnl_default_notify() can be used in @ethnl_notify_handlers to send
 * notifications of the corresponding type.
 */
struct ethnl_request_ops {
	u8			request_cmd;
	u8			reply_cmd;
	u16			hdr_attr;
	unsigned int		req_info_size;
	unsigned int		reply_data_size;
	bool			allow_nodev_do;

	int (*parse_request)(struct ethnl_req_info *req_info,
			     struct nlattr **tb,
			     struct netlink_ext_ack *extack);
	int (*prepare_data)(const struct ethnl_req_info *req_info,
			    struct ethnl_reply_data *reply_data,
			    struct genl_info *info);
	int (*reply_size)(const struct ethnl_req_info *req_info,
			  const struct ethnl_reply_data *reply_data);
	int (*fill_reply)(struct sk_buff *skb,
			  const struct ethnl_req_info *req_info,
			  const struct ethnl_reply_data *reply_data);
	void (*cleanup_data)(struct ethnl_reply_data *reply_data);
};
2019-12-27 15:55:43 +01:00
/* request handlers: one struct ethnl_request_ops instance per request type ,
 * declared here and defined outside this header
 */
extern const struct ethnl_request_ops ethnl_strset_request_ops ;
2019-12-27 15:55:48 +01:00
extern const struct ethnl_request_ops ethnl_linkinfo_request_ops ;
2019-12-27 15:56:08 +01:00
extern const struct ethnl_request_ops ethnl_linkmodes_request_ops ;
2019-12-27 15:56:23 +01:00
extern const struct ethnl_request_ops ethnl_linkstate_request_ops ;
2020-01-26 23:11:04 +01:00
extern const struct ethnl_request_ops ethnl_debug_request_ops ;
2020-01-26 23:11:13 +01:00
extern const struct ethnl_request_ops ethnl_wol_request_ops ;
2020-03-12 21:07:48 +01:00
extern const struct ethnl_request_ops ethnl_features_request_ops ;
2020-03-12 21:08:08 +01:00
extern const struct ethnl_request_ops ethnl_privflags_request_ops ;
2020-03-12 21:08:23 +01:00
extern const struct ethnl_request_ops ethnl_rings_request_ops ;
2020-03-12 21:08:38 +01:00
extern const struct ethnl_request_ops ethnl_channels_request_ops ;
2020-03-28 00:01:08 +01:00
extern const struct ethnl_request_ops ethnl_coalesce_request_ops ;
2020-03-28 00:01:23 +01:00
extern const struct ethnl_request_ops ethnl_pause_request_ops ;
2020-03-28 00:01:38 +01:00
extern const struct ethnl_request_ops ethnl_eee_request_ops ;
2020-03-28 00:01:58 +01:00
extern const struct ethnl_request_ops ethnl_tsinfo_request_ops ;
2021-03-29 20:59:52 -07:00
extern const struct ethnl_request_ops ethnl_fec_request_ops ;
2021-04-09 11:06:34 +03:00
extern const struct ethnl_request_ops ethnl_module_eeprom_request_ops ;
2021-04-16 12:27:39 -07:00
extern const struct ethnl_request_ops ethnl_stats_request_ops ;
2021-06-30 16:11:56 +08:00
extern const struct ethnl_request_ops ethnl_phc_vclocks_request_ops ;
ethtool: Add ability to control transceiver modules' power mode
Add a pair of new ethtool messages, 'ETHTOOL_MSG_MODULE_SET' and
'ETHTOOL_MSG_MODULE_GET', that can be used to control transceiver
module parameters and retrieve their status.
The first parameter to control is the power mode of the module. It is
only relevant for paged memory modules, as flat memory modules always
operate in low power mode.
When a paged memory module is in low power mode, its power consumption
is reduced to the minimum, the management interface towards the host is
available and the data path is deactivated.
User space can choose to put modules that are not currently in use in
low power mode and transition them to high power mode before putting the
associated ports administratively up. This is useful for user space that
favors reduced power consumption and lower temperatures over reduced
link up times. In QSFP-DD modules the transition from low power mode to
high power mode can take a few seconds and this transition is only
expected to get longer with future / more complex modules.
User space can control the power mode of the module via the power mode
policy attribute ('ETHTOOL_A_MODULE_POWER_MODE_POLICY'). Possible
values:
* high: Module is always in high power mode.
* auto: Module is transitioned by the host to high power mode when the
first port using it is put administratively up and to low power mode
when the last port using it is put administratively down.
The operational power mode of the module is available to user space via
the 'ETHTOOL_A_MODULE_POWER_MODE' attribute. The attribute is not
reported to user space when a module is not plugged-in.
The user API is designed to be generic enough so that it could be used
for modules with different memory maps (e.g., SFF-8636, CMIS).
The only implementation of the device driver API in this series is for a
MAC driver (mlxsw) where the module is controlled by the device's
firmware, but it is designed to be generic enough so that it could also
be used by implementations where the module is controlled by the CPU.
CMIS testing
============
# ethtool -m swp11
Identifier : 0x18 (QSFP-DD Double Density 8X Pluggable Transceiver (INF-8628))
...
Module State : 0x03 (ModuleReady)
LowPwrAllowRequestHW : Off
LowPwrRequestSW : Off
The module is not in low power mode, as it is not forced by hardware
(LowPwrAllowRequestHW is off) or by software (LowPwrRequestSW is off).
The power mode can be queried from the kernel. In case
LowPwrAllowRequestHW was on, the kernel would need to take into account
the state of the LowPwrRequestHW signal, which is not visible to user
space.
$ ethtool --show-module swp11
Module parameters for swp11:
power-mode-policy high
power-mode high
Change the power mode policy to 'auto':
# ethtool --set-module swp11 power-mode-policy auto
Query the power mode again:
$ ethtool --show-module swp11
Module parameters for swp11:
power-mode-policy auto
power-mode low
Verify with the data read from the EEPROM:
# ethtool -m swp11
Identifier : 0x18 (QSFP-DD Double Density 8X Pluggable Transceiver (INF-8628))
...
Module State : 0x01 (ModuleLowPwr)
LowPwrAllowRequestHW : Off
LowPwrRequestSW : On
Put the associated port administratively up which will instruct the host
to transition the module to high power mode:
# ip link set dev swp11 up
Query the power mode again:
$ ethtool --show-module swp11
Module parameters for swp11:
power-mode-policy auto
power-mode high
Verify with the data read from the EEPROM:
# ethtool -m swp11
Identifier : 0x18 (QSFP-DD Double Density 8X Pluggable Transceiver (INF-8628))
...
Module State : 0x03 (ModuleReady)
LowPwrAllowRequestHW : Off
LowPwrRequestSW : Off
Put the associated port administratively down which will instruct the
host to transition the module to low power mode:
# ip link set dev swp11 down
Query the power mode again:
$ ethtool --show-module swp11
Module parameters for swp11:
power-mode-policy auto
power-mode low
Verify with the data read from the EEPROM:
# ethtool -m swp11
Identifier : 0x18 (QSFP-DD Double Density 8X Pluggable Transceiver (INF-8628))
...
Module State : 0x01 (ModuleLowPwr)
LowPwrAllowRequestHW : Off
LowPwrRequestSW : On
SFF-8636 testing
================
# ethtool -m swp13
Identifier : 0x11 (QSFP28)
...
Extended identifier description : 5.0W max. Power consumption, High Power Class (> 3.5 W) enabled
Power set : Off
Power override : On
...
Transmit avg optical power (Channel 1) : 0.7733 mW / -1.12 dBm
Transmit avg optical power (Channel 2) : 0.7649 mW / -1.16 dBm
Transmit avg optical power (Channel 3) : 0.7790 mW / -1.08 dBm
Transmit avg optical power (Channel 4) : 0.7837 mW / -1.06 dBm
Rcvr signal avg optical power(Channel 1) : 0.9302 mW / -0.31 dBm
Rcvr signal avg optical power(Channel 2) : 0.9079 mW / -0.42 dBm
Rcvr signal avg optical power(Channel 3) : 0.8993 mW / -0.46 dBm
Rcvr signal avg optical power(Channel 4) : 0.8778 mW / -0.57 dBm
The module is not in low power mode, as it is not forced by hardware
(Power override is on) or by software (Power set is off).
The power mode can be queried from the kernel. In case Power override
was off, the kernel would need to take into account the state of the
LPMode signal, which is not visible to user space.
$ ethtool --show-module swp13
Module parameters for swp13:
power-mode-policy high
power-mode high
Change the power mode policy to 'auto':
# ethtool --set-module swp13 power-mode-policy auto
Query the power mode again:
$ ethtool --show-module swp13
Module parameters for swp13:
power-mode-policy auto
power-mode low
Verify with the data read from the EEPROM:
# ethtool -m swp13
Identifier : 0x11 (QSFP28)
Extended identifier description : 5.0W max. Power consumption, High Power Class (> 3.5 W) not enabled
Power set : On
Power override : On
...
Transmit avg optical power (Channel 1) : 0.0000 mW / -inf dBm
Transmit avg optical power (Channel 2) : 0.0000 mW / -inf dBm
Transmit avg optical power (Channel 3) : 0.0000 mW / -inf dBm
Transmit avg optical power (Channel 4) : 0.0000 mW / -inf dBm
Rcvr signal avg optical power(Channel 1) : 0.0000 mW / -inf dBm
Rcvr signal avg optical power(Channel 2) : 0.0000 mW / -inf dBm
Rcvr signal avg optical power(Channel 3) : 0.0000 mW / -inf dBm
Rcvr signal avg optical power(Channel 4) : 0.0000 mW / -inf dBm
Put the associated port administratively up which will instruct the host
to transition the module to high power mode:
# ip link set dev swp13 up
Query the power mode again:
$ ethtool --show-module swp13
Module parameters for swp13:
power-mode-policy auto
power-mode high
Verify with the data read from the EEPROM:
# ethtool -m swp13
Identifier : 0x11 (QSFP28)
...
Extended identifier description : 5.0W max. Power consumption, High Power Class (> 3.5 W) enabled
Power set : Off
Power override : On
...
Transmit avg optical power (Channel 1) : 0.7934 mW / -1.01 dBm
Transmit avg optical power (Channel 2) : 0.7859 mW / -1.05 dBm
Transmit avg optical power (Channel 3) : 0.7885 mW / -1.03 dBm
Transmit avg optical power (Channel 4) : 0.7985 mW / -0.98 dBm
Rcvr signal avg optical power(Channel 1) : 0.9325 mW / -0.30 dBm
Rcvr signal avg optical power(Channel 2) : 0.9034 mW / -0.44 dBm
Rcvr signal avg optical power(Channel 3) : 0.9086 mW / -0.42 dBm
Rcvr signal avg optical power(Channel 4) : 0.8885 mW / -0.51 dBm
Put the associated port administratively down which will instruct the
host to transition the module to low power mode:
# ip link set dev swp13 down
Query the power mode again:
$ ethtool --show-module swp13
Module parameters for swp13:
power-mode-policy auto
power-mode low
Verify with the data read from the EEPROM:
# ethtool -m swp13
Identifier : 0x11 (QSFP28)
...
Extended identifier description : 5.0W max. Power consumption, High Power Class (> 3.5 W) not enabled
Power set : On
Power override : On
...
Transmit avg optical power (Channel 1) : 0.0000 mW / -inf dBm
Transmit avg optical power (Channel 2) : 0.0000 mW / -inf dBm
Transmit avg optical power (Channel 3) : 0.0000 mW / -inf dBm
Transmit avg optical power (Channel 4) : 0.0000 mW / -inf dBm
Rcvr signal avg optical power(Channel 1) : 0.0000 mW / -inf dBm
Rcvr signal avg optical power(Channel 2) : 0.0000 mW / -inf dBm
Rcvr signal avg optical power(Channel 3) : 0.0000 mW / -inf dBm
Rcvr signal avg optical power(Channel 4) : 0.0000 mW / -inf dBm
Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-10-06 13:46:42 +03:00
extern const struct ethnl_request_ops ethnl_module_request_ops ;
2022-10-03 08:52:00 +02:00
extern const struct ethnl_request_ops ethnl_pse_request_ops ;
2022-12-01 16:25:55 -08:00
extern const struct ethnl_request_ops ethnl_rss_request_ops ;
2023-01-09 17:59:39 +01:00
extern const struct ethnl_request_ops ethnl_plca_cfg_request_ops ;
extern const struct ethnl_request_ops ethnl_plca_status_request_ops ;
net: ethtool: add support for MAC Merge layer
The MAC merge sublayer (IEEE 802.3-2018 clause 99) is one of 2
specifications (the other being Frame Preemption; IEEE 802.1Q-2018
clause 6.7.2), which work together to minimize latency caused by frame
interference at TX. The overall goal of TSN is for normal traffic and
traffic with a bounded deadline to be able to cohabitate on the same L2
network and not bother each other too much.
The standards achieve this (partly) by introducing the concept of
preemptible traffic, i.e. Ethernet frames that have a custom value for
the Start-of-Frame-Delimiter (SFD), and these frames can be fragmented
and reassembled at L2 on a link-local basis. The non-preemptible frames
are called express traffic, they are transmitted using a normal SFD, and
they can preempt preemptible frames, therefore having lower latency,
which can matter at lower (100 Mbps) link speeds, or at high MTUs (jumbo
frames around 9K). Preemption is not recursive, i.e. a P frame cannot
preempt another P frame. Preemption also does not depend upon priority,
or otherwise said, an E frame with prio 0 will still preempt a P frame
with prio 7.
In terms of implementation, the standards talk about the presence of an
express MAC (eMAC) which handles express traffic, and a preemptible MAC
(pMAC) which handles preemptible traffic, and these MACs are multiplexed
on the same MII by a MAC merge layer.
To support frame preemption, the definition of the SFD was generalized
to SMD (Start-of-mPacket-Delimiter), where an mPacket is essentially an
Ethernet frame fragment, or a complete frame. Stations unaware of an SMD
value different from the standard SFD will treat P frames as error
frames. To prevent that from happening, a negotiation process is
defined.
On RX, packets are dispatched to the eMAC or pMAC after being filtered
by their SMD. On TX, the eMAC/pMAC classification decision is taken by
the 802.1Q spec, based on packet priority (each of the 8 user priority
values may have an admin-status of preemptible or express).
The MAC Merge layer and the Frame Preemption parameters have some degree
of independence in terms of how software stacks are supposed to deal
with them. The activation of the MM layer is supposed to be controlled
by an LLDP daemon (after it has been communicated that the link partner
also supports it), after which a (hardware-based or not) verification
handshake takes place, before actually enabling the feature. So the
process is intended to be relatively plug-and-play. Whereas FP settings
are supposed to be coordinated across a network using something
approximating NETCONF.
The support contained here is exclusively for the 802.3 (MAC Merge)
portions and not for the 802.1Q (Frame Preemption) parts. This API is
sufficient for an LLDP daemon to do its job. The FP adminStatus variable
from 802.1Q is outside the scope of an LLDP daemon.
I have taken a few creative licenses and augmented the Linux kernel UAPI
compared to the standard managed objects recommended by IEEE 802.3.
These are:
- ETHTOOL_A_MM_PMAC_ENABLED: According to Figure 99-6: Receive
Processing state diagram, a MAC Merge layer is always supposed to be
able to receive P frames. However, this implies keeping the pMAC
powered on, which will consume needless power in applications where FP
will never be used. If LLDP is used, the reception of an Additional
Ethernet Capabilities TLV from the link partner is sufficient
indication that the pMAC should be enabled. So my proposal is that in
Linux, we keep the pMAC turned off by default and that user space
turns it on when needed.
- ETHTOOL_A_MM_VERIFY_ENABLED: The IEEE managed object is called
aMACMergeVerifyDisableTx. I opted for consistency (positive logic) in
the boolean netlink attributes offered, so this is also positive here.
Other than the meaning being reversed, they correspond to the same
thing.
- ETHTOOL_A_MM_MAX_VERIFY_TIME: I found it most reasonable for an LLDP
daemon to maximize the verifyTime variable (delay between SMD-V
transmissions), to maximize its chances that the LP replies. IEEE says
that the verifyTime can range between 1 and 128 ms, but the NXP ENETC
stupidly keeps this variable in a 7 bit register, so the maximum
supported value is 127 ms. I could have chosen to hardcode this in the
LLDP daemon to a lower value, but why not let the kernel expose its
supported range directly.
- ETHTOOL_A_MM_TX_MIN_FRAG_SIZE: the standard managed object is called
aMACMergeAddFragSize, and expresses the "additional" fragment size
(on top of ETH_ZLEN), whereas this expresses the absolute value of the
fragment size.
- ETHTOOL_A_MM_RX_MIN_FRAG_SIZE: there doesn't appear to exist a managed
object mandated by the standard, but user space clearly needs to know
the minimum supported fragment size of our local receiver, since LLDP
must advertise a value no lower than that.
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2023-01-19 14:26:54 +02:00
/* Request ops instance for the "mm" request type; only declared here, the
 * definition lives in another translation unit.
 * NOTE(review): "mm" presumably stands for MAC Merge - confirm against the
 * defining source file.
 */
extern const struct ethnl_request_ops ethnl_mm_request_ops ;
2019-12-27 15:55:43 +01:00
2020-10-05 15:07:36 -07:00
/* Attribute policies for the request header nest. Both tables are bounded by
 * ETHTOOL_A_HEADER_FLAGS + 1, i.e. they can be indexed by header attribute
 * type up to and including ETHTOOL_A_HEADER_FLAGS.
 * NOTE(review): the _stats variant presumably differs only in which header
 * flags it accepts - confirm against the definitions.
 */
extern const struct nla_policy ethnl_header_policy [ ETHTOOL_A_HEADER_FLAGS + 1 ] ;
2020-10-05 15:07:39 -07:00
extern const struct nla_policy ethnl_header_policy_stats [ ETHTOOL_A_HEADER_FLAGS + 1 ] ;
2020-10-07 12:53:50 +02:00
/* Per-request attribute policy tables. They are only declared here (extern);
 * the definitions live in other translation units.
 *
 * Every table is bounded by "<attribute> + 1", so it can be indexed directly
 * by attribute type up to and including the attribute named in the bound.
 * Several *_get_policy tables are bounded by the *_HEADER attribute, while
 * the matching *_set_policy tables are bounded by a later attribute of the
 * same family.
 * NOTE(review): each bound here has to stay in sync with the corresponding
 * definition when a request gains a new highest attribute - verify when
 * extending a policy.
 */
extern const struct nla_policy ethnl_strset_get_policy [ ETHTOOL_A_STRSET_COUNTS_ONLY + 1 ] ;
2020-10-05 15:07:35 -07:00
extern const struct nla_policy ethnl_linkinfo_get_policy [ ETHTOOL_A_LINKINFO_HEADER + 1 ] ;
extern const struct nla_policy ethnl_linkinfo_set_policy [ ETHTOOL_A_LINKINFO_TP_MDIX_CTRL + 1 ] ;
extern const struct nla_policy ethnl_linkmodes_get_policy [ ETHTOOL_A_LINKMODES_HEADER + 1 ] ;
2021-02-02 20:06:06 +02:00
extern const struct nla_policy ethnl_linkmodes_set_policy [ ETHTOOL_A_LINKMODES_LANES + 1 ] ;
2020-10-05 15:07:35 -07:00
extern const struct nla_policy ethnl_linkstate_get_policy [ ETHTOOL_A_LINKSTATE_HEADER + 1 ] ;
extern const struct nla_policy ethnl_debug_get_policy [ ETHTOOL_A_DEBUG_HEADER + 1 ] ;
extern const struct nla_policy ethnl_debug_set_policy [ ETHTOOL_A_DEBUG_MSGMASK + 1 ] ;
extern const struct nla_policy ethnl_wol_get_policy [ ETHTOOL_A_WOL_HEADER + 1 ] ;
extern const struct nla_policy ethnl_wol_set_policy [ ETHTOOL_A_WOL_SOPASS + 1 ] ;
extern const struct nla_policy ethnl_features_get_policy [ ETHTOOL_A_FEATURES_HEADER + 1 ] ;
extern const struct nla_policy ethnl_features_set_policy [ ETHTOOL_A_FEATURES_WANTED + 1 ] ;
extern const struct nla_policy ethnl_privflags_get_policy [ ETHTOOL_A_PRIVFLAGS_HEADER + 1 ] ;
extern const struct nla_policy ethnl_privflags_set_policy [ ETHTOOL_A_PRIVFLAGS_FLAGS + 1 ] ;
extern const struct nla_policy ethnl_rings_get_policy [ ETHTOOL_A_RINGS_HEADER + 1 ] ;
2022-04-12 10:01:19 +08:00
extern const struct nla_policy ethnl_rings_set_policy [ ETHTOOL_A_RINGS_TX_PUSH + 1 ] ;
2020-10-05 15:07:35 -07:00
extern const struct nla_policy ethnl_channels_get_policy [ ETHTOOL_A_CHANNELS_HEADER + 1 ] ;
extern const struct nla_policy ethnl_channels_set_policy [ ETHTOOL_A_CHANNELS_COMBINED_COUNT + 1 ] ;
extern const struct nla_policy ethnl_coalesce_get_policy [ ETHTOOL_A_COALESCE_HEADER + 1 ] ;
2021-08-20 15:35:17 +08:00
extern const struct nla_policy ethnl_coalesce_set_policy [ ETHTOOL_A_COALESCE_MAX + 1 ] ;
2023-01-19 14:26:56 +02:00
/* Unlike most GET policies above, this one is bounded by a *_STATS_SRC
 * attribute rather than by the *_HEADER attribute.
 */
extern const struct nla_policy ethnl_pause_get_policy [ ETHTOOL_A_PAUSE_STATS_SRC + 1 ] ;
2020-10-05 15:07:35 -07:00
extern const struct nla_policy ethnl_pause_set_policy [ ETHTOOL_A_PAUSE_TX + 1 ] ;
extern const struct nla_policy ethnl_eee_get_policy [ ETHTOOL_A_EEE_HEADER + 1 ] ;
extern const struct nla_policy ethnl_eee_set_policy [ ETHTOOL_A_EEE_TX_LPI_TIMER + 1 ] ;
extern const struct nla_policy ethnl_tsinfo_get_policy [ ETHTOOL_A_TSINFO_HEADER + 1 ] ;
extern const struct nla_policy ethnl_cable_test_act_policy [ ETHTOOL_A_CABLE_TEST_HEADER + 1 ] ;
extern const struct nla_policy ethnl_cable_test_tdr_act_policy [ ETHTOOL_A_CABLE_TEST_TDR_CFG + 1 ] ;
extern const struct nla_policy ethnl_tunnel_info_get_policy [ ETHTOOL_A_TUNNEL_INFO_HEADER + 1 ] ;
2021-03-29 20:59:52 -07:00
extern const struct nla_policy ethnl_fec_get_policy [ ETHTOOL_A_FEC_HEADER + 1 ] ;
extern const struct nla_policy ethnl_fec_set_policy [ ETHTOOL_A_FEC_AUTO + 1 ] ;
2021-06-22 09:50:48 +03:00
extern const struct nla_policy ethnl_module_eeprom_get_policy [ ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS + 1 ] ;
2023-01-19 14:26:56 +02:00
extern const struct nla_policy ethnl_stats_get_policy [ ETHTOOL_A_STATS_SRC + 1 ] ;
2021-06-30 16:11:56 +08:00
extern const struct nla_policy ethnl_phc_vclocks_get_policy [ ETHTOOL_A_PHC_VCLOCKS_HEADER + 1 ] ;
ethtool: Add ability to control transceiver modules' power mode
Add a pair of new ethtool messages, 'ETHTOOL_MSG_MODULE_SET' and
'ETHTOOL_MSG_MODULE_GET', that can be used to control transceiver
module parameters and retrieve their status.
The first parameter to control is the power mode of the module. It is
only relevant for paged memory modules, as flat memory modules always
operate in low power mode.
When a paged memory module is in low power mode, its power consumption
is reduced to the minimum, the management interface towards the host is
available and the data path is deactivated.
User space can choose to put modules that are not currently in use in
low power mode and transition them to high power mode before putting the
associated ports administratively up. This is useful for user space that
favors reduced power consumption and lower temperatures over reduced
link up times. In QSFP-DD modules the transition from low power mode to
high power mode can take a few seconds and this transition is only
expected to get longer with future / more complex modules.
User space can control the power mode of the module via the power mode
policy attribute ('ETHTOOL_A_MODULE_POWER_MODE_POLICY'). Possible
values:
* high: Module is always in high power mode.
* auto: Module is transitioned by the host to high power mode when the
first port using it is put administratively up and to low power mode
when the last port using it is put administratively down.
The operational power mode of the module is available to user space via
the 'ETHTOOL_A_MODULE_POWER_MODE' attribute. The attribute is not
reported to user space when a module is not plugged-in.
The user API is designed to be generic enough so that it could be used
for modules with different memory maps (e.g., SFF-8636, CMIS).
The only implementation of the device driver API in this series is for a
MAC driver (mlxsw) where the module is controlled by the device's
firmware, but it is designed to be generic enough so that it could also
be used by implementations where the module is controlled by the CPU.
CMIS testing
============
# ethtool -m swp11
Identifier : 0x18 (QSFP-DD Double Density 8X Pluggable Transceiver (INF-8628))
...
Module State : 0x03 (ModuleReady)
LowPwrAllowRequestHW : Off
LowPwrRequestSW : Off
The module is not in low power mode, as it is not forced by hardware
(LowPwrAllowRequestHW is off) or by software (LowPwrRequestSW is off).
The power mode can be queried from the kernel. In case
LowPwrAllowRequestHW was on, the kernel would need to take into account
the state of the LowPwrRequestHW signal, which is not visible to user
space.
$ ethtool --show-module swp11
Module parameters for swp11:
power-mode-policy high
power-mode high
Change the power mode policy to 'auto':
# ethtool --set-module swp11 power-mode-policy auto
Query the power mode again:
$ ethtool --show-module swp11
Module parameters for swp11:
power-mode-policy auto
power-mode low
Verify with the data read from the EEPROM:
# ethtool -m swp11
Identifier : 0x18 (QSFP-DD Double Density 8X Pluggable Transceiver (INF-8628))
...
Module State : 0x01 (ModuleLowPwr)
LowPwrAllowRequestHW : Off
LowPwrRequestSW : On
Put the associated port administratively up which will instruct the host
to transition the module to high power mode:
# ip link set dev swp11 up
Query the power mode again:
$ ethtool --show-module swp11
Module parameters for swp11:
power-mode-policy auto
power-mode high
Verify with the data read from the EEPROM:
# ethtool -m swp11
Identifier : 0x18 (QSFP-DD Double Density 8X Pluggable Transceiver (INF-8628))
...
Module State : 0x03 (ModuleReady)
LowPwrAllowRequestHW : Off
LowPwrRequestSW : Off
Put the associated port administratively down which will instruct the
host to transition the module to low power mode:
# ip link set dev swp11 down
Query the power mode again:
$ ethtool --show-module swp11
Module parameters for swp11:
power-mode-policy auto
power-mode low
Verify with the data read from the EEPROM:
# ethtool -m swp11
Identifier : 0x18 (QSFP-DD Double Density 8X Pluggable Transceiver (INF-8628))
...
Module State : 0x01 (ModuleLowPwr)
LowPwrAllowRequestHW : Off
LowPwrRequestSW : On
SFF-8636 testing
================
# ethtool -m swp13
Identifier : 0x11 (QSFP28)
...
Extended identifier description : 5.0W max. Power consumption, High Power Class (> 3.5 W) enabled
Power set : Off
Power override : On
...
Transmit avg optical power (Channel 1) : 0.7733 mW / -1.12 dBm
Transmit avg optical power (Channel 2) : 0.7649 mW / -1.16 dBm
Transmit avg optical power (Channel 3) : 0.7790 mW / -1.08 dBm
Transmit avg optical power (Channel 4) : 0.7837 mW / -1.06 dBm
Rcvr signal avg optical power(Channel 1) : 0.9302 mW / -0.31 dBm
Rcvr signal avg optical power(Channel 2) : 0.9079 mW / -0.42 dBm
Rcvr signal avg optical power(Channel 3) : 0.8993 mW / -0.46 dBm
Rcvr signal avg optical power(Channel 4) : 0.8778 mW / -0.57 dBm
The module is not in low power mode, as it is not forced by hardware
(Power override is on) or by software (Power set is off).
The power mode can be queried from the kernel. In case Power override
was off, the kernel would need to take into account the state of the
LPMode signal, which is not visible to user space.
$ ethtool --show-module swp13
Module parameters for swp13:
power-mode-policy high
power-mode high
Change the power mode policy to 'auto':
# ethtool --set-module swp13 power-mode-policy auto
Query the power mode again:
$ ethtool --show-module swp13
Module parameters for swp13:
power-mode-policy auto
power-mode low
Verify with the data read from the EEPROM:
# ethtool -m swp13
Identifier : 0x11 (QSFP28)
Extended identifier description : 5.0W max. Power consumption, High Power Class (> 3.5 W) not enabled
Power set : On
Power override : On
...
Transmit avg optical power (Channel 1) : 0.0000 mW / -inf dBm
Transmit avg optical power (Channel 2) : 0.0000 mW / -inf dBm
Transmit avg optical power (Channel 3) : 0.0000 mW / -inf dBm
Transmit avg optical power (Channel 4) : 0.0000 mW / -inf dBm
Rcvr signal avg optical power(Channel 1) : 0.0000 mW / -inf dBm
Rcvr signal avg optical power(Channel 2) : 0.0000 mW / -inf dBm
Rcvr signal avg optical power(Channel 3) : 0.0000 mW / -inf dBm
Rcvr signal avg optical power(Channel 4) : 0.0000 mW / -inf dBm
Put the associated port administratively up which will instruct the host
to transition the module to high power mode:
# ip link set dev swp13 up
Query the power mode again:
$ ethtool --show-module swp13
Module parameters for swp13:
power-mode-policy auto
power-mode high
Verify with the data read from the EEPROM:
# ethtool -m swp13
Identifier : 0x11 (QSFP28)
...
Extended identifier description : 5.0W max. Power consumption, High Power Class (> 3.5 W) enabled
Power set : Off
Power override : On
...
Transmit avg optical power (Channel 1) : 0.7934 mW / -1.01 dBm
Transmit avg optical power (Channel 2) : 0.7859 mW / -1.05 dBm
Transmit avg optical power (Channel 3) : 0.7885 mW / -1.03 dBm
Transmit avg optical power (Channel 4) : 0.7985 mW / -0.98 dBm
Rcvr signal avg optical power(Channel 1) : 0.9325 mW / -0.30 dBm
Rcvr signal avg optical power(Channel 2) : 0.9034 mW / -0.44 dBm
Rcvr signal avg optical power(Channel 3) : 0.9086 mW / -0.42 dBm
Rcvr signal avg optical power(Channel 4) : 0.8885 mW / -0.51 dBm
Put the associated port administratively down which will instruct the
host to transition the module to low power mode:
# ip link set dev swp13 down
Query the power mode again:
$ ethtool --show-module swp13
Module parameters for swp13:
power-mode-policy auto
power-mode low
Verify with the data read from the EEPROM:
# ethtool -m swp13
Identifier : 0x11 (QSFP28)
...
Extended identifier description : 5.0W max. Power consumption, High Power Class (> 3.5 W) not enabled
Power set : On
Power override : On
...
Transmit avg optical power (Channel 1) : 0.0000 mW / -inf dBm
Transmit avg optical power (Channel 2) : 0.0000 mW / -inf dBm
Transmit avg optical power (Channel 3) : 0.0000 mW / -inf dBm
Transmit avg optical power (Channel 4) : 0.0000 mW / -inf dBm
Rcvr signal avg optical power(Channel 1) : 0.0000 mW / -inf dBm
Rcvr signal avg optical power(Channel 2) : 0.0000 mW / -inf dBm
Rcvr signal avg optical power(Channel 3) : 0.0000 mW / -inf dBm
Rcvr signal avg optical power(Channel 4) : 0.0000 mW / -inf dBm
Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-10-06 13:46:42 +03:00
/* Attribute policy tables for the module, PSE, RSS and PLCA requests; only
 * declared here, defined in other translation units. As with the other
 * policy tables in this header, each bound is "<attribute> + 1", so the table
 * is indexable by attribute type up to and including the named attribute.
 */
extern const struct nla_policy ethnl_module_get_policy [ ETHTOOL_A_MODULE_HEADER + 1 ] ;
extern const struct nla_policy ethnl_module_set_policy [ ETHTOOL_A_MODULE_POWER_MODE_POLICY + 1 ] ;
2022-10-03 08:52:00 +02:00
extern const struct nla_policy ethnl_pse_get_policy [ ETHTOOL_A_PSE_HEADER + 1 ] ;
extern const struct nla_policy ethnl_pse_set_policy [ ETHTOOL_A_PSE_MAX + 1 ] ;
2022-12-01 16:25:55 -08:00
extern const struct nla_policy ethnl_rss_get_policy [ ETHTOOL_A_RSS_CONTEXT + 1 ] ;
2023-01-09 17:59:39 +01:00
/* PLCA has two GET-side tables (cfg and status), both bounded by
 * ETHTOOL_A_PLCA_HEADER + 1, and one SET-side table for the configuration.
 */
extern const struct nla_policy ethnl_plca_get_cfg_policy [ ETHTOOL_A_PLCA_HEADER + 1 ] ;
extern const struct nla_policy ethnl_plca_set_cfg_policy [ ETHTOOL_A_PLCA_MAX + 1 ] ;
extern const struct nla_policy ethnl_plca_get_status_policy [ ETHTOOL_A_PLCA_HEADER + 1 ] ;
net: ethtool: add support for MAC Merge layer
The MAC merge sublayer (IEEE 802.3-2018 clause 99) is one of 2
specifications (the other being Frame Preemption; IEEE 802.1Q-2018
clause 6.7.2), which work together to minimize latency caused by frame
interference at TX. The overall goal of TSN is for normal traffic and
traffic with a bounded deadline to be able to cohabitate on the same L2
network and not bother each other too much.
The standards achieve this (partly) by introducing the concept of
preemptible traffic, i.e. Ethernet frames that have a custom value for
the Start-of-Frame-Delimiter (SFD), and these frames can be fragmented
and reassembled at L2 on a link-local basis. The non-preemptible frames
are called express traffic, they are transmitted using a normal SFD, and
they can preempt preemptible frames, therefore having lower latency,
which can matter at lower (100 Mbps) link speeds, or at high MTUs (jumbo
frames around 9K). Preemption is not recursive, i.e. a P frame cannot
preempt another P frame. Preemption also does not depend upon priority,
or otherwise said, an E frame with prio 0 will still preempt a P frame
with prio 7.
In terms of implementation, the standards talk about the presence of an
express MAC (eMAC) which handles express traffic, and a preemptible MAC
(pMAC) which handles preemptible traffic, and these MACs are multiplexed
on the same MII by a MAC merge layer.
To support frame preemption, the definition of the SFD was generalized
to SMD (Start-of-mPacket-Delimiter), where an mPacket is essentially an
Ethernet frame fragment, or a complete frame. Stations unaware of an SMD
value different from the standard SFD will treat P frames as error
frames. To prevent that from happening, a negotiation process is
defined.
On RX, packets are dispatched to the eMAC or pMAC after being filtered
by their SMD. On TX, the eMAC/pMAC classification decision is taken by
the 802.1Q spec, based on packet priority (each of the 8 user priority
values may have an admin-status of preemptible or express).
The MAC Merge layer and the Frame Preemption parameters have some degree
of independence in terms of how software stacks are supposed to deal
with them. The activation of the MM layer is supposed to be controlled
by an LLDP daemon (after it has been communicated that the link partner
also supports it), after which a (hardware-based or not) verification
handshake takes place, before actually enabling the feature. So the
process is intended to be relatively plug-and-play. Whereas FP settings
are supposed to be coordinated across a network using something
approximating NETCONF.
The support contained here is exclusively for the 802.3 (MAC Merge)
portions and not for the 802.1Q (Frame Preemption) parts. This API is
sufficient for an LLDP daemon to do its job. The FP adminStatus variable
from 802.1Q is outside the scope of an LLDP daemon.
I have taken a few creative licenses and augmented the Linux kernel UAPI
compared to the standard managed objects recommended by IEEE 802.3.
These are:
- ETHTOOL_A_MM_PMAC_ENABLED: According to Figure 99-6: Receive
Processing state diagram, a MAC Merge layer is always supposed to be
able to receive P frames. However, this implies keeping the pMAC
powered on, which will consume needless power in applications where FP
will never be used. If LLDP is used, the reception of an Additional
Ethernet Capabilities TLV from the link partner is sufficient
indication that the pMAC should be enabled. So my proposal is that in
Linux, we keep the pMAC turned off by default and that user space
turns it on when needed.
- ETHTOOL_A_MM_VERIFY_ENABLED: The IEEE managed object is called
aMACMergeVerifyDisableTx. I opted for consistency (positive logic) in
the boolean netlink attributes offered, so this is also positive here.
Other than the meaning being reversed, they correspond to the same
thing.
- ETHTOOL_A_MM_MAX_VERIFY_TIME: I found it most reasonable for an LLDP
daemon to maximize the verifyTime variable (delay between SMD-V
transmissions), to maximize its chances that the LP replies. IEEE says
that the verifyTime can range between 1 and 128 ms, but the NXP ENETC
stupidly keeps this variable in a 7 bit register, so the maximum
supported value is 127 ms. I could have chosen to hardcode this in the
LLDP daemon to a lower value, but why not let the kernel expose its
supported range directly.
- ETHTOOL_A_MM_TX_MIN_FRAG_SIZE: the standard managed object is called
aMACMergeAddFragSize, and expresses the "additional" fragment size
(on top of ETH_ZLEN), whereas this expresses the absolute value of the
fragment size.
- ETHTOOL_A_MM_RX_MIN_FRAG_SIZE: there doesn't appear to exist a managed
object mandated by the standard, but user space clearly needs to know
the minimum supported fragment size of our local receiver, since LLDP
must advertise a value no lower than that.
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2023-01-19 14:26:54 +02:00
/* Attribute policy tables for the "mm" GET and SET requests; declared here,
 * defined elsewhere. The GET table is bounded by ETHTOOL_A_MM_HEADER + 1 and
 * the SET table by ETHTOOL_A_MM_MAX + 1.
 */
extern const struct nla_policy ethnl_mm_get_policy [ ETHTOOL_A_MM_HEADER + 1 ] ;
extern const struct nla_policy ethnl_mm_set_policy [ ETHTOOL_A_MM_MAX + 1 ] ;
2020-10-05 15:07:33 -07:00
2019-12-27 15:55:53 +01:00
/* Handlers for SET and ACT requests, implemented in other translation units.
 * Apart from the tunnel info start/dumpit callbacks, each one takes the
 * request skb plus its genl_info and returns an int.
 * NOTE(review): presumably 0 on success and a negative error code on failure,
 * like the helpers documented earlier in this header - confirm against the
 * implementations.
 */
int ethnl_set_linkinfo ( struct sk_buff * skb , struct genl_info * info ) ;
2019-12-27 15:56:13 +01:00
int ethnl_set_linkmodes ( struct sk_buff * skb , struct genl_info * info ) ;
2020-01-26 23:11:07 +01:00
int ethnl_set_debug ( struct sk_buff * skb , struct genl_info * info ) ;
2020-01-26 23:11:16 +01:00
int ethnl_set_wol ( struct sk_buff * skb , struct genl_info * info ) ;
2020-03-12 21:07:58 +01:00
int ethnl_set_features ( struct sk_buff * skb , struct genl_info * info ) ;
2020-03-12 21:08:13 +01:00
int ethnl_set_privflags ( struct sk_buff * skb , struct genl_info * info ) ;
2020-03-12 21:08:28 +01:00
int ethnl_set_rings ( struct sk_buff * skb , struct genl_info * info ) ;
2020-03-12 21:08:43 +01:00
int ethnl_set_channels ( struct sk_buff * skb , struct genl_info * info ) ;
2020-03-28 00:01:13 +01:00
int ethnl_set_coalesce ( struct sk_buff * skb , struct genl_info * info ) ;
2020-03-28 00:01:28 +01:00
int ethnl_set_pause ( struct sk_buff * skb , struct genl_info * info ) ;
2020-03-28 00:01:43 +01:00
int ethnl_set_eee ( struct sk_buff * skb , struct genl_info * info ) ;
2020-05-10 21:12:33 +02:00
int ethnl_act_cable_test ( struct sk_buff * skb , struct genl_info * info ) ;
2020-05-27 00:21:38 +02:00
int ethnl_act_cable_test_tdr ( struct sk_buff * skb , struct genl_info * info ) ;
2020-07-09 17:42:47 -07:00
/* The tunnel info request declares its own doit/start/dumpit entry points;
 * start and dumpit take a netlink_callback instead of a genl_info.
 */
int ethnl_tunnel_info_doit ( struct sk_buff * skb , struct genl_info * info ) ;
int ethnl_tunnel_info_start ( struct netlink_callback * cb ) ;
int ethnl_tunnel_info_dumpit ( struct sk_buff * skb , struct netlink_callback * cb ) ;
2021-03-29 20:59:52 -07:00
int ethnl_set_fec ( struct sk_buff * skb , struct genl_info * info ) ;
ethtool: Add ability to control transceiver modules' power mode
Add a pair of new ethtool messages, 'ETHTOOL_MSG_MODULE_SET' and
'ETHTOOL_MSG_MODULE_GET', that can be used to control transceiver
module parameters and retrieve their status.
The first parameter to control is the power mode of the module. It is
only relevant for paged memory modules, as flat memory modules always
operate in low power mode.
When a paged memory module is in low power mode, its power consumption
is reduced to the minimum, the management interface towards the host is
available and the data path is deactivated.
User space can choose to put modules that are not currently in use in
low power mode and transition them to high power mode before putting the
associated ports administratively up. This is useful for user space that
favors reduced power consumption and lower temperatures over reduced
link up times. In QSFP-DD modules the transition from low power mode to
high power mode can take a few seconds and this transition is only
expected to get longer with future / more complex modules.
User space can control the power mode of the module via the power mode
policy attribute ('ETHTOOL_A_MODULE_POWER_MODE_POLICY'). Possible
values:
* high: Module is always in high power mode.
* auto: Module is transitioned by the host to high power mode when the
first port using it is put administratively up and to low power mode
when the last port using it is put administratively down.
The operational power mode of the module is available to user space via
the 'ETHTOOL_A_MODULE_POWER_MODE' attribute. The attribute is not
reported to user space when a module is not plugged-in.
The user API is designed to be generic enough so that it could be used
for modules with different memory maps (e.g., SFF-8636, CMIS).
The only implementation of the device driver API in this series is for a
MAC driver (mlxsw) where the module is controlled by the device's
firmware, but it is designed to be generic enough so that it could also
be used by implementations where the module is controlled by the CPU.
CMIS testing
============
# ethtool -m swp11
Identifier : 0x18 (QSFP-DD Double Density 8X Pluggable Transceiver (INF-8628))
...
Module State : 0x03 (ModuleReady)
LowPwrAllowRequestHW : Off
LowPwrRequestSW : Off
The module is not in low power mode, as it is not forced by hardware
(LowPwrAllowRequestHW is off) or by software (LowPwrRequestSW is off).
The power mode can be queried from the kernel. In case
LowPwrAllowRequestHW was on, the kernel would need to take into account
the state of the LowPwrRequestHW signal, which is not visible to user
space.
$ ethtool --show-module swp11
Module parameters for swp11:
power-mode-policy high
power-mode high
Change the power mode policy to 'auto':
# ethtool --set-module swp11 power-mode-policy auto
Query the power mode again:
$ ethtool --show-module swp11
Module parameters for swp11:
power-mode-policy auto
power-mode low
Verify with the data read from the EEPROM:
# ethtool -m swp11
Identifier : 0x18 (QSFP-DD Double Density 8X Pluggable Transceiver (INF-8628))
...
Module State : 0x01 (ModuleLowPwr)
LowPwrAllowRequestHW : Off
LowPwrRequestSW : On
Put the associated port administratively up which will instruct the host
to transition the module to high power mode:
# ip link set dev swp11 up
Query the power mode again:
$ ethtool --show-module swp11
Module parameters for swp11:
power-mode-policy auto
power-mode high
Verify with the data read from the EEPROM:
# ethtool -m swp11
Identifier : 0x18 (QSFP-DD Double Density 8X Pluggable Transceiver (INF-8628))
...
Module State : 0x03 (ModuleReady)
LowPwrAllowRequestHW : Off
LowPwrRequestSW : Off
Put the associated port administratively down which will instruct the
host to transition the module to low power mode:
# ip link set dev swp11 down
Query the power mode again:
$ ethtool --show-module swp11
Module parameters for swp11:
power-mode-policy auto
power-mode low
Verify with the data read from the EEPROM:
# ethtool -m swp11
Identifier : 0x18 (QSFP-DD Double Density 8X Pluggable Transceiver (INF-8628))
...
Module State : 0x01 (ModuleLowPwr)
LowPwrAllowRequestHW : Off
LowPwrRequestSW : On
SFF-8636 testing
================
# ethtool -m swp13
Identifier : 0x11 (QSFP28)
...
Extended identifier description : 5.0W max. Power consumption, High Power Class (> 3.5 W) enabled
Power set : Off
Power override : On
...
Transmit avg optical power (Channel 1) : 0.7733 mW / -1.12 dBm
Transmit avg optical power (Channel 2) : 0.7649 mW / -1.16 dBm
Transmit avg optical power (Channel 3) : 0.7790 mW / -1.08 dBm
Transmit avg optical power (Channel 4) : 0.7837 mW / -1.06 dBm
Rcvr signal avg optical power(Channel 1) : 0.9302 mW / -0.31 dBm
Rcvr signal avg optical power(Channel 2) : 0.9079 mW / -0.42 dBm
Rcvr signal avg optical power(Channel 3) : 0.8993 mW / -0.46 dBm
Rcvr signal avg optical power(Channel 4) : 0.8778 mW / -0.57 dBm
The module is not in low power mode, as it is not forced by hardware
(Power override is on) or by software (Power set is off).
The power mode can be queried from the kernel. In case Power override
was off, the kernel would need to take into account the state of the
LPMode signal, which is not visible to user space.
$ ethtool --show-module swp13
Module parameters for swp13:
power-mode-policy high
power-mode high
Change the power mode policy to 'auto':
# ethtool --set-module swp13 power-mode-policy auto
Query the power mode again:
$ ethtool --show-module swp13
Module parameters for swp13:
power-mode-policy auto
power-mode low
Verify with the data read from the EEPROM:
# ethtool -m swp13
Identifier : 0x11 (QSFP28)
Extended identifier description : 5.0W max. Power consumption, High Power Class (> 3.5 W) not enabled
Power set : On
Power override : On
...
Transmit avg optical power (Channel 1) : 0.0000 mW / -inf dBm
Transmit avg optical power (Channel 2) : 0.0000 mW / -inf dBm
Transmit avg optical power (Channel 3) : 0.0000 mW / -inf dBm
Transmit avg optical power (Channel 4) : 0.0000 mW / -inf dBm
Rcvr signal avg optical power(Channel 1) : 0.0000 mW / -inf dBm
Rcvr signal avg optical power(Channel 2) : 0.0000 mW / -inf dBm
Rcvr signal avg optical power(Channel 3) : 0.0000 mW / -inf dBm
Rcvr signal avg optical power(Channel 4) : 0.0000 mW / -inf dBm
Put the associated port administratively up which will instruct the host
to transition the module to high power mode:
# ip link set dev swp13 up
Query the power mode again:
$ ethtool --show-module swp13
Module parameters for swp13:
power-mode-policy auto
power-mode high
Verify with the data read from the EEPROM:
# ethtool -m swp13
Identifier : 0x11 (QSFP28)
...
Extended identifier description : 5.0W max. Power consumption, High Power Class (> 3.5 W) enabled
Power set : Off
Power override : On
...
Transmit avg optical power (Channel 1) : 0.7934 mW / -1.01 dBm
Transmit avg optical power (Channel 2) : 0.7859 mW / -1.05 dBm
Transmit avg optical power (Channel 3) : 0.7885 mW / -1.03 dBm
Transmit avg optical power (Channel 4) : 0.7985 mW / -0.98 dBm
Rcvr signal avg optical power(Channel 1) : 0.9325 mW / -0.30 dBm
Rcvr signal avg optical power(Channel 2) : 0.9034 mW / -0.44 dBm
Rcvr signal avg optical power(Channel 3) : 0.9086 mW / -0.42 dBm
Rcvr signal avg optical power(Channel 4) : 0.8885 mW / -0.51 dBm
Put the associated port administratively down which will instruct the
host to transition the module to low power mode:
# ip link set dev swp13 down
Query the power mode again:
$ ethtool --show-module swp13
Module parameters for swp13:
power-mode-policy auto
power-mode low
Verify with the data read from the EEPROM:
# ethtool -m swp13
Identifier : 0x11 (QSFP28)
...
Extended identifier description : 5.0W max. Power consumption, High Power Class (> 3.5 W) not enabled
Power set : On
Power override : On
...
Transmit avg optical power (Channel 1) : 0.0000 mW / -inf dBm
Transmit avg optical power (Channel 2) : 0.0000 mW / -inf dBm
Transmit avg optical power (Channel 3) : 0.0000 mW / -inf dBm
Transmit avg optical power (Channel 4) : 0.0000 mW / -inf dBm
Rcvr signal avg optical power(Channel 1) : 0.0000 mW / -inf dBm
Rcvr signal avg optical power(Channel 2) : 0.0000 mW / -inf dBm
Rcvr signal avg optical power(Channel 3) : 0.0000 mW / -inf dBm
Rcvr signal avg optical power(Channel 4) : 0.0000 mW / -inf dBm
Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-10-06 13:46:42 +03:00
int ethnl_set_module ( struct sk_buff * skb , struct genl_info * info ) ;
2022-10-03 08:52:00 +02:00
int ethnl_set_pse ( struct sk_buff * skb , struct genl_info * info ) ;
2023-01-09 17:59:39 +01:00
/*
 * ethnl_set_plca_cfg() - declared here, defined elsewhere.
 * Takes a socket buffer and the generic netlink request info; returns int.
 * NOTE(review): presumably the genetlink "set" request handler for PLCA
 * configuration, returning 0 on success or a negative error code - confirm
 * against the definition.
 */
int ethnl_set_plca_cfg ( struct sk_buff * skb , struct genl_info * info ) ;
net: ethtool: add support for MAC Merge layer
The MAC merge sublayer (IEEE 802.3-2018 clause 99) is one of 2
specifications (the other being Frame Preemption; IEEE 802.1Q-2018
clause 6.7.2), which work together to minimize latency caused by frame
interference at TX. The overall goal of TSN is for normal traffic and
traffic with a bounded deadline to be able to cohabitate on the same L2
network and not bother each other too much.
The standards achieve this (partly) by introducing the concept of
preemptible traffic, i.e. Ethernet frames that have a custom value for
the Start-of-Frame-Delimiter (SFD), and these frames can be fragmented
and reassembled at L2 on a link-local basis. The non-preemptible frames
are called express traffic, they are transmitted using a normal SFD, and
they can preempt preemptible frames, therefore having lower latency,
which can matter at lower (100 Mbps) link speeds, or at high MTUs (jumbo
frames around 9K). Preemption is not recursive, i.e. a P frame cannot
preempt another P frame. Preemption also does not depend upon priority,
or otherwise said, an E frame with prio 0 will still preempt a P frame
with prio 7.
In terms of implementation, the standards talk about the presence of an
express MAC (eMAC) which handles express traffic, and a preemptible MAC
(pMAC) which handles preemptible traffic, and these MACs are multiplexed
on the same MII by a MAC merge layer.
To support frame preemption, the definition of the SFD was generalized
to SMD (Start-of-mPacket-Delimiter), where an mPacket is essentially an
Ethernet frame fragment, or a complete frame. Stations unaware of an SMD
value different from the standard SFD will treat P frames as error
frames. To prevent that from happening, a negotiation process is
defined.
On RX, packets are dispatched to the eMAC or pMAC after being filtered
by their SMD. On TX, the eMAC/pMAC classification decision is taken by
the 802.1Q spec, based on packet priority (each of the 8 user priority
values may have an admin-status of preemptible or express).
The MAC Merge layer and the Frame Preemption parameters have some degree
of independence in terms of how software stacks are supposed to deal
with them. The activation of the MM layer is supposed to be controlled
by an LLDP daemon (after it has been communicated that the link partner
also supports it), after which a (hardware-based or not) verification
handshake takes place, before actually enabling the feature. So the
process is intended to be relatively plug-and-play. Whereas FP settings
are supposed to be coordinated across a network using something
approximating NETCONF.
The support contained here is exclusively for the 802.3 (MAC Merge)
portions and not for the 802.1Q (Frame Preemption) parts. This API is
sufficient for an LLDP daemon to do its job. The FP adminStatus variable
from 802.1Q is outside the scope of an LLDP daemon.
I have taken a few creative licenses and augmented the Linux kernel UAPI
compared to the standard managed objects recommended by IEEE 802.3.
These are:
- ETHTOOL_A_MM_PMAC_ENABLED: According to Figure 99-6: Receive
Processing state diagram, a MAC Merge layer is always supposed to be
able to receive P frames. However, this implies keeping the pMAC
powered on, which will consume needless power in applications where FP
will never be used. If LLDP is used, the reception of an Additional
Ethernet Capabilities TLV from the link partner is sufficient
indication that the pMAC should be enabled. So my proposal is that in
Linux, we keep the pMAC turned off by default and that user space
turns it on when needed.
- ETHTOOL_A_MM_VERIFY_ENABLED: The IEEE managed object is called
aMACMergeVerifyDisableTx. I opted for consistency (positive logic) in
the boolean netlink attributes offered, so this is also positive here.
Other than the meaning being reversed, they correspond to the same
thing.
- ETHTOOL_A_MM_MAX_VERIFY_TIME: I found it most reasonable for an LLDP
daemon to maximize the verifyTime variable (delay between SMD-V
transmissions), to maximize its chances that the LP replies. IEEE says
that the verifyTime can range between 1 and 128 ms, but the NXP ENETC
stupidly keeps this variable in a 7 bit register, so the maximum
supported value is 127 ms. I could have chosen to hardcode this in the
LLDP daemon to a lower value, but why not let the kernel expose its
supported range directly.
- ETHTOOL_A_MM_TX_MIN_FRAG_SIZE: the standard managed object is called
aMACMergeAddFragSize, and expresses the "additional" fragment size
(on top of ETH_ZLEN), whereas this expresses the absolute value of the
fragment size.
- ETHTOOL_A_MM_RX_MIN_FRAG_SIZE: there doesn't appear to exist a managed
object mandated by the standard, but user space clearly needs to know
the minimum supported fragment size of our local receiver,
since LLDP must advertise a value no lower than that.
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2023-01-19 14:26:54 +02:00
/*
 * ethnl_set_mm() - declared here, defined elsewhere.
 * Takes a socket buffer and the generic netlink request info; returns int.
 * NOTE(review): presumably the genetlink "set" request handler for the
 * MAC Merge layer (ETHTOOL_A_MM_* attributes), returning 0 on success or a
 * negative error code - confirm against the definition.
 */
int ethnl_set_mm ( struct sk_buff * skb , struct genl_info * info ) ;
2019-12-27 15:55:53 +01:00
2021-04-16 12:27:39 -07:00
/*
 * Read-only string tables, defined in another translation unit.
 * Each table is an array of fixed-width ETH_GSTRING_LEN-byte slots:
 * stats_std_names has __ETHTOOL_STATS_CNT slots and stats_eth_phy_names has
 * __ETHTOOL_A_STATS_ETH_PHY_CNT slots.
 * NOTE(review): presumably indexed by the corresponding stats group /
 * attribute id; a name that fills its whole slot would not be
 * NUL-terminated - confirm against the definitions.
 */
extern const char stats_std_names [ __ETHTOOL_STATS_CNT ] [ ETH_GSTRING_LEN ] ;
extern const char stats_eth_phy_names [ __ETHTOOL_A_STATS_ETH_PHY_CNT ] [ ETH_GSTRING_LEN ] ;
2021-04-16 12:27:40 -07:00
/*
 * Read-only string table defined in another translation unit:
 * __ETHTOOL_A_STATS_ETH_MAC_CNT slots of ETH_GSTRING_LEN bytes each.
 * NOTE(review): presumably indexed by ETHTOOL_A_STATS_ETH_MAC_* attribute
 * id - confirm against the definition.
 */
extern const char stats_eth_mac_names [ __ETHTOOL_A_STATS_ETH_MAC_CNT ] [ ETH_GSTRING_LEN ] ;
2021-04-16 12:27:41 -07:00
/*
 * Read-only string table defined in another translation unit:
 * __ETHTOOL_A_STATS_ETH_CTRL_CNT slots of ETH_GSTRING_LEN bytes each.
 * NOTE(review): presumably indexed by ETHTOOL_A_STATS_ETH_CTRL_* attribute
 * id - confirm against the definition.
 */
extern const char stats_eth_ctrl_names [ __ETHTOOL_A_STATS_ETH_CTRL_CNT ] [ ETH_GSTRING_LEN ] ;
2021-04-16 12:27:42 -07:00
/*
 * Read-only string table defined in another translation unit:
 * __ETHTOOL_A_STATS_RMON_CNT slots of ETH_GSTRING_LEN bytes each.
 * NOTE(review): presumably indexed by ETHTOOL_A_STATS_RMON_* attribute id -
 * confirm against the definition.
 */
extern const char stats_rmon_names [ __ETHTOOL_A_STATS_RMON_CNT ] [ ETH_GSTRING_LEN ] ;
2021-04-16 12:27:39 -07:00
2019-12-27 15:55:18 +01:00
# endif /* _NET_ETHTOOL_NETLINK_H */