mlxsw: core: Introduce fw_fatal health reporter
author Jiri Pirko <jiri@nvidia.com>
Tue, 15 Sep 2020 08:40:58 +0000 (11:40 +0300)
committer David S. Miller <davem@davemloft.net>
Tue, 15 Sep 2020 22:57:16 +0000 (15:57 -0700)
Introduce devlink health reporter to report FW fatal events. Implement
the event listener using MFDE trap and enable the events to be
propagated using MFGD register configuration.

Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
drivers/net/ethernet/mellanox/mlxsw/core.c
drivers/net/ethernet/mellanox/mlxsw/core.h
drivers/net/ethernet/mellanox/mlxsw/reg.h
drivers/net/ethernet/mellanox/mlxsw/spectrum.c
drivers/net/ethernet/mellanox/mlxsw/trap.h

index 7b5939e068d1f88d4a96b18d2b881ab5f23ca24c..1bb21fe295b9ef36b91c6bff7d7ad1df217ca8f4 100644 (file)
@@ -84,6 +84,9 @@ struct mlxsw_core {
        struct mlxsw_core_port *ports;
        unsigned int max_ports;
        bool fw_flash_in_progress;
+       struct {
+               struct devlink_health_reporter *fw_fatal;
+       } health;
        unsigned long driver_priv[];
        /* driver_priv has to be always the last item */
 };
@@ -1612,6 +1615,236 @@ static void mlxsw_core_params_unregister(struct mlxsw_core *mlxsw_core)
                mlxsw_core->driver->params_unregister(mlxsw_core);
 }
 
+struct mlxsw_core_health_event { /* one received MFDE event, passed from the listener to the work item */
+       struct mlxsw_core *mlxsw_core; /* core instance whose fw_fatal reporter receives the report */
+       char mfde_pl[MLXSW_REG_MFDE_LEN]; /* private copy of the MFDE register payload */
+       struct work_struct work; /* initialized to run mlxsw_core_health_event_work() */
+};
+
+static void mlxsw_core_health_event_work(struct work_struct *work)
+{ /* Work handler: report one queued MFDE event to the fw_fatal reporter, then free the event. */
+       struct mlxsw_core_health_event *event;
+       struct mlxsw_core *mlxsw_core;
+
+       event = container_of(work, struct mlxsw_core_health_event, work); /* recover the enclosing event */
+       mlxsw_core = event->mlxsw_core;
+       devlink_health_report(mlxsw_core->health.fw_fatal, "FW fatal event occurred",
+                             event->mfde_pl); /* payload is passed as the report context */
+       kfree(event); /* allocated in mlxsw_core_health_listener_func() */
+}
+
+static void mlxsw_core_health_listener_func(const struct mlxsw_reg_info *reg,
+                                           char *mfde_pl, void *priv)
+{ /* MFDE event listener: copy the payload and defer the report to a work item. */
+       struct mlxsw_core_health_event *event;
+       struct mlxsw_core *mlxsw_core = priv; /* priv is the mlxsw_core passed at trap registration */
+
+       event = kmalloc(sizeof(*event), GFP_ATOMIC); /* non-sleeping allocation */
+       if (!event)
+               return; /* the event is silently dropped when the allocation fails */
+       event->mlxsw_core = mlxsw_core;
+       memcpy(event->mfde_pl, mfde_pl, sizeof(event->mfde_pl)); /* own copy for the deferred work */
+       INIT_WORK(&event->work, mlxsw_core_health_event_work);
+       mlxsw_core_schedule_work(&event->work); /* the work item frees the event */
+}
+
+static const struct mlxsw_listener mlxsw_core_health_listener = /* routes MFDE events to the listener func */
+       MLXSW_EVENTL(mlxsw_core_health_listener_func, MFDE, MFDE); /* NOTE(review): second MFDE is presumably the trap group - confirm */
+
+static int mlxsw_core_health_fw_fatal_dump(struct devlink_health_reporter *reporter,
+                                          struct devlink_fmsg *fmsg, void *priv_ctx,
+                                          struct netlink_ext_ack *extack)
+{ /* Dump callback: decode the MFDE payload in priv_ctx into fmsg; returns 0 or the first error. */
+       char *mfde_pl = priv_ctx; /* MFDE payload supplied with the report */
+       char *val_str;
+       u8 event_id;
+       u32 val;
+       int err;
+
+       if (!priv_ctx)
+               /* User-triggered dumps are not possible */
+               return -EOPNOTSUPP;
+
+       val = mlxsw_reg_mfde_irisc_id_get(mfde_pl);
+       err = devlink_fmsg_u8_pair_put(fmsg, "irisc_id", val);
+       if (err)
+               return err;
+       err = devlink_fmsg_arr_pair_nest_start(fmsg, "event"); /* "event" nest: id plus optional description */
+       if (err)
+               return err;
+
+       event_id = mlxsw_reg_mfde_event_id_get(mfde_pl);
+       err = devlink_fmsg_u8_pair_put(fmsg, "id", event_id);
+       if (err)
+               return err;
+       switch (event_id) {
+       case MLXSW_REG_MFDE_EVENT_ID_CRSPACE_TO:
+               val_str = "CR space timeout";
+               break;
+       case MLXSW_REG_MFDE_EVENT_ID_KVD_IM_STOP:
+               val_str = "KVD insertion machine stopped";
+               break;
+       default:
+               val_str = NULL; /* unknown event id: no "desc" is emitted */
+       }
+       if (val_str) {
+               err = devlink_fmsg_string_pair_put(fmsg, "desc", val_str);
+               if (err)
+                       return err;
+       }
+       err = devlink_fmsg_arr_pair_nest_end(fmsg);
+       if (err)
+               return err;
+
+       val = mlxsw_reg_mfde_method_get(mfde_pl);
+       switch (val) {
+       case MLXSW_REG_MFDE_METHOD_QUERY:
+               val_str = "query";
+               break;
+       case MLXSW_REG_MFDE_METHOD_WRITE:
+               val_str = "write";
+               break;
+       default:
+               val_str = NULL; /* unknown method: "method" is omitted */
+       }
+       if (val_str) {
+               err = devlink_fmsg_string_pair_put(fmsg, "method", val_str);
+               if (err)
+                       return err;
+       }
+
+       val = mlxsw_reg_mfde_long_process_get(mfde_pl);
+       err = devlink_fmsg_bool_pair_put(fmsg, "long_process", val);
+       if (err)
+               return err;
+
+       val = mlxsw_reg_mfde_command_type_get(mfde_pl);
+       switch (val) {
+       case MLXSW_REG_MFDE_COMMAND_TYPE_MAD:
+               val_str = "mad";
+               break;
+       case MLXSW_REG_MFDE_COMMAND_TYPE_EMAD:
+               val_str = "emad";
+               break;
+       case MLXSW_REG_MFDE_COMMAND_TYPE_CMDIF:
+               val_str = "cmdif";
+               break;
+       default:
+               val_str = NULL; /* unknown command type: "command_type" is omitted */
+       }
+       if (val_str) {
+               err = devlink_fmsg_string_pair_put(fmsg, "command_type", val_str);
+               if (err)
+                       return err;
+       }
+
+       val = mlxsw_reg_mfde_reg_attr_id_get(mfde_pl);
+       err = devlink_fmsg_u32_pair_put(fmsg, "reg_attr_id", val);
+       if (err)
+               return err;
+
+       if (event_id == MLXSW_REG_MFDE_EVENT_ID_CRSPACE_TO) { /* fields emitted only for CR space timeout */
+               val = mlxsw_reg_mfde_log_address_get(mfde_pl);
+               err = devlink_fmsg_u32_pair_put(fmsg, "log_address", val);
+               if (err)
+                       return err;
+               val = mlxsw_reg_mfde_log_id_get(mfde_pl);
+               err = devlink_fmsg_u8_pair_put(fmsg, "log_irisc_id", val);
+               if (err)
+                       return err;
+       } else if (event_id == MLXSW_REG_MFDE_EVENT_ID_KVD_IM_STOP) { /* field emitted only for KVD IM stop */
+               val = mlxsw_reg_mfde_pipes_mask_get(mfde_pl);
+               err = devlink_fmsg_u32_pair_put(fmsg, "pipes_mask", val);
+               if (err)
+                       return err;
+       }
+
+       return 0;
+}
+
+static int
+mlxsw_core_health_fw_fatal_test(struct devlink_health_reporter *reporter,
+                               struct netlink_ext_ack *extack)
+{ /* Test callback: set the MFGD trigger_test field; returns 0 or the register access error. */
+       struct mlxsw_core *mlxsw_core = devlink_health_reporter_priv(reporter);
+       char mfgd_pl[MLXSW_REG_MFGD_LEN];
+       int err;
+
+       /* Read the register first to make sure no other bits are changed. */
+       err = mlxsw_reg_query(mlxsw_core, MLXSW_REG(mfgd), mfgd_pl);
+       if (err)
+               return err;
+       mlxsw_reg_mfgd_trigger_test_set(mfgd_pl, true);
+       return mlxsw_reg_write(mlxsw_core, MLXSW_REG(mfgd), mfgd_pl); /* write back the modified payload */
+}
+
+static const struct devlink_health_reporter_ops
+mlxsw_core_health_fw_fatal_ops = { /* callbacks of the "fw_fatal" reporter; only dump and test are set */
+       .name = "fw_fatal",
+       .dump = mlxsw_core_health_fw_fatal_dump,
+       .test = mlxsw_core_health_fw_fatal_test,
+};
+
+static int mlxsw_core_health_fw_fatal_config(struct mlxsw_core *mlxsw_core,
+                                            bool enable)
+{ /* Set the MFGD fatal_event_mode field to 'enable'; returns 0 or the register access error. */
+       char mfgd_pl[MLXSW_REG_MFGD_LEN];
+       int err;
+
+       /* Read the register first to make sure no other bits are changed. */
+       err = mlxsw_reg_query(mlxsw_core, MLXSW_REG(mfgd), mfgd_pl);
+       if (err)
+               return err;
+       mlxsw_reg_mfgd_fatal_event_mode_set(mfgd_pl, enable);
+       return mlxsw_reg_write(mlxsw_core, MLXSW_REG(mfgd), mfgd_pl); /* write back the modified payload */
+}
+
+static int mlxsw_core_health_init(struct mlxsw_core *mlxsw_core)
+{ /* Create the fw_fatal reporter, register the MFDE listener and enable FW fatal events. */
+       struct devlink *devlink = priv_to_devlink(mlxsw_core);
+       struct devlink_health_reporter *fw_fatal;
+       int err;
+
+       if (!mlxsw_core->driver->fw_fatal_enabled) /* driver did not opt in: nothing to set up */
+               return 0;
+
+       fw_fatal = devlink_health_reporter_create(devlink, &mlxsw_core_health_fw_fatal_ops,
+                                                 0, mlxsw_core);
+       if (IS_ERR(fw_fatal)) {
+               dev_err(mlxsw_core->bus_info->dev, "Failed to create fw fatal reporter\n");
+               return PTR_ERR(fw_fatal);
+       }
+       mlxsw_core->health.fw_fatal = fw_fatal; /* read later by the event work item */
+
+       err = mlxsw_core_trap_register(mlxsw_core, &mlxsw_core_health_listener, mlxsw_core);
+       if (err)
+               goto err_trap_register;
+
+       err = mlxsw_core_health_fw_fatal_config(mlxsw_core, true); /* enable only once the listener is in place */
+       if (err)
+               goto err_fw_fatal_config;
+
+       return 0;
+
+err_fw_fatal_config: /* unwind in reverse order of setup */
+       mlxsw_core_trap_unregister(mlxsw_core, &mlxsw_core_health_listener, mlxsw_core);
+err_trap_register:
+       devlink_health_reporter_destroy(mlxsw_core->health.fw_fatal);
+       return err;
+}
+
+static void mlxsw_core_health_fini(struct mlxsw_core *mlxsw_core)
+{ /* Tear down what mlxsw_core_health_init() set up, in reverse order. */
+       if (!mlxsw_core->driver->fw_fatal_enabled) /* init was a no-op in this case */
+               return;
+
+       mlxsw_core_health_fw_fatal_config(mlxsw_core, false); /* return value is not checked here */
+       mlxsw_core_trap_unregister(mlxsw_core, &mlxsw_core_health_listener, mlxsw_core);
+       /* Make sure there is no more event work scheduled */
+       mlxsw_core_flush_owq();
+       devlink_health_reporter_destroy(mlxsw_core->health.fw_fatal);
+}
+
 static int
 __mlxsw_core_bus_device_register(const struct mlxsw_bus_info *mlxsw_bus_info,
                                 const struct mlxsw_bus *mlxsw_bus,
@@ -1695,6 +1928,10 @@ __mlxsw_core_bus_device_register(const struct mlxsw_bus_info *mlxsw_bus_info,
        if (err)
                goto err_fw_rev_validate;
 
+       err = mlxsw_core_health_init(mlxsw_core);
+       if (err)
+               goto err_health_init;
+
        if (mlxsw_driver->init) {
                err = mlxsw_driver->init(mlxsw_core, mlxsw_bus_info, extack);
                if (err)
@@ -1723,6 +1960,8 @@ err_hwmon_init:
        if (mlxsw_core->driver->fini)
                mlxsw_core->driver->fini(mlxsw_core);
 err_driver_init:
+       mlxsw_core_health_fini(mlxsw_core);
+err_health_init:
 err_fw_rev_validate:
        if (!reload)
                mlxsw_core_params_unregister(mlxsw_core);
@@ -1795,6 +2034,7 @@ void mlxsw_core_bus_device_unregister(struct mlxsw_core *mlxsw_core,
        mlxsw_hwmon_fini(mlxsw_core->hwmon);
        if (mlxsw_core->driver->fini)
                mlxsw_core->driver->fini(mlxsw_core);
+       mlxsw_core_health_fini(mlxsw_core);
        if (!reload)
                mlxsw_core_params_unregister(mlxsw_core);
        if (!reload)
index 6ec76990663702eaacfcdbf437b2b2c006ea5722..2ca085a44774d880ca49cb7945c5041b92f04440 100644 (file)
@@ -370,6 +370,7 @@ struct mlxsw_driver {
        u8 txhdr_len;
        const struct mlxsw_config_profile *profile;
        bool res_query_enabled;
+       bool fw_fatal_enabled;
 };
 
 int mlxsw_core_kvd_sizes_get(struct mlxsw_core *mlxsw_core,
index 421f02eac20f9ad526a2dda6ef555b956b32af5a..6e3d55006089acc152a1064444c9004c91c65fc8 100644 (file)
@@ -5579,6 +5579,7 @@ MLXSW_ITEM32(reg, htgt, type, 0x00, 8, 4);
 
 enum mlxsw_reg_htgt_trap_group {
        MLXSW_REG_HTGT_TRAP_GROUP_EMAD,
+       MLXSW_REG_HTGT_TRAP_GROUP_MFDE,
        MLXSW_REG_HTGT_TRAP_GROUP_SP_STP,
        MLXSW_REG_HTGT_TRAP_GROUP_SP_LACP,
        MLXSW_REG_HTGT_TRAP_GROUP_SP_LLDP,
index 18d2eacfae83d3b8b0db22f7ac835e17bda4b4b5..351d385158e63f6ecf848b6f6990410c578f1944 100644 (file)
@@ -2529,11 +2529,20 @@ static void mlxsw_sp_lag_fini(struct mlxsw_sp *mlxsw_sp)
 static int mlxsw_sp_basic_trap_groups_set(struct mlxsw_core *mlxsw_core)
 {
        char htgt_pl[MLXSW_REG_HTGT_LEN];
+       int err;
 
        mlxsw_reg_htgt_pack(htgt_pl, MLXSW_REG_HTGT_TRAP_GROUP_EMAD,
                            MLXSW_REG_HTGT_INVALID_POLICER,
                            MLXSW_REG_HTGT_DEFAULT_PRIORITY,
                            MLXSW_REG_HTGT_DEFAULT_TC);
+       err =  mlxsw_reg_write(mlxsw_core, MLXSW_REG(htgt), htgt_pl);
+       if (err)
+               return err;
+
+       mlxsw_reg_htgt_pack(htgt_pl, MLXSW_REG_HTGT_TRAP_GROUP_MFDE,
+                           MLXSW_REG_HTGT_INVALID_POLICER,
+                           MLXSW_REG_HTGT_DEFAULT_PRIORITY,
+                           MLXSW_REG_HTGT_DEFAULT_TC);
        return mlxsw_reg_write(mlxsw_core, MLXSW_REG(htgt), htgt_pl);
 }
 
@@ -3287,6 +3296,7 @@ static struct mlxsw_driver mlxsw_sp1_driver = {
        .txhdr_len                      = MLXSW_TXHDR_LEN,
        .profile                        = &mlxsw_sp1_config_profile,
        .res_query_enabled              = true,
+       .fw_fatal_enabled               = true,
 };
 
 static struct mlxsw_driver mlxsw_sp2_driver = {
@@ -3326,6 +3336,7 @@ static struct mlxsw_driver mlxsw_sp2_driver = {
        .txhdr_len                      = MLXSW_TXHDR_LEN,
        .profile                        = &mlxsw_sp2_config_profile,
        .res_query_enabled              = true,
+       .fw_fatal_enabled               = true,
 };
 
 static struct mlxsw_driver mlxsw_sp3_driver = {
@@ -3365,6 +3376,7 @@ static struct mlxsw_driver mlxsw_sp3_driver = {
        .txhdr_len                      = MLXSW_TXHDR_LEN,
        .profile                        = &mlxsw_sp2_config_profile,
        .res_query_enabled              = true,
+       .fw_fatal_enabled               = true,
 };
 
 bool mlxsw_sp_port_dev_check(const struct net_device *dev)
index 33909887d0ac432d007494263e4cdb95adc86939..fe0b8af287a7b5ea6ed0f9e8b4e3dbda0904bf46 100644 (file)
@@ -120,6 +120,8 @@ enum {
 };
 
 enum mlxsw_event_trap_id {
+       /* Fatal Event generated by FW */
+       MLXSW_TRAP_ID_MFDE = 0x3,
        /* Port Up/Down event generated by hardware */
        MLXSW_TRAP_ID_PUDE = 0x8,
        /* PTP Ingress FIFO has a new entry */