From bc3a9d217755f65c137f145600f23bf1d6c31ea9 Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Wed, 20 Aug 2025 14:56:50 -0500 Subject: [PATCH] ipmi:si: Gracefully handle if the BMC is non-functional If the BMC is not functional, the driver goes into an error state and starts a 1 second timer. When the timer times out, it will attempt a simple message. If the BMC interacts correctly, the driver will start accepting messages again. If not, it remains in error state. If the driver goes into error state, all messages current and pending will return with an error. This should more gracefully handle when the BMC becomes non-operational, as opposed to trying each messages individually and failing them. Signed-off-by: Corey Minyard --- drivers/char/ipmi/ipmi_si_intf.c | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c index 88d62ef71b1b..1c65275906b4 100644 --- a/drivers/char/ipmi/ipmi_si_intf.c +++ b/drivers/char/ipmi/ipmi_si_intf.c @@ -53,6 +53,7 @@ #define SI_TIMEOUT_JIFFIES (SI_TIMEOUT_TIME_USEC/SI_USEC_PER_JIFFY) #define SI_SHORT_TIMEOUT_USEC 250 /* .25ms when the SM request a short timeout */ +#define SI_TIMEOUT_HOSED (HZ) /* 1 second when in hosed state. */ enum si_intf_state { SI_NORMAL, @@ -61,7 +62,8 @@ enum si_intf_state { SI_CLEARING_FLAGS, SI_GETTING_MESSAGES, SI_CHECKING_ENABLES, - SI_SETTING_ENABLES + SI_SETTING_ENABLES, + SI_HOSED /* FIXME - add watchdog stuff. */ }; @@ -753,6 +755,8 @@ static void handle_transaction_done(struct smi_info *smi_info) } break; } + case SI_HOSED: /* Shouldn't happen. */ + break; } } @@ -767,6 +771,10 @@ static enum si_sm_result smi_event_handler(struct smi_info *smi_info, enum si_sm_result si_sm_result; restart: + if (smi_info->si_state == SI_HOSED) + /* Just in case, hosed state is only left from the timeout. */ + return SI_SM_HOSED; + /* * There used to be a loop here that waited a little while * (around 25us) before giving up. That turned out to be @@ -790,18 +798,20 @@ restart: /* * Do the before return_hosed_msg, because that - * releases the lock. + * releases the lock. We just disable operations for + * a while and retry in hosed state. */ - smi_info->si_state = SI_NORMAL; + smi_info->si_state = SI_HOSED; if (smi_info->curr_msg != NULL) { /* * If we were handling a user message, format * a response to send to the upper layer to * tell it about the error. */ - return_hosed_msg(smi_info, IPMI_ERR_UNSPECIFIED); + return_hosed_msg(smi_info, IPMI_BUS_ERR); } - goto restart; + smi_mod_timer(smi_info, jiffies + SI_TIMEOUT_HOSED); + goto out; } /* @@ -899,7 +909,7 @@ static void flush_messages(void *send_info) * mode. This means we are single-threaded, no need for locks. */ result = smi_event_handler(smi_info, 0); - while (result != SI_SM_IDLE) { + while (result != SI_SM_IDLE && result != SI_SM_HOSED) { udelay(SI_SHORT_TIMEOUT_USEC); result = smi_event_handler(smi_info, SI_SHORT_TIMEOUT_USEC); } @@ -912,6 +922,9 @@ static int sender(void *send_info, struct ipmi_smi_msg *msg) debug_timestamp(smi_info, "Enqueue"); + if (smi_info->si_state == SI_HOSED) + return IPMI_BUS_ERR; + if (smi_info->run_to_completion) { /* * If we are running to completion, start it. Upper @@ -1092,6 +1105,10 @@ static void smi_timeout(struct timer_list *t) spin_lock_irqsave(&(smi_info->si_lock), flags); debug_timestamp(smi_info, "Timer"); + if (smi_info->si_state == SI_HOSED) + /* Try something to see if the BMC is now operational. */ + start_get_flags(smi_info); + jiffies_now = jiffies; time_diff = (((long)jiffies_now - (long)smi_info->last_timeout_jiffies) * SI_USEC_PER_JIFFY); -- 2.20.1