Refactor graceful manager, fix misused WaitGroup (#29738)

Follow #29629

(cherry picked from commit d08f4360c96e130e0454b76ecef9405f2bd312a1)
This commit is contained in:
coldWater 2024-03-15 18:59:11 +08:00 committed by Earl Warren
parent 1a17c39e00
commit 94c70c7753
No known key found for this signature in database
GPG Key ID: 0579CB2928A78A00
4 changed files with 55 additions and 51 deletions

View File

@ -233,7 +233,10 @@ func (g *Manager) setStateTransition(old, new state) bool {
// At the moment the total number of servers (numberOfServersToCreate) are pre-defined as a const before global init, // At the moment the total number of servers (numberOfServersToCreate) are pre-defined as a const before global init,
// so this function MUST be called if a server is not used. // so this function MUST be called if a server is not used.
func (g *Manager) InformCleanup() { func (g *Manager) InformCleanup() {
g.createServerWaitGroup.Done() g.createServerCond.L.Lock()
defer g.createServerCond.L.Unlock()
g.createdServer++
g.createServerCond.Signal()
} }
// Done allows the manager to be viewed as a context.Context, it returns a channel that is closed when the server is finished terminating // Done allows the manager to be viewed as a context.Context, it returns a channel that is closed when the server is finished terminating

View File

@ -42,8 +42,9 @@ type Manager struct {
terminateCtxCancel context.CancelFunc terminateCtxCancel context.CancelFunc
managerCtxCancel context.CancelFunc managerCtxCancel context.CancelFunc
runningServerWaitGroup sync.WaitGroup runningServerWaitGroup sync.WaitGroup
createServerWaitGroup sync.WaitGroup
terminateWaitGroup sync.WaitGroup terminateWaitGroup sync.WaitGroup
createServerCond sync.Cond
createdServer int
shutdownRequested chan struct{} shutdownRequested chan struct{}
toRunAtShutdown []func() toRunAtShutdown []func()
@ -52,7 +53,7 @@ type Manager struct {
func newGracefulManager(ctx context.Context) *Manager { func newGracefulManager(ctx context.Context) *Manager {
manager := &Manager{ctx: ctx, shutdownRequested: make(chan struct{})} manager := &Manager{ctx: ctx, shutdownRequested: make(chan struct{})}
manager.createServerWaitGroup.Add(numberOfServersToCreate) manager.createServerCond.L = &sync.Mutex{}
manager.prepare(ctx) manager.prepare(ctx)
manager.start() manager.start()
return manager return manager

View File

@ -57,20 +57,27 @@ func (g *Manager) start() {
// Handle clean up of unused provided listeners and delayed start-up // Handle clean up of unused provided listeners and delayed start-up
startupDone := make(chan struct{}) startupDone := make(chan struct{})
go func() { go func() {
defer close(startupDone) defer func() {
// Wait till we're done getting all the listeners and then close the unused ones close(startupDone)
func() { // Close the unused listeners and ignore the error here there's not much we can do with it, they're logged in the CloseProvidedListeners function
// FIXME: there is a fundamental design problem of the "manager" and the "wait group". _ = CloseProvidedListeners()
// If nothing has started, the "Wait" just panics: sync: WaitGroup is reused before previous Wait has returned
// There is no clear solution besides a complete rewriting of the "manager"
defer func() {
_ = recover()
}()
g.createServerWaitGroup.Wait()
}() }()
// Ignore the error here there's not much we can do with it, they're logged in the CloseProvidedListeners function // Wait for all servers to be created
_ = CloseProvidedListeners() g.createServerCond.L.Lock()
g.notify(readyMsg) for {
if g.createdServer >= numberOfServersToCreate {
g.createServerCond.L.Unlock()
g.notify(readyMsg)
return
}
select {
case <-g.IsShutdown():
g.createServerCond.L.Unlock()
return
default:
}
g.createServerCond.Wait()
}
}() }()
if setting.StartupTimeout > 0 { if setting.StartupTimeout > 0 {
go func() { go func() {
@ -78,16 +85,7 @@ func (g *Manager) start() {
case <-startupDone: case <-startupDone:
return return
case <-g.IsShutdown(): case <-g.IsShutdown():
func() { g.createServerCond.Signal()
// When WaitGroup counter goes negative it will panic - we don't care about this so we can just ignore it.
defer func() {
_ = recover()
}()
// Ensure that the createServerWaitGroup stops waiting
for {
g.createServerWaitGroup.Done()
}
}()
return return
case <-time.After(setting.StartupTimeout): case <-time.After(setting.StartupTimeout):
log.Error("Startup took too long! Shutting down") log.Error("Startup took too long! Shutting down")

View File

@ -149,33 +149,35 @@ hammerLoop:
func (g *Manager) awaitServer(limit time.Duration) bool { func (g *Manager) awaitServer(limit time.Duration) bool {
c := make(chan struct{}) c := make(chan struct{})
go func() { go func() {
defer close(c) g.createServerCond.L.Lock()
func() { for {
// FIXME: there is a fundamental design problem of the "manager" and the "wait group". if g.createdServer >= numberOfServersToCreate {
// If nothing has started, the "Wait" just panics: sync: WaitGroup is reused before previous Wait has returned g.createServerCond.L.Unlock()
// There is no clear solution besides a complete rewriting of the "manager" close(c)
defer func() { return
_ = recover() }
}() select {
g.createServerWaitGroup.Wait() case <-g.IsShutdown():
}() g.createServerCond.L.Unlock()
return
default:
}
g.createServerCond.Wait()
}
}() }()
var tc <-chan time.Time
if limit > 0 { if limit > 0 {
select { tc = time.After(limit)
case <-c: }
return true // completed normally select {
case <-time.After(limit): case <-c:
return false // timed out return true // completed normally
case <-g.IsShutdown(): case <-tc:
return false return false // timed out
} case <-g.IsShutdown():
} else { g.createServerCond.Signal()
select { return false
case <-c:
return true // completed normally
case <-g.IsShutdown():
return false
}
} }
} }