remove deadlock in GetOrHandshake (#1151)

We had a rare deadlock in GetOrHandshake because we kept the hostmap
lock when we do the call to StartHandshake. StartHandshake can block
while sending to the lighthouse query worker channel, and that worker
needs to be able to grab the hostmap lock to do its work. Other calls
for StartHandshake don't hold the hostmap lock so we should be able to
drop it here.

This lock was originally added with: https://github.com/slackhq/nebula/pull/954
This commit is contained in:
Wade Simmons 2024-05-29 12:52:52 -04:00 committed by GitHub
parent 50b24c102e
commit 4eb1da0958
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
1 changed files with 4 additions and 4 deletions

View File

@ -356,10 +356,11 @@ func (hm *HandshakeManager) handleOutbound(vpnIp iputil.VpnIp, lighthouseTrigger
// GetOrHandshake will try to find a hostinfo with a fully formed tunnel or start a new handshake if one is not present
// The 2nd argument will be true if the hostinfo is ready to transmit traffic
func (hm *HandshakeManager) GetOrHandshake(vpnIp iputil.VpnIp, cacheCb func(*HandshakeHostInfo)) (*HostInfo, bool) {
// Check the main hostmap and maintain a read lock if our host is not there
hm.mainHostMap.RLock()
if h, ok := hm.mainHostMap.Hosts[vpnIp]; ok {
hm.mainHostMap.RUnlock()
h, ok := hm.mainHostMap.Hosts[vpnIp]
hm.mainHostMap.RUnlock()
if ok {
// Do not attempt promotion if you are a lighthouse
if !hm.lightHouse.amLighthouse {
h.TryPromoteBest(hm.mainHostMap.GetPreferredRanges(), hm.f)
@ -367,7 +368,6 @@ func (hm *HandshakeManager) GetOrHandshake(vpnIp iputil.VpnIp, cacheCb func(*Han
return h, true
}
defer hm.mainHostMap.RUnlock()
return hm.StartHandshake(vpnIp, cacheCb), false
}