Merge pull request #5499 from matrix-org/rav/cleanup_metrics
Cleanups and sanity-checking in cpu and db metrics
This commit is contained in:
commit
e59a8cd2e5
|
@ -0,0 +1 @@
|
|||
Some cleanups and sanity-checking in the CPU and database metrics.
|
|
@ -175,22 +175,22 @@ class PerformanceCounters(object):
|
|||
self.current_counters = {}
|
||||
self.previous_counters = {}
|
||||
|
||||
def update(self, key, start_time, end_time=None):
|
||||
if end_time is None:
|
||||
end_time = time.time()
|
||||
duration = end_time - start_time
|
||||
def update(self, key, duration_secs):
|
||||
count, cum_time = self.current_counters.get(key, (0, 0))
|
||||
count += 1
|
||||
cum_time += duration
|
||||
cum_time += duration_secs
|
||||
self.current_counters[key] = (count, cum_time)
|
||||
return end_time
|
||||
|
||||
def interval(self, interval_duration, limit=3):
|
||||
def interval(self, interval_duration_secs, limit=3):
|
||||
counters = []
|
||||
for name, (count, cum_time) in iteritems(self.current_counters):
|
||||
prev_count, prev_time = self.previous_counters.get(name, (0, 0))
|
||||
counters.append(
|
||||
((cum_time - prev_time) / interval_duration, count - prev_count, name)
|
||||
(
|
||||
(cum_time - prev_time) / interval_duration_secs,
|
||||
count - prev_count,
|
||||
name,
|
||||
)
|
||||
)
|
||||
|
||||
self.previous_counters = dict(self.current_counters)
|
||||
|
@ -221,7 +221,6 @@ class SQLBaseStore(object):
|
|||
# is running in mainline, and we have some nice monitoring frontends
|
||||
# to watch it
|
||||
self._txn_perf_counters = PerformanceCounters()
|
||||
self._get_event_counters = PerformanceCounters()
|
||||
|
||||
self._get_event_cache = Cache(
|
||||
"*getEvent*", keylen=3, max_entries=hs.config.event_cache_size
|
||||
|
@ -369,21 +368,13 @@ class SQLBaseStore(object):
|
|||
time_then = self._previous_loop_ts
|
||||
self._previous_loop_ts = time_now
|
||||
|
||||
ratio = (curr - prev) / (time_now - time_then)
|
||||
duration = time_now - time_then
|
||||
ratio = (curr - prev) / duration
|
||||
|
||||
top_three_counters = self._txn_perf_counters.interval(
|
||||
time_now - time_then, limit=3
|
||||
)
|
||||
|
||||
top_3_event_counters = self._get_event_counters.interval(
|
||||
time_now - time_then, limit=3
|
||||
)
|
||||
top_three_counters = self._txn_perf_counters.interval(duration, limit=3)
|
||||
|
||||
perf_logger.info(
|
||||
"Total database time: %.3f%% {%s} {%s}",
|
||||
ratio * 100,
|
||||
top_three_counters,
|
||||
top_3_event_counters,
|
||||
"Total database time: %.3f%% {%s}", ratio * 100, top_three_counters
|
||||
)
|
||||
|
||||
self._clock.looping_call(loop, 10000)
|
||||
|
@ -465,7 +456,7 @@ class SQLBaseStore(object):
|
|||
transaction_logger.debug("[TXN END] {%s} %f sec", name, duration)
|
||||
|
||||
self._current_txn_total_time += duration
|
||||
self._txn_perf_counters.update(desc, start, end)
|
||||
self._txn_perf_counters.update(desc, duration)
|
||||
sql_txn_timer.labels(desc).observe(duration)
|
||||
|
||||
@defer.inlineCallbacks
|
||||
|
|
|
@ -336,10 +336,9 @@ class LoggingContext(object):
|
|||
logger.warning("Called stop on logcontext %s without calling start", self)
|
||||
return
|
||||
|
||||
usage_end = get_thread_resource_usage()
|
||||
|
||||
self._resource_usage.ru_utime += usage_end.ru_utime - self.usage_start.ru_utime
|
||||
self._resource_usage.ru_stime += usage_end.ru_stime - self.usage_start.ru_stime
|
||||
utime_delta, stime_delta = self._get_cputime()
|
||||
self._resource_usage.ru_utime += utime_delta
|
||||
self._resource_usage.ru_stime += stime_delta
|
||||
|
||||
self.usage_start = None
|
||||
|
||||
|
@ -357,13 +356,44 @@ class LoggingContext(object):
|
|||
# can include resource usage so far.
|
||||
is_main_thread = threading.current_thread() is self.main_thread
|
||||
if self.alive and self.usage_start and is_main_thread:
|
||||
current = get_thread_resource_usage()
|
||||
res.ru_utime += current.ru_utime - self.usage_start.ru_utime
|
||||
res.ru_stime += current.ru_stime - self.usage_start.ru_stime
|
||||
utime_delta, stime_delta = self._get_cputime()
|
||||
res.ru_utime += utime_delta
|
||||
res.ru_stime += stime_delta
|
||||
|
||||
return res
|
||||
|
||||
def _get_cputime(self):
|
||||
"""Get the cpu usage time so far
|
||||
|
||||
Returns: Tuple[float, float]: seconds in user mode, seconds in system mode
|
||||
"""
|
||||
current = get_thread_resource_usage()
|
||||
|
||||
utime_delta = current.ru_utime - self.usage_start.ru_utime
|
||||
stime_delta = current.ru_stime - self.usage_start.ru_stime
|
||||
|
||||
# sanity check
|
||||
if utime_delta < 0:
|
||||
logger.error(
|
||||
"utime went backwards! %f < %f",
|
||||
current.ru_utime,
|
||||
self.usage_start.ru_utime,
|
||||
)
|
||||
utime_delta = 0
|
||||
|
||||
if stime_delta < 0:
|
||||
logger.error(
|
||||
"stime went backwards! %f < %f",
|
||||
current.ru_stime,
|
||||
self.usage_start.ru_stime,
|
||||
)
|
||||
stime_delta = 0
|
||||
|
||||
return utime_delta, stime_delta
|
||||
|
||||
def add_database_transaction(self, duration_sec):
|
||||
if duration_sec < 0:
|
||||
raise ValueError("DB txn time can only be non-negative")
|
||||
self._resource_usage.db_txn_count += 1
|
||||
self._resource_usage.db_txn_duration_sec += duration_sec
|
||||
|
||||
|
@ -374,6 +404,8 @@ class LoggingContext(object):
|
|||
sched_sec (float): number of seconds it took us to get a
|
||||
connection
|
||||
"""
|
||||
if sched_sec < 0:
|
||||
raise ValueError("DB scheduling time can only be non-negative")
|
||||
self._resource_usage.db_sched_duration_sec += sched_sec
|
||||
|
||||
def record_event_fetch(self, event_count):
|
||||
|
|
Loading…
Reference in New Issue