Skip to content

Commit

Permalink
Log collector broken pipe fix (#2267)
Browse files Browse the repository at this point in the history
  • Loading branch information
kevinclark19a authored Jun 17, 2021
1 parent ca5a788 commit 9da99cc
Show file tree
Hide file tree
Showing 3 changed files with 76 additions and 35 deletions.
21 changes: 21 additions & 0 deletions azurelinuxagent/common/future.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import contextlib
import platform
import sys
import os
Expand Down Expand Up @@ -35,6 +36,10 @@
from collections import OrderedDict # pylint: disable=W0611
from queue import Queue, Empty # pylint: disable=W0611,import-error

# unused-import<W0611> Disabled: python2.7 doesn't have subprocess.DEVNULL
# so this import is only used by python3.
import subprocess # pylint: disable=unused-import

elif sys.version_info[0] == 2:
import httplib as httpclient # pylint: disable=E0401,W0611
from urlparse import urlparse # pylint: disable=E0401
Expand Down Expand Up @@ -143,6 +148,22 @@ def is_file_not_found_error(exception):

return isinstance(exception, FileNotFoundError)

@contextlib.contextmanager
def subprocess_dev_null():

if sys.version_info[0] == 3:
# Suppress no-member errors on python2.7
yield subprocess.DEVNULL # pylint: disable=no-member
else:
try:
devnull = open(os.devnull, "a+")
yield devnull
except Exception:
yield None
finally:
if devnull is not None:
devnull.close()

def array_to_bytes(buff):
# Python 3.9 removed the tostring() method on arrays, the new alias is tobytes()
if sys.version_info[0] == 2:
Expand Down
81 changes: 47 additions & 34 deletions azurelinuxagent/ga/collect_logs.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
import azurelinuxagent.common.conf as conf
from azurelinuxagent.common import logger
from azurelinuxagent.common.event import elapsed_milliseconds, add_event, WALAEventOperation
from azurelinuxagent.common.future import ustr
from azurelinuxagent.common.future import subprocess_dev_null, ustr
from azurelinuxagent.common.interfaces import ThreadHandlerInterface
from azurelinuxagent.common.logcollector import COMPRESSED_ARCHIVE_PATH
from azurelinuxagent.common.osutil import systemd
Expand Down Expand Up @@ -161,41 +161,54 @@ def _collect_logs():
collect_logs_cmd = [sys.executable, "-u", sys.argv[0], "-collect-logs"]
final_command = systemd_cmd + resource_limits + collect_logs_cmd

start_time = datetime.datetime.utcnow()
success = False
msg = None
def exec_command(output_file):
start_time = datetime.datetime.utcnow()
success = False
msg = None
try:
# TODO: Remove track_process (and its implementation) when the log collector is moved to the agent's cgroup
shellutil.run_command(final_command, log_error=False, track_process=False,
stdout=output_file, stderr=output_file)
duration = elapsed_milliseconds(start_time)
archive_size = os.path.getsize(COMPRESSED_ARCHIVE_PATH)

msg = "Successfully collected logs. Archive size: {0} b, elapsed time: {1} ms.".format(archive_size,
duration)
logger.info(msg)
success = True

return True
except Exception as e:
duration = elapsed_milliseconds(start_time)

if isinstance(e, CommandError):
exception_message = ustr("[stderr] %s", e.stderr) # pylint: disable=no-member
else:
exception_message = ustr(e)

msg = "Failed to collect logs. Elapsed time: {0} ms. Error: {1}".format(duration, exception_message)
# No need to log to the local log since we ran run_command with logging errors as enabled

return False
finally:
add_event(
name=AGENT_NAME,
version=CURRENT_VERSION,
op=WALAEventOperation.LogCollection,
is_success=success,
message=msg,
log_event=False)

try:
# TODO: Remove track_process (and its implementation) when the log collector is moved to the agent's cgroup
shellutil.run_command(final_command, log_error=True, track_process=False)
duration = elapsed_milliseconds(start_time)
archive_size = os.path.getsize(COMPRESSED_ARCHIVE_PATH)

msg = "Successfully collected logs. Archive size: {0} b, elapsed time: {1} ms.".format(archive_size,
duration)
logger.info(msg)
success = True

return True
except Exception as e:
duration = elapsed_milliseconds(start_time)

if isinstance(e, CommandError):
exception_message = ustr("[stderr] %s", e.stderr) # pylint: disable=no-member
else:
exception_message = ustr(e)

msg = "Failed to collect logs. Elapsed time: {0} ms. Error: {1}".format(duration, exception_message)
# No need to log to the local log since we ran run_command with logging errors as enabled

return False
logfile = open(conf.get_agent_log_file(), "a+")
except Exception:
with subprocess_dev_null() as DEVNULL:
return exec_command(DEVNULL)
else:
return exec_command(logfile)
finally:
add_event(
name=AGENT_NAME,
version=CURRENT_VERSION,
op=WALAEventOperation.LogCollection,
is_success=success,
message=msg,
log_event=False)
if logfile is not None:
logfile.close()

def _send_logs(self):
msg = None
Expand Down
9 changes: 8 additions & 1 deletion tests/ga/test_collect_logs.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
import contextlib
import os

from azurelinuxagent.common import logger
from azurelinuxagent.common import logger, conf
from azurelinuxagent.common.logger import Logger
from azurelinuxagent.common.protocol.util import ProtocolUtil
from azurelinuxagent.ga.collect_logs import get_collect_logs_handler, is_log_collection_allowed
Expand Down Expand Up @@ -69,6 +69,10 @@ def setUp(self):
self.mock_archive_path = patch("azurelinuxagent.ga.collect_logs.COMPRESSED_ARCHIVE_PATH", self.archive_path)
self.mock_archive_path.start()

self.logger_path = os.path.join(self.tmp_dir, "waagent.log")
self.mock_logger_path = patch.object(conf, "get_agent_log_file", return_value=self.logger_path)
self.mock_logger_path.start()

# Since ProtocolUtil is a singleton per thread, we need to clear it to ensure that the test cases do not
# reuse a previous state
clear_singleton_instances(ProtocolUtil)
Expand All @@ -77,6 +81,9 @@ def tearDown(self):
if os.path.exists(self.archive_path):
os.remove(self.archive_path)
self.mock_archive_path.stop()
if os.path.exists(self.logger_path):
os.remove(self.logger_path)
self.mock_logger_path.stop()
AgentTestCase.tearDown(self)

def _create_dummy_archive(self, size=1024):
Expand Down

0 comments on commit 9da99cc

Please sign in to comment.