Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions communication/src/coap_channel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ bool CoAPMessageStore::retransmit(CoAPMessage* msg, Channel& channel, system_tic

void CoAPMessageStore::message_timeout(CoAPMessage& msg, Channel& channel)
{
LOG(TRACE, "timeout waiting for ACK for message id=%x", msg.get_id());
g_unacknowledgedMessageCounter++;
msg.notify_timeout();
if (msg.is_request())
Expand Down Expand Up @@ -99,7 +100,7 @@ ProtocolError CoAPMessageStore::send(Message& msg, system_tick_t time)
if (coapType==CoAPType::CON)
coapmsg->prepare_retransmit(time);
else
coapmsg->set_expiration(time+CoAPMessage::MAX_TRANSMIT_SPAN);
coapmsg->set_expiration(time + CoAPMessage::EXCHANGE_LIFETIME);
add(*coapmsg);
}
return NO_ERROR;
Expand Down Expand Up @@ -150,7 +151,7 @@ ProtocolError CoAPMessageStore::receive(Message& msg, Channel& channel, system_t
return INSUFFICIENT_STORAGE;
// the timeout here is ideally purely academic since the application will respond immediately with an ACK/RESET
// which will be stored in place of this message, with it's own timeout.
coapmsg->set_expiration(time+CoAPMessage::MAX_TRANSMIT_SPAN);
coapmsg->set_expiration(time + CoAPMessage::EXCHANGE_LIFETIME);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The exchange lifetime is much larger than the max transmit span, meaning we will have to keep many more messages in memory so that their responses can be resent on retransmission from the server. Just something to thing about - we should ideally run memory profiling over a busy CoAP connection with very slow acknowledgements and see how much memory is required.

Expecting an application to wait some 300 seconds (5 minutes) for a response seems unrealistic. Reducing the MAX_LATENCY from 100s to 5-10s seems reasonable given our use case. and would reduce the EXCHANGE_LIFETIME to something more manageable.

Going forward, I propose that we aim for shorter timeouts and more frequent retransmits, which we can have as a negotiation step between the device and the server. That way we can tailor the timeouts to the network characteristics, data cost, and the application needs.

add(*coapmsg);
}
}
Expand Down
87 changes: 65 additions & 22 deletions communication/src/coap_channel.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ namespace particle
namespace protocol
{

inline bool time_has_passed(system_tick_t now, system_tick_t tick);

/**
* Decorates a MessageChannel with message ID management as required by CoAP.
* When a message is sent that doesn't have an assigned ID, it is assigned the next available ID.
Expand Down Expand Up @@ -110,6 +112,19 @@ class __attribute__((packed)) CoAPMessage
*/
system_tick_t timeout;

/**
* The timestamp when the message was first sent.
*/
system_tick_t first_sent_timestamp;

/**
* Exponentially increasing timeout value (not a timestamp).
* Initially is set to a random value between
* [ACK_TIMEOUT, ACK_TIMEOUT * ACK_RANDOM_FACTOR / ACK_RANDOM_DIVISOR]
* Doubled on every retransmission.
*/
system_tick_t timeout_value;

/**
* The unique 16-bit ID for this message.
*/
Expand Down Expand Up @@ -149,20 +164,34 @@ class __attribute__((packed)) CoAPMessage

public:

static const uint16_t ACK_TIMEOUT = 4000;
static const uint16_t ACK_RANDOM_FACTOR = 1500;
static const uint16_t ACK_RANDOM_DIVISOR = 1000;
static const uint8_t MAX_RETRANSMIT = 3;
static const uint16_t MAX_TRANSMIT_SPAN = 45*1000;

// RFC7252 4.8. Transmission Parameters
// We are NOT using the default RFC values
static constexpr const auto ACK_TIMEOUT = 4000;
static constexpr const auto ACK_RANDOM_FACTOR = 1500;
static constexpr const auto ACK_RANDOM_DIVISOR = 1000;
static constexpr const auto MAX_RETRANSMIT = 3;
// RFC7252 4.8.2. Time Values Derived from Transmission Parameters
static constexpr const auto MAX_TRANSMIT_SPAN = (ACK_TIMEOUT * ((1 << MAX_RETRANSMIT) - 1) * ACK_RANDOM_FACTOR) / ACK_RANDOM_DIVISOR;
static constexpr const auto MAX_TRANSMIT_WAIT = (ACK_TIMEOUT * ((1 << (MAX_RETRANSMIT + 1)) - 1) * ACK_RANDOM_FACTOR) / ACK_RANDOM_DIVISOR;
static constexpr const auto MAX_LATENCY = 100000;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

100s is not realistic for our use case.

static constexpr const auto PROCESSING_DELAY = ACK_TIMEOUT;
static constexpr const auto EXCHANGE_LIFETIME = MAX_TRANSMIT_SPAN + (2 * MAX_LATENCY) + PROCESSING_DELAY;

/**
* The number of outstanding messages allowed.
*/
static const uint8_t NSTART = 1;
static constexpr const auto NSTART = 1;


CoAPMessage(message_id_t id_) : next(nullptr), timeout(0), id(id_), transmit_count(0), delivered(nullptr), data_len(0) {
CoAPMessage(message_id_t id_) :
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please be sure the clang_format file has these formatting changes.

next(nullptr),
timeout(0),
first_sent_timestamp(0),
timeout_value(0),
id(id_),
transmit_count(0),
delivered(nullptr),
data_len(0) {
message_count++;
}

Expand Down Expand Up @@ -196,6 +225,8 @@ class __attribute__((packed)) CoAPMessage
inline message_id_t get_id() const { return id; }
inline void removed() { next = nullptr; }
inline system_tick_t get_timeout() const { return timeout; }
inline system_tick_t get_timeout_value() const { return timeout_value; }
inline system_tick_t get_transmit_count() const { return transmit_count; }

inline void set_delivered_handler(std::function<void(Delivery)>* handler) { this->delivered = handler; }

Expand All @@ -218,25 +249,37 @@ class __attribute__((packed)) CoAPMessage
bool prepare_retransmit(system_tick_t now)
{
CoAPType::Enum coapType = CoAP::type(get_data());
if (coapType==CoAPType::CON) {
timeout = now + transmit_timeout(transmit_count);
transmit_count++;
return transmit_count <= MAX_RETRANSMIT+1;
if (coapType == CoAPType::CON) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

While this technically is closer to the spec (taking the timeout with a pre-applied random factor and doubling it vs doubling the timeout and then applying a random factor), the difference in practice is not significant, especially considering the device service uses an entirely different set of parameters. Both approaches are within the operational constraints of how long the server will wait since the maximum timeout ends up being the same.

I'd like to drive this change from observed problems rather than from a strict adherence to the spec since this is a non-trivial change, so I want to be sure the gain outweighs the risk.

In terms of gain, I appreciate the inline references to the spec, which adds clarity. Also welcome other voices here about which is easiest to comprehend since that is a key factor here too. I am on the fence, but can imagine others might find this easier to grasp.

if (transmit_count != 0) {
// RFC7252 4.2. Messages Transmitted Reliably
// When the timeout is triggered and the retransmission counter is less than
// MAX_RETRANSMIT, the message is retransmitted, the retransmission
// counter is incremented, and the timeout is doubled.
++transmit_count;
timeout_value *= 2;
} else {
// RFC7252 4.2. Messages Transmitted Reliably
// For a new Confirmable message, the initial timeout is set
// to a random duration (often not an integral number of seconds)
// between ACK_TIMEOUT and (ACK_TIMEOUT * ACK_RANDOM_FACTOR)
// and the retransmission counter is set to 0.
timeout_value = ACK_TIMEOUT + (rand() % ((ACK_TIMEOUT * ACK_RANDOM_FACTOR) / ACK_RANDOM_DIVISOR));
first_sent_timestamp = now;
}
timeout = now + timeout_value;
// RFC7252 4.2. Messages Transmitted Reliably
// The entire sequence of (re-)transmissions MUST stay in the envelope of MAX_TRANSMIT_SPAN
if (transmit_count <= MAX_RETRANSMIT &&
!time_has_passed(now, (first_sent_timestamp + MAX_TRANSMIT_SPAN))) {
LOG(TRACE, "(re)txing message id=%x attempt=%d timeout=%d", get_id(),
transmit_count, timeout_value);
return true;
}
}
// other message types are not resent on timeout
return false;
}

/**
* Determines the transmit timeout for the given transmission count.
*/
static inline system_tick_t transmit_timeout(uint8_t transmit_count)
{
system_tick_t timeout = (ACK_TIMEOUT << transmit_count);
timeout += ((timeout * (rand()%256))>>9);
return timeout;
}

inline CoAPType::Enum get_type() const
{
return data_len>0 ? CoAP::type(data) : CoAPType::ERROR;
Expand Down
9 changes: 7 additions & 2 deletions communication/src/dtls_protocol.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,13 @@ void DTLSProtocol::init(const char *id,
{
set_protocol_flags(0);
memcpy(device_id, id, sizeof(device_id));
// send a ping once every 23 minutes
initialize_ping(23*60*1000,30000);
// Sets ping interval by default to 23 minutes. This is Electron-specific default
// value, however all the other platforms will override this currently depending
// on various conditions.
// IMPORTANT: the second argument is the timeout waiting for the ping
// acknowledgment. This SHOULD be set to MAX_TRANSMIT_WAIT, which is 93s
// for default transmission parameters defined in RFC7252. If required we might lower it.
initialize_ping(23 * 60 * 1000, CoAPMessage::MAX_TRANSMIT_WAIT);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bravo, I never liked the arbitrary choice of 30s. Ideally these magic values and the CoAP retries and timeouts should be parametrizable from the system layer.

DTLSMessageChannel::Callbacks channelCallbacks = {0};
channelCallbacks.millis = callbacks.millis;
channelCallbacks.handle_seed = handle_seed;
Expand Down