Recover gRPC channel from connection dropouts. (#1406)

- check and recover channel connection in `TryRecovery()`
- fixes an infinite loop in the unlimited retry strategy by adding `grpc::UNAVAILABLE`,
  `DEADLINE_EXCEEDED` to the unrecoverable status codes.
  - server restart leads to `UNAVAILABLE`
  - connection loss leads to `DEADLINE_EXCEEDED`
- fixes repeated recovery attempts
master
Michael Grupp 2018-09-04 11:17:51 +02:00 committed by Wally B. Feed
parent 153952ddf0
commit b1dfa30ee3
1 changed files with 22 additions and 1 deletions

View File

@ -37,13 +37,18 @@ namespace {
using absl::make_unique;
constexpr int kConnectionTimeoutInSeconds = 10;
constexpr int kConnectionRecoveryTimeoutInSeconds = 60;
constexpr int kTokenRefreshIntervalInSeconds = 60;
const common::Duration kPopTimeout = common::FromMilliseconds(100);
// This defines the '::grpc::StatusCode's that are considered unrecoverable
// errors and hence no retries will be attempted by the client.
const std::set<::grpc::StatusCode> kUnrecoverableStatusCodes = {
::grpc::NOT_FOUND};
::grpc::DEADLINE_EXCEEDED,
::grpc::NOT_FOUND,
::grpc::UNAVAILABLE,
::grpc::UNKNOWN,
};
bool IsNewSubmap(const mapping::proto::Submap& submap) {
return (submap.has_submap_2d() && submap.submap_2d().num_range_data() == 1) ||
@ -151,6 +156,19 @@ void LocalTrajectoryUploader::Shutdown() {
}
void LocalTrajectoryUploader::TryRecovery() {
if (client_channel_->GetState(false /* try_to_connect */) !=
grpc_connectivity_state::GRPC_CHANNEL_READY) {
LOG(INFO) << "Trying to re-connect to uplink...";
std::chrono::system_clock::time_point deadline =
std::chrono::system_clock::now() +
std::chrono::seconds(kConnectionRecoveryTimeoutInSeconds);
if (!client_channel_->WaitForConnected(deadline)) {
LOG(ERROR) << "Failed to re-connect to uplink prior to recovery attempt.";
return;
}
}
LOG(INFO) << "Uplink channel ready, trying recovery.";
// Wind the sensor_data_queue forward to the next new submap.
LOG(INFO) << "LocalTrajectoryUploader tries to recover with next submap.";
while (true) {
@ -184,6 +202,9 @@ void LocalTrajectoryUploader::TryRecovery() {
if (!status.ok()) {
LOG(ERROR) << "Failed to create trajectory. Aborting recovery attempt. "
<< status.error_message();
// Restore the previous state for the next recovery attempt.
local_trajectory_id_to_trajectory_info_ =
local_trajectory_id_to_trajectory_info;
return;
}
}