Skip to content

Commit bc3bda9

Browse files
Merge pull request #8615 from PoornimaSingour/OCPBUGS-57453
OCPBUGS-57453: fix(cpo): use Status.DNSZoneID before querying AWS Route53
2 parents 49cdfcf + 4cba875 commit bc3bda9

4 files changed

Lines changed: 364 additions & 15 deletions

File tree

control-plane-operator/controllers/awsprivatelink/awsprivatelink_controller.go

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ import (
5858
const (
5959
defaultResync = 10 * time.Hour
6060
externalPrivateServiceLabel = "hypershift.openshift.io/external-private-service"
61+
throttleRequeueDelay = 2 * time.Minute
6162
)
6263

6364
// PrivateServiceObserver watches a given Service type LB and reconciles
@@ -531,6 +532,10 @@ func (r *AWSEndpointServiceReconciler) Reconcile(ctx context.Context, req ctrl.R
531532
return ctrl.Result{}, err
532533
}
533534
}
535+
if isAWSThrottleError(err) {
536+
log.Info("AWS rate limit hit, backing off", "requeueAfter", throttleRequeueDelay)
537+
return ctrl.Result{RequeueAfter: throttleRequeueDelay}, nil
538+
}
534539
return ctrl.Result{}, err
535540
}
536541

@@ -552,6 +557,14 @@ func (r *AWSEndpointServiceReconciler) Reconcile(ctx context.Context, req ctrl.R
552557
return ctrl.Result{RequeueAfter: 5 * time.Minute}, nil
553558
}
554559

560+
func isAWSThrottleError(err error) bool {
561+
switch supportawsutil.AWSErrorCode(err) {
562+
case "Throttling", "ThrottlingException", "RequestLimitExceeded", "TooManyRequestsException":
563+
return true
564+
}
565+
return false
566+
}
567+
555568
func hasAWSConfig(platform *hyperv1.PlatformSpec) bool {
556569
return platform.Type == hyperv1.AWSPlatform && platform.AWS != nil && platform.AWS.CloudProviderConfig != nil &&
557570
platform.AWS.CloudProviderConfig.Subnet != nil && platform.AWS.CloudProviderConfig.Subnet.ID != nil
@@ -887,15 +900,19 @@ func (r *AWSEndpointServiceReconciler) reconcileEndpointDNSRecords(ctx context.C
887900

888901
zn := zoneName(hcp.Name)
889902
var zoneID string
890-
if r.awsClientBuilder.getLocalHostedZoneID() == "" {
903+
if localZoneID := r.awsClientBuilder.getLocalHostedZoneID(); localZoneID != "" {
904+
zoneID = localZoneID
905+
} else if awsEndpointService.Status.DNSZoneID != "" {
906+
zoneID = awsEndpointService.Status.DNSZoneID
907+
r.awsClientBuilder.setLocalHostedZoneID(zoneID)
908+
log.Info("using DNSZoneID from status", "zoneID", zoneID)
909+
} else {
891910
var err error
892911
zoneID, err = lookupZoneID(ctx, route53Client, zn)
893912
if err != nil {
894913
return nil, "", err
895914
}
896915
r.awsClientBuilder.setLocalHostedZoneID(zoneID)
897-
} else {
898-
zoneID = r.awsClientBuilder.getLocalHostedZoneID()
899916
}
900917

901918
var fqdns []string
@@ -904,6 +921,12 @@ func (r *AWSEndpointServiceReconciler) reconcileEndpointDNSRecords(ctx context.C
904921
fqdns = append(fqdns, fqdn)
905922
err := CreateRecord(ctx, route53Client, zoneID, fqdn, aws.ToString(endpointDNSEntries[0].DnsName), route53types.RRTypeCname)
906923
if err != nil {
924+
var noSuchZone *route53types.NoSuchHostedZone
925+
if errors.As(err, &noSuchZone) {
926+
r.awsClientBuilder.setLocalHostedZoneID("")
927+
awsEndpointService.Status.DNSZoneID = ""
928+
log.Info("hosted zone not found, clearing cached DNSZoneID", "zoneID", zoneID)
929+
}
907930
return nil, "", err
908931
}
909932
log.Info("DNS record created", "fqdn", fqdn)

control-plane-operator/controllers/awsprivatelink/awsprivatelink_controller_test.go

Lines changed: 322 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1765,3 +1765,325 @@ func TestExtractNLBName(t *testing.T) {
17651765
})
17661766
}
17671767
}
1768+
1769+
func TestReconcileEndpointDNSRecords(t *testing.T) {
1770+
testCases := []struct {
1771+
name string
1772+
awsEndpointSvc *hyperv1.AWSEndpointService
1773+
hcp *hyperv1.HostedControlPlane
1774+
setupMocks func(*gomock.Controller) (*MockawsClientProvider, *awsapi.MockROUTE53API)
1775+
expectedZoneID string
1776+
expectError bool
1777+
expectFQDNCount int
1778+
expectDNSZoneIDCleared bool
1779+
}{
1780+
{
1781+
name: "When Status.DNSZoneID is set and in-memory cache is empty, it should use the status value",
1782+
awsEndpointSvc: &hyperv1.AWSEndpointService{
1783+
ObjectMeta: metav1.ObjectMeta{
1784+
Name: "kube-apiserver-private",
1785+
Namespace: "clusters-test",
1786+
},
1787+
Status: hyperv1.AWSEndpointServiceStatus{
1788+
DNSZoneID: "Z1234567890",
1789+
},
1790+
},
1791+
hcp: &hyperv1.HostedControlPlane{
1792+
ObjectMeta: metav1.ObjectMeta{
1793+
Name: "test-hcp",
1794+
Namespace: "clusters-test",
1795+
},
1796+
},
1797+
setupMocks: func(mockCtrl *gomock.Controller) (*MockawsClientProvider, *awsapi.MockROUTE53API) {
1798+
mockBuilder := NewMockawsClientProvider(mockCtrl)
1799+
mockRoute53 := awsapi.NewMockROUTE53API(mockCtrl)
1800+
1801+
// In-memory cache is empty (simulates pod restart)
1802+
mockBuilder.EXPECT().getLocalHostedZoneID().Return("")
1803+
// Should populate the cache from status
1804+
mockBuilder.EXPECT().setLocalHostedZoneID("Z1234567890")
1805+
1806+
// Route53 ListHostedZones should NOT be called
1807+
// CreateRecord (ChangeResourceRecordSets) should be called with the status zone ID
1808+
mockRoute53.EXPECT().ChangeResourceRecordSets(gomock.Any(), gomock.Any(), gomock.Any()).
1809+
DoAndReturn(func(_ context.Context, input *route53sdk.ChangeResourceRecordSetsInput, _ ...func(*route53sdk.Options)) (*route53sdk.ChangeResourceRecordSetsOutput, error) {
1810+
if aws.ToString(input.HostedZoneId) != "Z1234567890" {
1811+
return nil, fmt.Errorf("unexpected zone ID: %s", aws.ToString(input.HostedZoneId))
1812+
}
1813+
return &route53sdk.ChangeResourceRecordSetsOutput{}, nil
1814+
})
1815+
1816+
return mockBuilder, mockRoute53
1817+
},
1818+
expectedZoneID: "Z1234567890",
1819+
expectError: false,
1820+
expectFQDNCount: 1,
1821+
},
1822+
{
1823+
name: "When both Status.DNSZoneID and in-memory cache are empty, it should call AWS",
1824+
awsEndpointSvc: &hyperv1.AWSEndpointService{
1825+
ObjectMeta: metav1.ObjectMeta{
1826+
Name: "kube-apiserver-private",
1827+
Namespace: "clusters-test",
1828+
},
1829+
Status: hyperv1.AWSEndpointServiceStatus{},
1830+
},
1831+
hcp: &hyperv1.HostedControlPlane{
1832+
ObjectMeta: metav1.ObjectMeta{
1833+
Name: "test-hcp",
1834+
Namespace: "clusters-test",
1835+
},
1836+
},
1837+
setupMocks: func(mockCtrl *gomock.Controller) (*MockawsClientProvider, *awsapi.MockROUTE53API) {
1838+
mockBuilder := NewMockawsClientProvider(mockCtrl)
1839+
mockRoute53 := awsapi.NewMockROUTE53API(mockCtrl)
1840+
1841+
// In-memory cache is empty
1842+
mockBuilder.EXPECT().getLocalHostedZoneID().Return("")
1843+
// Should call lookupZoneID -> ListHostedZones (paginator passes ctx, input, optFns...)
1844+
mockRoute53.EXPECT().ListHostedZones(gomock.Any(), gomock.Any(), gomock.Any()).Return(
1845+
&route53sdk.ListHostedZonesOutput{
1846+
HostedZones: []route53types.HostedZone{
1847+
{
1848+
Id: aws.String("/hostedzone/ZFROMAWS"),
1849+
Name: aws.String("test-hcp.hypershift.local."),
1850+
Config: &route53types.HostedZoneConfig{PrivateZone: true},
1851+
},
1852+
},
1853+
IsTruncated: false,
1854+
}, nil)
1855+
// Should cache the result from AWS
1856+
mockBuilder.EXPECT().setLocalHostedZoneID("ZFROMAWS")
1857+
1858+
// CreateRecord should be called with the zone from AWS
1859+
mockRoute53.EXPECT().ChangeResourceRecordSets(gomock.Any(), gomock.Any(), gomock.Any()).
1860+
DoAndReturn(func(_ context.Context, input *route53sdk.ChangeResourceRecordSetsInput, _ ...func(*route53sdk.Options)) (*route53sdk.ChangeResourceRecordSetsOutput, error) {
1861+
if aws.ToString(input.HostedZoneId) != "ZFROMAWS" {
1862+
return nil, fmt.Errorf("unexpected zone ID: %s", aws.ToString(input.HostedZoneId))
1863+
}
1864+
return &route53sdk.ChangeResourceRecordSetsOutput{}, nil
1865+
})
1866+
1867+
return mockBuilder, mockRoute53
1868+
},
1869+
expectedZoneID: "ZFROMAWS",
1870+
expectError: false,
1871+
expectFQDNCount: 1,
1872+
},
1873+
{
1874+
name: "When in-memory cache is populated, it should use the cache",
1875+
awsEndpointSvc: &hyperv1.AWSEndpointService{
1876+
ObjectMeta: metav1.ObjectMeta{
1877+
Name: "kube-apiserver-private",
1878+
Namespace: "clusters-test",
1879+
},
1880+
Status: hyperv1.AWSEndpointServiceStatus{
1881+
DNSZoneID: "ZOLDVALUE",
1882+
},
1883+
},
1884+
hcp: &hyperv1.HostedControlPlane{
1885+
ObjectMeta: metav1.ObjectMeta{
1886+
Name: "test-hcp",
1887+
Namespace: "clusters-test",
1888+
},
1889+
},
1890+
setupMocks: func(mockCtrl *gomock.Controller) (*MockawsClientProvider, *awsapi.MockROUTE53API) {
1891+
mockBuilder := NewMockawsClientProvider(mockCtrl)
1892+
mockRoute53 := awsapi.NewMockROUTE53API(mockCtrl)
1893+
1894+
// In-memory cache is populated
1895+
mockBuilder.EXPECT().getLocalHostedZoneID().Return("ZCACHED")
1896+
1897+
// Neither ListHostedZones nor setLocalHostedZoneID should be called
1898+
// CreateRecord should use the cached zone ID
1899+
mockRoute53.EXPECT().ChangeResourceRecordSets(gomock.Any(), gomock.Any(), gomock.Any()).
1900+
DoAndReturn(func(_ context.Context, input *route53sdk.ChangeResourceRecordSetsInput, _ ...func(*route53sdk.Options)) (*route53sdk.ChangeResourceRecordSetsOutput, error) {
1901+
if aws.ToString(input.HostedZoneId) != "ZCACHED" {
1902+
return nil, fmt.Errorf("unexpected zone ID: %s", aws.ToString(input.HostedZoneId))
1903+
}
1904+
return &route53sdk.ChangeResourceRecordSetsOutput{}, nil
1905+
})
1906+
1907+
return mockBuilder, mockRoute53
1908+
},
1909+
expectedZoneID: "ZCACHED",
1910+
expectError: false,
1911+
expectFQDNCount: 1,
1912+
},
1913+
{
1914+
name: "When lookupZoneID fails, it should return error",
1915+
awsEndpointSvc: &hyperv1.AWSEndpointService{
1916+
ObjectMeta: metav1.ObjectMeta{
1917+
Name: "kube-apiserver-private",
1918+
Namespace: "clusters-test",
1919+
},
1920+
Status: hyperv1.AWSEndpointServiceStatus{},
1921+
},
1922+
hcp: &hyperv1.HostedControlPlane{
1923+
ObjectMeta: metav1.ObjectMeta{
1924+
Name: "test-hcp",
1925+
Namespace: "clusters-test",
1926+
},
1927+
},
1928+
setupMocks: func(mockCtrl *gomock.Controller) (*MockawsClientProvider, *awsapi.MockROUTE53API) {
1929+
mockBuilder := NewMockawsClientProvider(mockCtrl)
1930+
mockRoute53 := awsapi.NewMockROUTE53API(mockCtrl)
1931+
1932+
mockBuilder.EXPECT().getLocalHostedZoneID().Return("")
1933+
mockRoute53.EXPECT().ListHostedZones(gomock.Any(), gomock.Any(), gomock.Any()).Return(
1934+
nil, fmt.Errorf("Route53 throttling: Rate exceeded"))
1935+
1936+
return mockBuilder, mockRoute53
1937+
},
1938+
expectError: true,
1939+
},
1940+
{
1941+
name: "When CreateRecord fails, it should return error",
1942+
awsEndpointSvc: &hyperv1.AWSEndpointService{
1943+
ObjectMeta: metav1.ObjectMeta{
1944+
Name: "kube-apiserver-private",
1945+
Namespace: "clusters-test",
1946+
},
1947+
Status: hyperv1.AWSEndpointServiceStatus{
1948+
DNSZoneID: "Z1234567890",
1949+
},
1950+
},
1951+
hcp: &hyperv1.HostedControlPlane{
1952+
ObjectMeta: metav1.ObjectMeta{
1953+
Name: "test-hcp",
1954+
Namespace: "clusters-test",
1955+
},
1956+
},
1957+
setupMocks: func(mockCtrl *gomock.Controller) (*MockawsClientProvider, *awsapi.MockROUTE53API) {
1958+
mockBuilder := NewMockawsClientProvider(mockCtrl)
1959+
mockRoute53 := awsapi.NewMockROUTE53API(mockCtrl)
1960+
1961+
mockBuilder.EXPECT().getLocalHostedZoneID().Return("")
1962+
mockBuilder.EXPECT().setLocalHostedZoneID("Z1234567890")
1963+
1964+
mockRoute53.EXPECT().ChangeResourceRecordSets(gomock.Any(), gomock.Any(), gomock.Any()).
1965+
Return(nil, &smithy.GenericAPIError{Code: "Throttling", Message: "Rate exceeded"})
1966+
1967+
return mockBuilder, mockRoute53
1968+
},
1969+
expectError: true,
1970+
},
1971+
{
1972+
name: "When CreateRecord fails with NoSuchHostedZone, it should clear both caches",
1973+
awsEndpointSvc: &hyperv1.AWSEndpointService{
1974+
ObjectMeta: metav1.ObjectMeta{
1975+
Name: "kube-apiserver-private",
1976+
Namespace: "clusters-test",
1977+
},
1978+
Status: hyperv1.AWSEndpointServiceStatus{
1979+
DNSZoneID: "ZSTALE",
1980+
},
1981+
},
1982+
hcp: &hyperv1.HostedControlPlane{
1983+
ObjectMeta: metav1.ObjectMeta{
1984+
Name: "test-hcp",
1985+
Namespace: "clusters-test",
1986+
},
1987+
},
1988+
setupMocks: func(mockCtrl *gomock.Controller) (*MockawsClientProvider, *awsapi.MockROUTE53API) {
1989+
mockBuilder := NewMockawsClientProvider(mockCtrl)
1990+
mockRoute53 := awsapi.NewMockROUTE53API(mockCtrl)
1991+
1992+
mockBuilder.EXPECT().getLocalHostedZoneID().Return("")
1993+
mockBuilder.EXPECT().setLocalHostedZoneID("ZSTALE")
1994+
mockBuilder.EXPECT().setLocalHostedZoneID("")
1995+
1996+
mockRoute53.EXPECT().ChangeResourceRecordSets(gomock.Any(), gomock.Any(), gomock.Any()).
1997+
Return(nil, &route53types.NoSuchHostedZone{Message: aws.String("Hosted zone ZSTALE not found")})
1998+
1999+
return mockBuilder, mockRoute53
2000+
},
2001+
expectError: true,
2002+
expectDNSZoneIDCleared: true,
2003+
},
2004+
}
2005+
2006+
for _, tc := range testCases {
2007+
t.Run(tc.name, func(t *testing.T) {
2008+
g := NewGomegaWithT(t)
2009+
2010+
mockCtrl := gomock.NewController(t)
2011+
mockBuilder, mockRoute53 := tc.setupMocks(mockCtrl)
2012+
2013+
reconciler := &AWSEndpointServiceReconciler{
2014+
awsClientBuilder: mockBuilder,
2015+
}
2016+
2017+
ctx := ctrl.LoggerInto(context.Background(), ctrl.Log.WithName("test"))
2018+
endpointDNSEntries := []ec2types.DnsEntry{
2019+
{DnsName: aws.String("vpce-abc.vpce-svc.us-east-1.vpce.amazonaws.com")},
2020+
}
2021+
2022+
fqdns, zoneID, err := reconciler.reconcileEndpointDNSRecords(ctx, mockRoute53, tc.awsEndpointSvc, tc.hcp, endpointDNSEntries, ctrl.Log.WithName("test"))
2023+
2024+
if tc.expectError {
2025+
g.Expect(err).To(HaveOccurred())
2026+
} else {
2027+
g.Expect(err).ToNot(HaveOccurred())
2028+
g.Expect(zoneID).To(Equal(tc.expectedZoneID))
2029+
g.Expect(fqdns).To(HaveLen(tc.expectFQDNCount))
2030+
}
2031+
if tc.expectDNSZoneIDCleared {
2032+
g.Expect(tc.awsEndpointSvc.Status.DNSZoneID).To(BeEmpty())
2033+
}
2034+
})
2035+
}
2036+
}
2037+
2038+
func TestIsAWSThrottleError(t *testing.T) {
2039+
tests := []struct {
2040+
name string
2041+
err error
2042+
expected bool
2043+
}{
2044+
{
2045+
name: "Throttling error should be detected",
2046+
err: &testAPIError{code: "Throttling"},
2047+
expected: true,
2048+
},
2049+
{
2050+
name: "ThrottlingException should be detected",
2051+
err: &testAPIError{code: "ThrottlingException"},
2052+
expected: true,
2053+
},
2054+
{
2055+
name: "RequestLimitExceeded should be detected",
2056+
err: &testAPIError{code: "RequestLimitExceeded"},
2057+
expected: true,
2058+
},
2059+
{
2060+
name: "TooManyRequestsException should be detected",
2061+
err: &testAPIError{code: "TooManyRequestsException"},
2062+
expected: true,
2063+
},
2064+
{
2065+
name: "Non-throttle AWS error should not be detected",
2066+
err: &testAPIError{code: "NoSuchHostedZone"},
2067+
expected: false,
2068+
},
2069+
{
2070+
name: "Non-AWS error should not be detected",
2071+
err: errors.New("network error"),
2072+
expected: false,
2073+
},
2074+
{
2075+
name: "Nil error should not be detected",
2076+
err: nil,
2077+
expected: false,
2078+
},
2079+
}
2080+
2081+
for _, tt := range tests {
2082+
t.Run(tt.name, func(t *testing.T) {
2083+
result := isAWSThrottleError(tt.err)
2084+
if result != tt.expected {
2085+
t.Errorf("isAWSThrottleError() = %v, want %v", result, tt.expected)
2086+
}
2087+
})
2088+
}
2089+
}

0 commit comments

Comments
 (0)