AZ失敗実験のセットアップ
インスタンスのスケーリング
アベイラビリティゾーン(AZ)障害の影響を完全に確認するために、まずAZごとに2つのインスタンスにスケールアップし、ポッドの数も9つに増やしましょう:
~$export ASG_NAME=$(aws autoscaling describe-auto-scaling-groups --query "AutoScalingGroups[? Tags[? (Key=='eks:cluster-name') && Value=='eks-workshop']].AutoScalingGroupName" --output text)
~$aws autoscaling update-auto-scaling-group \
--auto-scaling-group-name $ASG_NAME \
--desired-capacity 6 \
--min-size 6 \
--max-size 6
~$sleep 60
~$kubectl scale deployment ui --replicas=9 -n ui
~$timeout 10s ~/$SCRIPT_DIR/get-pods-by-az.sh | head -n 30
------us-west-2a------
ip-10-42-100-4.us-west-2.compute.internal:
ui-6dfb84cf67-xbbj4 0/1 ContainerCreating 0 1s
ip-10-42-106-250.us-west-2.compute.internal:
ui-6dfb84cf67-4fjhh 1/1 Running 0 5m20s
ui-6dfb84cf67-gkrtn 1/1 Running 0 5m19s
------us-west-2b------
ip-10-42-139-198.us-west-2.compute.internal:
ui-6dfb84cf67-7rfkf 0/1 ContainerCreating 0 4s
ip-10-42-141-133.us-west-2.compute.internal:
ui-6dfb84cf67-7qnkz 1/1 Running 0 5m23s
ui-6dfb84cf67-n58b9 1/1 Running 0 5m23s
------us-west-2c------
ip-10-42-175-140.us-west-2.compute.internal:
ui-6dfb84cf67-8xfk8 0/1 ContainerCreating 0 8s
ui-6dfb84cf67-s55nb 0/1 ContainerCreating 0 8s
ip-10-42-179-59.us-west-2.compute.internal:
ui-6dfb84cf67-lvdc2 1/1 Running 0 5m26s
合成カナリアの設定
実験を始める前に、ハートビートモニタリング用の合成カナリアを設定しましょう:
-
まず、カナリアのアーティファクト用にS3バケットを作成します:
~$export BUCKET_NAME="eks-workshop-canary-artifacts-$(date +%s)"~$aws s3 mb s3://$BUCKET_NAME --region $AWS_REGIONmake_bucket: eks-workshop-canary-artifacts-1724131402
-
ブループリントを作成します:
~/environment/eks-workshop/modules/observability/resiliency/scripts/create-blueprint.sh#!/bin/bash
# Get Ingress URL
INGRESS_URL=$(kubectl get ingress -n ui -o jsonpath='{.items[0].status.loadBalancer.ingress[0].hostname}')
# Create the required directory structure
mkdir -p nodejs/node_modules
# Create the Node.js canary script with heartbeat blueprint
cat << EOF > nodejs/node_modules/canary.js
const { URL } = require('url');
const synthetics = require('Synthetics');
const log = require('SyntheticsLogger');
const syntheticsConfiguration = synthetics.getConfiguration();
const syntheticsLogHelper = require('SyntheticsLogHelper');
const loadBlueprint = async function () {
const urls = ['http://${INGRESS_URL}'];
// Set screenshot option
const takeScreenshot = true;
// Configure synthetics settings
syntheticsConfiguration.disableStepScreenshots();
syntheticsConfiguration.setConfig({
continueOnStepFailure: true,
includeRequestHeaders: true,
includeResponseHeaders: true,
restrictedHeaders: [],
restrictedUrlParameters: []
});
let page = await synthetics.getPage();
for (const url of urls) {
await loadUrl(page, url, takeScreenshot);
}
};
// Reset the page in-between
const resetPage = async function(page) {
try {
await page.goto('about:blank', {waitUntil: ['load', 'networkidle0'], timeout: 30000});
} catch (e) {
synthetics.addExecutionError('Unable to open a blank page. ', e);
}
};
const loadUrl = async function (page, url, takeScreenshot) {
let stepName = null;
let domcontentloaded = false;
try {
stepName = new URL(url).hostname;
} catch (e) {
const errorString = \`Error parsing url: \${url}. \${e}\`;
log.error(errorString);
throw e;
}
await synthetics.executeStep(stepName, async function () {
const sanitizedUrl = syntheticsLogHelper.getSanitizedUrl(url);
const response = await page.goto(url, { waitUntil: ['domcontentloaded'], timeout: 30000});
if (response) {
domcontentloaded = true;
const status = response.status();
const statusText = response.statusText();
logResponseString = \`Response from url: \${sanitizedUrl} Status: \${status} Status Text: \${statusText}\`;
if (response.status() < 200 || response.status() > 299) {
throw new Error(\`Failed to load url: \${sanitizedUrl} \${response.status()} \${response.statusText()}\`);
}
} else {
const logNoResponseString = \`No response returned for url: \${sanitizedUrl}\`;
log.error(logNoResponseString);
throw new Error(logNoResponseString);
}
});
// Wait for 15 seconds to let page load fully before taking screenshot.
if (domcontentloaded && takeScreenshot) {
await new Promise(r => setTimeout(r, 15000));
await synthetics.takeScreenshot(stepName, 'loaded');
}
// Reset page
await resetPage(page);
};
exports.handler = async () => {
return await loadBlueprint();
};
EOF
# Zip the Node.js script
python3 - << EOL
import zipfile
with zipfile.ZipFile('canary.zip', 'w') as zipf:
zipf.write('nodejs/node_modules/canary.js', arcname='nodejs/node_modules/canary.js')
EOL
# Ensure BUCKET_NAME is set
if [ -z "$BUCKET_NAME" ]; then
echo "Error: BUCKET_NAME environment variable is not set."
exit 1
fi
# Upload the zipped canary script to S3
aws s3 cp canary.zip "s3://${BUCKET_NAME}/canary-scripts/canary.zip"
echo "Canary script has been zipped and uploaded to s3://${BUCKET_NAME}/canary-scripts/canary.zip"
echo "The script is configured to check the URL: http://${INGRESS_URL}"このカナリアブループリントをバケットに配置します:
~$~/$SCRIPT_DIR/create-blueprint.shupload: ./canary.zip to s3://eks-workshop-canary-artifacts-1724131402/canary-scripts/canary.zip
Canary script has been zipped and uploaded to s3://eks-workshop-canary-artifacts-1724131402/canary-scripts/canary.zip
The script is configured to check the URL: http://k8s-ui-ui-5ddc3ba496-721427594.us-west-2.elb.amazonaws.com
-
Cloudwatchアラームを備えた合成カナリアを作成します:
~$aws synthetics create-canary \--name eks-workshop-canary \--artifact-s3-location "s3://$BUCKET_NAME/canary-artifacts/" \--execution-role-arn $CANARY_ROLE_ARN \--runtime-version syn-nodejs-puppeteer-9.0 \--schedule "Expression=rate(1 minute)" \--code "Handler=canary.handler,S3Bucket=$BUCKET_NAME,S3Key=canary-scripts/canary.zip" \--region $AWS_REGION~$sleep 40~$aws synthetics describe-canaries --name eks-workshop-canary --region $AWS_REGION~$aws synthetics start-canary --name eks-workshop-canary --region $AWS_REGION~$aws cloudwatch put-metric-alarm \--alarm-name "eks-workshop-canary-alarm" \--metric-name SuccessPercent \--namespace CloudWatchSynthetics \--statistic Average \--period 60 \--threshold 95 \--comparison-operator LessThanThreshold \--dimensions Name=CanaryName,Value=eks-workshop-canary \--evaluation-periods 1 \--alarm-description "Alarm when Canary success rate drops below 95%" \--unit Percent \--region $AWS_REGION
これにより、1分ごとにアプリケーションの健全性をチェックするカナリアと、成功率が95%を下回ると発報するCloudWatchアラームが設定されます。
これらの手順を完了すると、アプリケーションはAZ全体に2つのインスタンスにスケーリングされ、今後のAZ障害シミュレーション実験に必要なモニタリングが設定されました。