Distributed Tracing DX

Distributed Tracing Developer Experience OpenTelemetry Jaeger Trace Context Span Auto-instrumentation Observability Microservices Debug Performance Latency

Tool | Type | Storage | UI | เหมาะกับ
Jaeger | Tracing Backend | Elasticsearch/Cassandra | Built-in | Production
Tempo | Tracing Backend | Object Storage | Grafana | Grafana Stack
Zipkin | Tracing Backend | Multiple | Built-in | Simple Setup
OpenTelemetry | SDK + Collector | N/A (sends to backend) | N/A | Instrumentation
Datadog APM | SaaS | Cloud | Cloud | Enterprise

OpenTelemetry Setup

# === OpenTelemetry Auto-instrumentation ===

# pip install opentelemetry-api opentelemetry-sdk \
#   opentelemetry-exporter-otlp \
#   opentelemetry-instrumentation-flask \
#   opentelemetry-instrumentation-requests \
#   opentelemetry-instrumentation-sqlalchemy \
#   opentelemetry-instrumentation-redis

# Auto-instrumentation — Zero Code Change
# opentelemetry-instrument \
#   --traces_exporter otlp \
#   --metrics_exporter otlp \
#   --exporter_otlp_endpoint http://localhost:4317 \
#   --service_name my-api \
#   python app.py

# Manual Instrumentation — Custom Spans
# from opentelemetry import trace
# from opentelemetry.sdk.trace import TracerProvider
# from opentelemetry.sdk.trace.export import BatchSpanProcessor
# from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
# from opentelemetry.sdk.resources import Resource
#
# resource = Resource.create({"service.name": "order-service", "service.version": "1.0"})
# provider = TracerProvider(resource=resource)
# exporter = OTLPSpanExporter(endpoint="http://localhost:4317")
# provider.add_span_processor(BatchSpanProcessor(exporter))
# trace.set_tracer_provider(provider)
#
# tracer = trace.get_tracer("order-service")
#
# @app.route("/orders", methods=["POST"])
# def create_order():
#     with tracer.start_as_current_span("create_order") as span:
#         span.set_attribute("order.customer_id", customer_id)
#         span.set_attribute("order.total", total)
#
#         with tracer.start_as_current_span("validate_inventory"):
#             validate_inventory(items)
#
#         with tracer.start_as_current_span("process_payment"):
#             process_payment(total)
#
#         with tracer.start_as_current_span("send_confirmation"):
#             send_email(customer_email)
#
#         span.set_status(trace.StatusCode.OK)
#         return jsonify({"order_id": order_id})

# Docker Compose — Dev Environment
# version: '3.8'
# services:
#   jaeger:
#     image: jaegertracing/all-in-one:latest
#     ports:
#       - "16686:16686"  # UI
#       - "4317:4317"    # OTLP gRPC
#       - "4318:4318"    # OTLP HTTP
#     environment:
#       - COLLECTOR_OTLP_ENABLED=true

from dataclasses import dataclass

@dataclass
class InstrumentationLib:
    """One catalogue entry describing an instrumentation library.

    Instances are plain records: the four fields are stored as given and
    are only read back when the catalogue is printed.
    """

    # Display name of the library or framework (e.g. "Flask/FastAPI").
    library: str
    # Whether the entry is tagged "Auto" (True) or "Manual" (False) in the report.
    auto: bool
    # Short description of the kind of spans the library produces.
    spans_created: str
    # Space-separated attribute names recorded on those spans.
    attributes: str

# Catalogue of libraries covered by auto-instrumentation, in display order.
libs = [
    InstrumentationLib(
        library="Flask/FastAPI",
        auto=True,
        spans_created="HTTP request spans",
        attributes="method path status_code",
    ),
    InstrumentationLib(
        library="requests/httpx",
        auto=True,
        spans_created="Outgoing HTTP spans",
        attributes="url method status",
    ),
    InstrumentationLib(
        library="SQLAlchemy",
        auto=True,
        spans_created="Database query spans",
        attributes="db.system db.statement",
    ),
    InstrumentationLib(
        library="Redis",
        auto=True,
        spans_created="Redis command spans",
        attributes="db.system db.statement",
    ),
    InstrumentationLib(
        library="Celery",
        auto=True,
        spans_created="Task execution spans",
        attributes="task.name task.id",
    ),
    InstrumentationLib(
        library="gRPC",
        auto=True,
        spans_created="RPC call spans",
        attributes="rpc.method rpc.service",
    ),
]

# Print a three-line summary per library: tag + name, spans, attributes.
print("=== Auto-instrumentation Libraries ===")
for lib in libs:
    if lib.auto:
        tag = "Auto"
    else:
        tag = "Manual"
    print(f"  [{tag}] {lib.library}")
    print(f"    Spans: {lib.spans_created}")
    print(f"    Attributes: {lib.attributes}")

Developer Workflow

# === DX-focused Tracing Workflow ===

# Local Development Setup
# 1. docker compose up jaeger
# 2. Set env: OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
# 3. Run service with auto-instrumentation
# 4. Open Jaeger UI: http://localhost:16686
# 5. Make request → See trace immediately

# Trace-based Debugging
# Instead of: grep -r "order-123" logs/*.log
# Now: Search Jaeger by trace_id or order_id tag
# See: Full request flow across all services
#   → API Gateway (2ms)
#     → Auth Service (5ms)
#     → Order Service (150ms)
#       → Inventory Check (30ms)
#       → Payment Service (100ms) ← SLOW!
#         → Stripe API (95ms)
#       → Email Service (15ms)

@dataclass
class DXImprovement:
    """A before/after comparison of one developer-experience change.

    All four fields are free-form text that is printed verbatim in the
    "DX Improvements" report.
    """

    # How the task was done without tracing.
    before: str
    # How the same task is done with tracing in place.
    after: str
    # Human-readable time comparison, e.g. "30min → 2min".
    time_saved: str
    # Short note on what the change means for developers.
    developer_impact: str

# Before/after pairs shown in the report, in display order.
improvements = [
    DXImprovement(
        before="grep logs across 5 services",
        after="Search by trace_id in Jaeger",
        time_saved="30min → 2min",
        developer_impact="Debug ง่ายขึ้นมาก",
    ),
    DXImprovement(
        before="Guess which service is slow",
        after="See latency breakdown in trace",
        time_saved="1hr → 5min",
        developer_impact="หา Bottleneck ทันที",
    ),
    DXImprovement(
        before="Add print/log statements",
        after="Auto-instrumentation ไม่ต้องแก้ Code",
        time_saved="Setup: 2hr → 10min",
        developer_impact="ไม่ต้องแก้ Code เลย",
    ),
    DXImprovement(
        before="Ask team which service failed",
        after="See error span with stack trace",
        time_saved="Variable → 1min",
        developer_impact="Self-service debugging",
    ),
    DXImprovement(
        before="No visibility in local dev",
        after="Jaeger Docker for local traces",
        time_saved="N/A → Instant",
        developer_impact="เห็น Trace ตั้งแต่ Dev",
    ),
]

# Print each comparison as four labelled lines followed by a blank separator.
print("=== DX Improvements ===")
for change in improvements:
    print(f"  Before: {change.before}")
    print(f"  After:  {change.after}")
    print(f"  Time Saved: {change.time_saved}")
    print(f"  Impact: {change.developer_impact}")
    print()

Production Setup

# === Production Tracing Architecture ===

# OpenTelemetry Collector Config
# # otel-collector-config.yaml
# receivers:
#   otlp:
#     protocols:
#       grpc:
#         endpoint: 0.0.0.0:4317
#       http:
#         endpoint: 0.0.0.0:4318
#
# processors:
#   batch:
#     timeout: 5s
#     send_batch_size: 1000
#   tail_sampling:
#     decision_wait: 10s
#     policies:
#       - name: errors
#         type: status_code
#         status_code: {status_codes: [ERROR]}
#       - name: slow
#         type: latency
#         latency: {threshold_ms: 500}
#       - name: sample
#         type: probabilistic
#         probabilistic: {sampling_percentage: 10}
#
# exporters:
#   otlp/jaeger:
#     endpoint: jaeger-collector:4317
#     tls:
#       insecure: true
#
# service:
#   pipelines:
#     traces:
#       receivers: [otlp]
#       processors: [tail_sampling, batch]  # sample first, then batch what is kept
#       exporters: [otlp/jaeger]

@dataclass
class TracingMetric:
    """A single line of the production tracing metrics report.

    Every field is display text; nothing is parsed or compared numerically.
    """

    # Name of the metric being reported.
    metric: str
    # Current reading, as text (e.g. "3ms", "95% of services").
    value: str
    # Desired value or threshold, as text (e.g. "<10ms").
    target: str
    # Short status label printed in brackets in front of the metric.
    status: str

# Snapshot of production tracing health indicators, in display order.
prod_metrics = [
    TracingMetric(
        metric="Trace Coverage",
        value="95% of services",
        target="100%",
        status="Good",
    ),
    TracingMetric(
        metric="Avg Spans/Trace",
        value="12",
        target="<20",
        status="OK",
    ),
    TracingMetric(
        metric="Sampling Rate",
        value="10% + 100% errors",
        target="Adaptive",
        status="OK",
    ),
    TracingMetric(
        metric="Collector Latency",
        value="3ms",
        target="<10ms",
        status="Good",
    ),
    TracingMetric(
        metric="Storage (30 days)",
        value="850GB",
        target="<1TB",
        status="OK",
    ),
    TracingMetric(
        metric="MTTR (Mean Time to Resolve)",
        value="15min",
        target="<30min",
        status="Good",
    ),
    TracingMetric(
        metric="Developer Adoption",
        value="85%",
        target=">90%",
        status="Improving",
    ),
]

# One line per metric: "[status] name: value (Target: target)".
print("Production Tracing Metrics:")
for entry in prod_metrics:
    print(f"  [{entry.status}] {entry.metric}: {entry.value} (Target: {entry.target})")

เคล็ดลับ

  • Auto: เริ่มจาก Auto-instrumentation ก่อน ไม่ต้องแก้ Code
  • Local: ใช้ Jaeger Docker ดู Trace ตั้งแต่ตอน Dev
  • Sampling: ใช้ Tail Sampling เก็บ 100% Errors + Sample ปกติ
  • Context: ใส่ Attribute สำคัญ user_id order_id ใน Span
  • Correlate: เชื่อม Trace ID กับ Log และ Metrics

Distributed Tracing คืออะไร

ติดตาม Request หลาย Service Trace ID Span Operation Duration Status Error Parent-Child Tree Debug Performance Bottleneck