ClickHouse Analytics Database

ClickHouse เป็น Columnar Database สำหรับ OLAP ประมวลผล Queries เร็วมาก รองรับข้อมูลหลายพันล้าน Rows ใช้ Column-oriented Storage บีบอัดข้อมูลได้ดี เหมาะกับ Analytics

Home Lab ให้ทดลอง ClickHouse ที่บ้าน ไม่ต้องจ่ายค่า Cloud ใช้ PC เก่าหรือ Mini PC ติดตั้งผ่าน Docker เรียนรู้ Data Engineering จริง

ClickHouse Installation

# === ClickHouse Installation สำหรับ Home Lab ===

# 1. Docker Installation (แนะนำ)
docker run -d \
  --name clickhouse \
  --ulimit nofile=262144:262144 \
  -p 8123:8123 \
  -p 9000:9000 \
  -v clickhouse_data:/var/lib/clickhouse \
  -v clickhouse_logs:/var/log/clickhouse-server \
  clickhouse/clickhouse-server:latest

# 2. Docker Compose (Production-ready)
# docker-compose.yml
# version: '3.8'
# services:
#   clickhouse:
#     image: clickhouse/clickhouse-server:latest
#     ports:
#       - "8123:8123"   # HTTP
#       - "9000:9000"   # Native
#     volumes:
#       - ./data:/var/lib/clickhouse
#       - ./logs:/var/log/clickhouse-server
#       - ./config:/etc/clickhouse-server/config.d
#       - ./users:/etc/clickhouse-server/users.d
#     environment:
#       CLICKHOUSE_DB: analytics
#       CLICKHOUSE_USER: admin
#       CLICKHOUSE_PASSWORD: secure_password
#     ulimits:
#       nofile:
#         soft: 262144
#         hard: 262144
#     deploy:
#       resources:
#         limits:
#           memory: 8G
#
#   grafana:
#     image: grafana/grafana:latest
#     ports:
#       - "3000:3000"
#     environment:
#       GF_INSTALL_PLUGINS: grafana-clickhouse-datasource
#     volumes:
#       - grafana_data:/var/lib/grafana

# 3. Native Installation (Ubuntu)
sudo apt-get install -y apt-transport-https ca-certificates
sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 8919F6BD2B48D754
echo "deb https://packages.clickhouse.com/deb stable main" | sudo tee /etc/apt/sources.list.d/clickhouse.list
sudo apt-get update
sudo apt-get install -y clickhouse-server clickhouse-client

sudo systemctl start clickhouse-server
sudo systemctl enable clickhouse-server

# 4. ตรวจสอบ
clickhouse-client --query "SELECT version()"
curl http://localhost:8123/?query=SELECT+1

# 5. Performance Tuning (config.d/performance.xml)
# 
#   8000000000
#   8
#   100
#   5368709120
# 

echo "ClickHouse installed"
echo "  HTTP: http://localhost:8123"
echo "  Native: localhost:9000"
echo "  Client: clickhouse-client"

Table Design และ Data Loading

# === ClickHouse Table Design สำหรับ Analytics ===

# clickhouse-client
# หรือ curl -X POST http://localhost:8123 --data-binary @query.sql

# 1. Web Analytics Table
# CREATE TABLE web_events (
#     event_date Date,
#     event_time DateTime,
#     user_id UInt64,
#     session_id String,
#     event_type LowCardinality(String),
#     page_url String,
#     referrer String,
#     device LowCardinality(String),
#     browser LowCardinality(String),
#     country LowCardinality(String),
#     city String,
#     duration_ms UInt32,
#     is_bounce UInt8
# ) ENGINE = MergeTree()
# PARTITION BY toYYYYMM(event_date)
# ORDER BY (event_date, user_id, event_time)
# TTL event_date + INTERVAL 1 YEAR
# SETTINGS index_granularity = 8192;

# 2. Server Metrics Table
# CREATE TABLE server_metrics (
#     timestamp DateTime,
#     hostname LowCardinality(String),
#     metric_name LowCardinality(String),
#     value Float64,
#     tags Map(String, String)
# ) ENGINE = MergeTree()
# PARTITION BY toYYYYMMDD(timestamp)
# ORDER BY (hostname, metric_name, timestamp)
# TTL timestamp + INTERVAL 90 DAY;

# 3. Materialized View สำหรับ Real-time Aggregation
# CREATE MATERIALIZED VIEW hourly_page_views
# ENGINE = SummingMergeTree()
# PARTITION BY toYYYYMM(hour)
# ORDER BY (hour, page_url, country)
# AS SELECT
#     toStartOfHour(event_time) AS hour,
#     page_url,
#     country,
#     count() AS views,
#     uniq(user_id) AS unique_users,
#     avg(duration_ms) AS avg_duration
# FROM web_events
# GROUP BY hour, page_url, country;

# 4. Python Data Loading
# pip install clickhouse-connect pandas

import clickhouse_connect
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

def setup_clickhouse():
    """Setup ClickHouse Connection และ Tables"""
    client = clickhouse_connect.get_client(
        host='localhost', port=8123,
        username='default', password='',
    )

    # สร้าง Database
    client.command("CREATE DATABASE IF NOT EXISTS analytics")

    # สร้าง Table
    client.command("""
        CREATE TABLE IF NOT EXISTS analytics.web_events (
            event_date Date,
            event_time DateTime,
            user_id UInt64,
            session_id String,
            event_type LowCardinality(String),
            page_url String,
            country LowCardinality(String),
            duration_ms UInt32
        ) ENGINE = MergeTree()
        PARTITION BY toYYYYMM(event_date)
        ORDER BY (event_date, user_id, event_time)
    """)

    return client

def generate_sample_data(n=100000):
    """สร้างข้อมูลตัวอย่าง"""
    np.random.seed(42)
    pages = ["/home", "/products", "/cart", "/checkout", "/blog", "/about"]
    countries = ["TH", "US", "JP", "SG", "MY", "ID", "VN", "PH"]
    events = ["page_view", "click", "scroll", "purchase"]

    dates = [datetime.now() - timedelta(days=np.random.randint(0, 90)) for _ in range(n)]

    data = {
        "event_date": [d.date() for d in dates],
        "event_time": dates,
        "user_id": np.random.randint(1, 10000, n),
        "session_id": [f"s{i}" for i in np.random.randint(1, 50000, n)],
        "event_type": np.random.choice(events, n),
        "page_url": np.random.choice(pages, n),
        "country": np.random.choice(countries, n),
        "duration_ms": np.random.randint(100, 30000, n),
    }

    return pd.DataFrame(data)

# client = setup_clickhouse()
# df = generate_sample_data(1000000)
# client.insert_df("analytics.web_events", df)
# print(f"Inserted {len(df)} rows")

print("ClickHouse Tables:")
print("  web_events: Web analytics events")
print("  server_metrics: Server monitoring")
print("  hourly_page_views: Materialized View")

Analytics Queries

# === ClickHouse Analytics Queries ===

queries = {
    "Daily Active Users": """
        SELECT
            event_date,
            uniq(user_id) AS dau,
            count() AS total_events,
            avg(duration_ms) AS avg_duration
        FROM analytics.web_events
        WHERE event_date >= today() - 30
        GROUP BY event_date
        ORDER BY event_date
    """,

    "Top Pages by Views": """
        SELECT
            page_url,
            count() AS views,
            uniq(user_id) AS unique_users,
            avg(duration_ms) AS avg_duration,
            countIf(is_bounce = 1) * 100.0 / count() AS bounce_rate
        FROM analytics.web_events
        WHERE event_date >= today() - 7
        GROUP BY page_url
        ORDER BY views DESC
        LIMIT 10
    """,

    "User Funnel": """
        SELECT
            countIf(event_type = 'page_view') AS views,
            countIf(event_type = 'click') AS clicks,
            countIf(event_type = 'scroll') AS scrolls,
            countIf(event_type = 'purchase') AS purchases,
            round(countIf(event_type = 'purchase') * 100.0 /
                  countIf(event_type = 'page_view'), 2) AS conversion_rate
        FROM analytics.web_events
        WHERE event_date >= today() - 30
    """,

    "Country Distribution": """
        SELECT
            country,
            uniq(user_id) AS users,
            count() AS events,
            round(avg(duration_ms) / 1000, 1) AS avg_seconds
        FROM analytics.web_events
        WHERE event_date >= today() - 30
        GROUP BY country
        ORDER BY users DESC
    """,

    "Hourly Traffic Pattern": """
        SELECT
            toHour(event_time) AS hour,
            count() AS events,
            uniq(user_id) AS users
        FROM analytics.web_events
        WHERE event_date = today()
        GROUP BY hour
        ORDER BY hour
    """,

    "Retention (Day 1, 7, 30)": """
        WITH first_seen AS (
            SELECT user_id, min(event_date) AS first_date
            FROM analytics.web_events
            GROUP BY user_id
        )
        SELECT
            first_date,
            count() AS cohort_size,
            countIf(dateDiff('day', first_date, event_date) = 1) AS day1,
            countIf(dateDiff('day', first_date, event_date) = 7) AS day7,
            countIf(dateDiff('day', first_date, event_date) = 30) AS day30
        FROM analytics.web_events
        JOIN first_seen USING (user_id)
        WHERE first_date >= today() - 60
        GROUP BY first_date
        ORDER BY first_date
    """,
}

print("ClickHouse Analytics Queries:")
for name, query in queries.items():
    lines = len(query.strip().split("\n"))
    print(f"  {name} ({lines} lines)")

# รัน Query
# client = setup_clickhouse()
# for name, query in queries.items():
#     result = client.query(query)
#     print(f"\n{name}:")
#     for row in result.result_rows[:5]:
#         print(f"  {row}")

Best Practices

  • Partition by Date: ใช้ PARTITION BY toYYYYMM(date) สำหรับ Time Series Data
  • ORDER BY: เลือก ORDER BY ตาม Query Pattern ที่ใช้บ่อยที่สุด
  • LowCardinality: ใช้ LowCardinality สำหรับ Columns ที่มีค่าซ้ำกันมาก
  • Materialized Views: ใช้ Materialized Views สำหรับ Pre-aggregate Data
  • TTL: ตั้ง TTL ลบข้อมูลเก่าอัตโนมัติ ประหยัด Storage
  • Grafana Dashboard: ใช้ Grafana + ClickHouse Plugin สำหรับ Visualization

ClickHouse คืออะไร

Open-source Columnar Database สำหรับ OLAP พัฒนาโดย Yandex ประมวลผล Queries เร็วมาก รองรับหลายพันล้าน Rows Column-oriented Storage บีบอัดดี Log Analytics Time Series BI