Use Nginx to load balance LLM services

Written by
Caleb Hayes
Updated on: July 14, 2025
Recommendation

Use Nginx to load balance LLM services and optimize performance in high-concurrency scenarios.

Core content:
1. Introduction to Nginx as a high-performance Web server and its characteristics
2. Nginx installation steps and configuration file editing
3. Introduction to load-balancing algorithms, a load-balancing test, and a chat service example

    Imagine there are many algorithm service instances. Load imbalance will cause:


    • Some nodes come under heavy compute pressure, increasing user latency
    • Other nodes sit idle, wasting resources

    Requests need to be distributed to different nodes for processing so that the load on each node is at an appropriate level. This is load balancing.
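
    To make the idea concrete, here is a minimal Python sketch (purely illustrative, not nginx code) of round-robin dispatch across two hypothetical backend instances:

    from itertools import cycle

    nodes = ["127.0.0.1:8001", "127.0.0.1:8002"]   # hypothetical backend instances
    picker = cycle(nodes)                          # round-robin: take nodes in turn

    for request_id in range(6):
        print(f"request {request_id} -> {next(picker)}")
    # Requests alternate between the two nodes, so neither is overloaded nor idle.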

    1. Introduction

    Nginx is an open-source, high-performance web server that is also widely used as a reverse proxy, load balancer, and HTTP cache. It was designed to overcome the performance bottlenecks of traditional servers (such as Apache) in high-concurrency scenarios, and it has become one of the most popular web servers in the world.

    Features:

    • High performance: event-driven, asynchronous architecture; a single machine can handle tens of thousands of concurrent connections.

    • Lightweight: low memory usage and simple configuration.

    • Flexible algorithms: Round Robin, Weighted Round Robin, IP Hash, Least Connections, etc.

    2. Nginx Installation

    Taking Ubuntu as an example:

    sudo apt install nginx

    Edit the configuration file /etc/nginx/nginx.conf and add the following inside the http block:

    http {
        upstream backend {
            least_conn;   # Balancing algorithm
            server 127.0.0.1:8001;   # Backend service 1
            server 127.0.0.1:8002;   # Backend service 2
        }
        server {
            listen 80;
            location / {
                proxy_pass http://backend;
            }
        }
        
        log_format upstream_log  '$remote_addr - $remote_user [$time_local] "$request" '
                               '$status $body_bytes_sent "$http_referer" '
                               '"$http_user_agent" "$http_x_forwarded_for" '
                               'to: $upstream_addr' ;

        access_log /var/log/nginx/upstream.log upstream_log;
    }

    The complete configuration is as follows:

    # cat /etc/nginx/nginx.conf
    user www-data;
    worker_processes auto;
    pid /run/nginx.pid;
    include /etc/nginx/modules-enabled/*.conf;

    events {
            worker_connections 768;
            # multi_accept on;
    }

    http {
        ##
        # Basic Settings
        ##

        sendfile on;
        tcp_nopush on;
        types_hash_max_size 2048;
        # server_tokens off;

        # server_names_hash_bucket_size 64;
        # server_name_in_redirect off;

        include /etc/nginx/mime.types;
        default_type application/octet-stream;

        ##
        # Logging Settings
        ##

        log_format upstream_log  '$remote_addr:$remote_port $request_uri - $remote_user [$time_local] "$request" '
                               '$status $body_bytes_sent "$http_referer" '
                               '"$http_user_agent" "$http_x_forwarded_for" '
                               'to: $upstream_addr' ;

        access_log /var/log/nginx/access.log upstream_log;
        error_log /var/log/nginx/error.log;

        ##
        # SSL Settings
        ##

        ssl_protocols TLSv1 TLSv1.1 TLSv1.2 TLSv1.3;  # Dropping SSLv3, ref: POODLE
        ssl_prefer_server_ciphers on;

        ##
        # Gzip Settings
        ##

        gzip on;
        # gzip_vary on;
        # gzip_proxied any;
        # gzip_comp_level 6;
        # gzip_buffers 16 8k;
        # gzip_http_version 1.1;
        # gzip_types text/plain text/css application/json application/javascript text/xml application/xml application/xml+rss text/javascript;

        ##
        # Upstream Servers
        ##

        upstream backend {
            least_conn;
            server 127.0.0.1:8001;   # Backend service 1
            server 127.0.0.1:8002;   # Backend service 2
        }

        ##
        # Server Blocks
        ##

        server {
            listen 80;
            server_name localhost;
            location / {
                proxy_pass http://backend;
                proxy_set_header Host $host;
                proxy_set_header X-Real-IP $remote_addr;
                proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
                proxy_set_header X-Forwarded-Proto $scheme;
                #proxy_http_version 1.1;   # Ensure HTTP/1.1 is used
                #proxy_set_header Connection '';
            }
        }
        add_header X-Upstream $upstream_addr always;   # Expose the chosen backend address in the response header
        ##
        # Virtual Host Configs
        ##

        include /etc/nginx/conf.d/*.conf;
        include /etc/nginx/sites-enabled/*;
    }

    #mail {
    # # See sample authentication script at:
    # # http://wiki.nginx.org/ImapAuthenticateWithApachePhpScript
    #
    # # auth_http localhost/auth.php;
    # # pop3_capabilities "TOP" "USER";
    # # imap_capabilities "IMAP4rev1" "UIDPLUS";
    #
    # server {
    # listen localhost:110;
    # protocol pop3;
    # proxy on;
    # }
    #
    # server {
    # listen localhost:143;
    # protocol imap;
    # proxy on;
    # }
    #}

    After writing the configuration, run nginx -t to confirm the configuration is valid, then systemctl reload nginx to apply it.

    3. Load balancing algorithms

    In the configuration above, least_conn is the balancing algorithm; several algorithms are available:

    • Round Robin. Default algorithm; no explicit declaration is required. How it works: requests are handed to the backend servers one by one, in order. Weighted version: adjust the distribution ratio with the weight parameter.
    upstream backend {
        server 127.0.0.1:8001 weight=3;   # 60% of requests
        server 127.0.0.1:8002 weight=2;   # 40% of requests
    }
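
    A quick, illustrative check of the 3:2 split above (nginx itself uses a smoothed weighted round-robin, but the long-run ratio is the same):

    from collections import Counter
    from itertools import cycle, islice

    # Expand the upstream list according to its weights (3:2), then cycle through it.
    weighted = ["127.0.0.1:8001"] * 3 + ["127.0.0.1:8002"] * 2
    counts = Counter(islice(cycle(weighted), 1000))
    print(counts)   # 600 requests to :8001 (60%), 400 to :8002 (40%)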
    • Least Connections. Syntax: least_conn. How it works: requests are sent to the backend with the fewest active connections. Applicable scenario: backend processing times vary widely.
    upstream backend {
        least_conn;
        server 127.0.0.1:8001;
        server 127.0.0.1:8002;
    }
    • IP Hash. Syntax: ip_hash. How it works: each client is pinned to a fixed backend based on a hash of the client IP. Applicable scenario: session persistence is required.
    upstream backend {
        ip_hash;
        server 127.0.0.1:8001;
        server 127.0.0.1:8002;
    }
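
    The idea behind ip_hash, sketched in Python (nginx's real implementation hashes the first three octets of an IPv4 address; this toy version just hashes the whole string):

    import hashlib

    servers = ["127.0.0.1:8001", "127.0.0.1:8002"]

    def pick_server(client_ip: str) -> str:
        # The same client IP always hashes to the same backend, so its session sticks.
        digest = int(hashlib.md5(client_ip.encode()).hexdigest(), 16)
        return servers[digest % len(servers)]

    print(pick_server("203.0.113.7"))   # always the same backend for this client
    print(pick_server("203.0.113.7"))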
    • Generic Hash. Syntax: hash key [consistent]. How it works: the backend is chosen from a hash of a custom key (such as the URI or a request parameter); consistent enables consistent hashing, which reduces remapping when backends are added or removed.

    Consistent hashing uses a hash ring and virtual nodes to avoid the large-scale key remapping and load imbalance that plain modulo hashing suffers when nodes change in a dynamic environment. Its core idea is to limit the impact of node changes, while virtual nodes keep the distribution balanced, making it a key technique for high availability and scalability in distributed systems (see the sketch after the configuration below).

    upstream backend {
        hash $request_uri consistent;   # Distribute by request URI
        server 127.0.0.1:8001;
        server 127.0.0.1:8002;
    }
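
    For intuition, here is a compact, illustrative consistent-hash ring with virtual nodes (this is not nginx's internal implementation):

    import bisect
    import hashlib

    def _hash(key: str) -> int:
        return int(hashlib.md5(key.encode()).hexdigest(), 16)

    class HashRing:
        def __init__(self, nodes, vnodes=100):
            # Place `vnodes` virtual points per node on the ring to even out the distribution.
            self.ring = sorted((_hash(f"{node}#{i}"), node) for node in nodes for i in range(vnodes))
            self.points = [h for h, _ in self.ring]

        def get(self, key: str) -> str:
            # Walk clockwise to the first virtual point at or after the key's hash.
            idx = bisect.bisect(self.points, _hash(key)) % len(self.ring)
            return self.ring[idx][1]

    ring = HashRing(["127.0.0.1:8001", "127.0.0.1:8002"])
    print(ring.get("/chat?user=42"))   # the same URI always maps to the same backend
    # Adding or removing a backend only remaps the keys that fell on its virtual points.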

    All of these algorithms can be combined with per-server parameters such as weight, max_fails, and fail_timeout:

    server 127.0.0.1:8001 weight=5 max_fails=3 fail_timeout=30s;

    4. Test load balancing

    Start two HTTP services (on ports 8001 and 8002) using the Python script server.py below:

    # server.py
    from http.server import BaseHTTPRequestHandler, HTTPServer

    class DebugRequestHandler(BaseHTTPRequestHandler):
        def do_GET(self):
            # Print client information
            client_ip, client_port = self.client_address
            print(f"\n--- New request @ {self.server.server_port} ---")
            print(f"[client] {client_ip}:{client_port}")
            print(f"[method] {self.command}")
            print(f"[path] {self.path}")
            print(f"[request headers] {self.headers}")

            # Return the response
            self.send_response(200)
            self.send_header("Content-type", "text/plain")
            self.end_headers()
            self.wfile.write(f"Response from: {self.server.server_port}".encode())

        def do_POST(self):
            # Handle POST requests
            content_length = int(self.headers["Content-Length"])
            post_data = self.rfile.read(content_length).decode("utf-8")

            print(f"\n--- POST request @ {self.server.server_port} ---")
            print(f"[data] {post_data}")
            self.do_GET()   # reuse the GET response logic

    def run(server_class=HTTPServer, handler_class=DebugRequestHandler, port=8001):
        server_address = ("0.0.0.0", port)   # Bind to all interfaces
        httpd = server_class(server_address, handler_class)
        print(f"Service started at 0.0.0.0:{port} ...")
        httpd.serve_forever()

    if __name__ == "__main__":
        # Start two service instances (on ports 8001 and 8002 respectively)
        import threading
        threading.Thread(target=run, kwargs={"port": 8001}).start()
        threading.Thread(target=run, kwargs={"port": 8002}).start()

    python3 server.py

    Stress Testing

    apt install apache2-utils
    ab -n 1000 -c 10 http://localhost/

    ab is short for Apache Bench, a performance-testing tool from the Apache HTTP server project, used to run stress tests against web servers.

    Command breakdown

    1. **ab**: the tool name (Apache Bench).
    2. **-n 1000**: the total number of requests is 1000 (-n specifies the number of requests).
    3. **-c 10**: the concurrency level is 10 (-c specifies how many requests are sent simultaneously).
    4. **http://localhost/**: the target address (a local service or a remote URL).

    This simulates 10 concurrent users accessing http://localhost/ at the same time; each user keeps sending requests until the total reaches 1000. The command can be used to measure:

    • The server's throughput (requests handled per second).
    • The average response time per request.
    • The server's concurrent processing capability.
    • Whether errors occurred (timeouts, 5xx errors, etc.).

    Example Output Interpretation

    After running the command, the output example is as follows:

    This is ApacheBench, Version 2.3 <$Revision: 1879490 $>
    Copyright 1996 Adam Twiss, Zeus Technology Ltd, http://www.zeustech.net/
    Licensed to The Apache Software Foundation, http://www.apache.org/

    Benchmarking localhost (be patient)
    Completed 100 requests
    Completed 200 requests
    Completed 300 requests
    Completed 400 requests
    Completed 500 requests
    Completed 600 requests
    Completed 700 requests
    Completed 800 requests
    Completed 900 requests
    Completed 1000 requests
    Finished 1000 requests


    Server Software: nginx/1.18.0
    Server Hostname: localhost
    Server Port: 80

    Document Path: /
    Document Length: 18 bytes

    Concurrency Level: 10
    Time taken for tests: 1.201 seconds
    Complete requests: 1000
    Failed requests: 0
    Total transferred: 178000 bytes
    HTML transferred: 18000 bytes
    Requests per second: 832.30 [#/sec] (mean)
    Time per request: 12.015 [ms] (mean)
    Time per request: 1.201 [ms] (mean, across all concurrent requests)
    Transfer rate: 144.68 [Kbytes/sec] received

    Connection Times (ms)
                  min mean[+/-sd] median max
    Connect: 0 0 0.0 0 1
    Processing: 2 12 1.9 12 26
    Waiting: 1 12 1.9 12 25
    Total: 2 12 1.9 12 26

    Percentage of the requests served within a certain time (ms)
      50% 12
      66% 12
      75% 13
      80% 13
      90% 14
      95% 15
      98% 16
      99% 18
     100% 26 (longest request)

    Key Metrics

    1. Requests per second: 832.30 [#/sec]
      The server handles about 832 requests per second (higher throughput means better performance).

    2. Time per request:

    • 12.015 [ms] (mean): the average time per request as seen by a single user.
    • 1.201 [ms] (mean, across all concurrent requests): the average server time per request across all concurrent requests.
    • Connection Times: the distribution of connect, processing, and waiting times (in milliseconds).

    • Percentage of the requests served within a certain time: the response-time distribution (e.g. 90% of requests complete within 14 ms).
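
    The headline numbers follow from simple arithmetic on the raw figures above (small differences are due to rounding):

    total_requests = 1000
    concurrency = 10
    time_taken = 1.201                                     # seconds, from the ab output

    requests_per_second = total_requests / time_taken      # ~832 requests/second
    ms_per_request_overall = time_taken / total_requests * 1000     # ~1.2 ms across all requests
    ms_per_request_per_user = ms_per_request_overall * concurrency  # ~12 ms as each user sees it
    print(requests_per_second, ms_per_request_overall, ms_per_request_per_user)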

    • Server log output: count the number of requests reaching each backend:

    tail -f /var/log/nginx/access.log
    127.0.0.1:48878 / - - [02/Mar/2025:21:10:11 +0800]  "GET / HTTP/1.0"  200 18  "-" "ApacheBench/2.3" "-"  to: 127.0.0.1:8001  
    127.0.0.1:48882 / - - [02/Mar/2025:21:10:11 +0800]  "GET / HTTP/1.0"  200 18  "-" "ApacheBench/2.3" "-"  to: 127.0.0.1:8002  
    127.0.0.1:48892 / - - [02/Mar/2025:21:10:11 +0800]  "GET / HTTP/1.0"  200 18  "-" "ApacheBench/2.3" "-"  to: 127.0.0.1:8001  
    127.0.0.1:48904 / - - [02/Mar/2025:21:10:11 +0800]  "GET / HTTP/1.0"  200 18  "-" "ApacheBench/2.3" "-"  to: 127.0.0.1:8002  
    root@MichaelMing:~ # cat /var/log/nginx/access.log | tail -n 1000 | grep ":8001" | wc -l
    506
    root@MichaelMing:~ # cat /var/log/nginx/access.log | tail -n 1000 | grep ":8002" | wc -l
    494

    or

    root@MichaelMing:~ # awk '{print $NF}' /var/log/nginx/access.log | sort | uniq -c
        300  "ApacheBench/2.3"
          3  "curl/7.81.0"
      93576 -
        984 127.0.0.1:8001
       1018 127.0.0.1:8002

    Checking the two port counts, the requests were split almost evenly between the two backends (506 vs. 494): Nginx is balancing the load as expected.

    5. Chat service example

    Suppose the MichaelAI service exposes two API endpoints: chat and completions.

    from http.server import BaseHTTPRequestHandler, HTTPServer
    import json
    from urllib.parse import urlparse
    from datetime import datetime

    class MultiAPIHandler(BaseHTTPRequestHandler):
        def send_json_response(self, data, status=200):
            self.send_response(status)
            self.send_header("Content-type", "application/json")
            self.end_headers()
            self.wfile.write(json.dumps(data).encode("utf-8"))

        def do_POST(self):
            # Unified entry point: route requests by path
            parsed_path = urlparse(self.path)
            path = parsed_path.path

            try:
                content_length = int(self.headers["Content-Length"])
                post_data = self.rfile.read(content_length).decode("utf-8")

                print(f"\n--- {path} request @ {self.server.server_port} ---")
                print(f"[client] {self.client_address[0]}:{self.client_address[1]}")
                print(f"[data] {post_data}")

                if path == "/chat":
                    self.handle_chat(post_data)
                elif path == "/completions":
                    self.handle_completions(post_data)
                else:
                    self.send_error(404, "API not found")

            except Exception as e:
                self.send_json_response({
                    "error": str(e),
                    "server": self.server.server_port,
                    "timestamp": datetime.now().isoformat()
                }, 400)

        def handle_chat(self, data):
            # Simulate the chat endpoint
            try:
                input_data = json.loads(data)
                message = input_data.get("message", "")

                response = {
                    "original": message,
                    "response": f"Processed by chat API: {message.upper()}",
                    "server": self.server.server_port,
                    "timestamp": datetime.now().isoformat()
                }
                self.send_json_response(response)

            except json.JSONDecodeError:
                raise ValueError("Invalid JSON format")

        def handle_completions(self, data):
            # Simulate the text completion endpoint
            response = {
                "completions": [
                    {"text": f"Completion 1: {data[:5]}...", "index": 0},
                    {"text": f"Completion 2: {data[-5:]}...", "index": 1}
                ],
                "server": self.server.server_port,
                "timestamp": datetime.now().isoformat()
            }
            self.send_json_response(response)

    def run(server_class=HTTPServer, handler_class=MultiAPIHandler, port=8001):
        server_address = ("0.0.0.0", port)
        httpd = server_class(server_address, handler_class)
        print(f"Multi-API service started at 0.0.0.0:{port} ...")
        httpd.serve_forever()

    if __name__ == "__main__":
        import threading
        # Start two service instances
        threading.Thread(target=run, kwargs={"port": 8001}).start()
        threading.Thread(target=run, kwargs={"port": 8002}).start()
    • Testing the chat API

    The POST body file json_data contains, for example: {"message": "Hello, this is a test message"}

    ab -n 100 -c 10 -T  "application/json"  -p json_data \
    http://localhost:8001/chat

    Server log

    --- /chat request@8001 ---
    [Client] 127.0.0.1:54724
    [data] {"message": "Hello, this is a test message"}

    127.0.0.1 - - [02/Mar/2025 22:41:22]  "POST /chat HTTP/1.0"  200 -
    • Testing the completions API

    text.txt example:

    Hello, I am the intelligent assistant developed by Michael Amin!

    ab -n 10000 -c 100 -T "text/plain" -p text.txt \
    http://localhost:8002/completions

    Server log

    --- /completions request@8002 ---
    [Client] 127.0.0.1:47246
    [Data] Hello, I am the intelligent assistant developed by Michael Amin!

    127.0.0.1 - - [02/Mar/2025 22:49:26]  "POST /completions HTTP/1.0"  200 
    • Test two interfaces simultaneously
    # Stress test both interfaces at the same time
    echo "chat interface stress test results:" && \
    ab -n 500 -c 50 -T "application/json" -p json_data http://localhost:80/chat && \
    echo "completions interface stress test results:" && \
    ab -n 500 -c 50 -T "text/plain" -p text.txt http://localhost:80/completions

    Completions interface call distribution: the two backends receive roughly equal shares

    root@MichaelMing:~ # cat /var/log/nginx/access.log | tail -n 20000 | grep "completions" | grep 8001 | wc -l
    747
    root@MichaelMing:~ # cat /var/log/nginx/access.log | tail -n 20000 | grep "completions" | grep 8002 | wc -l
    755
    • Client call
    import requests
    import asyncio
    import aiohttp

    # Synchronous client (based on requests)
    class SyncAPIClient:
        def __init__(self, base_url):
            self.base_url = base_url

        def call_chat(self, message):
            url = f"{self.base_url}/chat"
            data = {"message": message}
            response = requests.post(url, json=data)
            return response.json()

        def call_completions(self, text):
            url = f"{self.base_url}/completions"
            response = requests.post(url, data=text)
            return response.json()

    # Asynchronous client (based on aiohttp)
    class AsyncAPIClient:
        def __init__(self, base_url):
            self.base_url = base_url

        async def call_chat(self, message):
            url = f"{self.base_url}/chat"
            data = {"message": message}
            async with aiohttp.ClientSession() as session:
                async with session.post(url, json=data) as response:
                    return await response.json()

        async def call_completions(self, text):
            url = f"{self.base_url}/completions"
            async with aiohttp.ClientSession() as session:
                async with session.post(url, data=text) as response:
                    return await response.json()

    # Synchronous call example
    sync_client = SyncAPIClient("http://localhost")
    chat_response = sync_client.call_chat("Hello, sync chat!")
    print("Chat response:", chat_response)

    completion_response = sync_client.call_completions("test text")
    print("Completions response:", completion_response)

    # Asynchronous call example
    async def main():
        async_client = AsyncAPIClient("http://localhost")

        # Call the two interfaces concurrently
        chat_task = asyncio.create_task(async_client.call_chat("Hello, async chat!"))
        completion_task = asyncio.create_task(async_client.call_completions("async test text"))

        results = await asyncio.gather(chat_task, completion_task)
        print("Asynchronous Chat response:", results[0])
        print("Asynchronous Completions response:", results[1])

    asyncio.run(main())

    Output:

    root@MichaelMing:~ # python3 /mnt/d/opt/client.py
    Chat response: {'original': 'Hello, sync chat!', 'response': 'Processed by chat API: HELLO, SYNC CHAT!', 'server': 8001, 'timestamp': '2025-03-02T23:11:54.426626'}
    Completions response: {'completions': [{'text': 'Completion 1: test...', 'index': 0}, {'text': 'Completion 2: text...', 'index': 1}], 'server': 8002, 'timestamp': '2025-03-02T23:11:54.430562'}
    Asynchronous Chat response: {'original': 'Hello, async chat!', 'response': 'Processed by chat API: HELLO, ASYNC CHAT!', 'server': 8001, 'timestamp': '2025-03-02T23:11:54.444508'}
    Asynchronous Completions response: {'completions': [{'text': 'Completion 1: async...', 'index': 0}, {'text': 'Completion 2: text...', 'index': 1}], 'server': 8002, 'timestamp': '2025-03-02T23:11:54.444973'}

    Stress testing with code

    # Concurrent stress-test example (using the asynchronous client)
    async def stress_test():
        client = AsyncAPIClient("http://localhost")
        tasks = []

        for _ in range(100):
            tasks.append(client.call_chat("test message"))
            tasks.append(client.call_completions("test text"))

        responses = await asyncio.gather(*tasks)
        print(f"Successfully processed {len(responses)} requests")

    asyncio.run(stress_test())
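
    The AsyncAPIClient above opens a new aiohttp session for every call, which gets expensive under heavy load. A variant that reuses one session and caps concurrency with a semaphore might look like this (the function name and limits are illustrative, not part of the original example):

    import asyncio
    import aiohttp

    async def bounded_stress_test(total=200, limit=50):
        semaphore = asyncio.Semaphore(limit)              # cap in-flight requests

        async with aiohttp.ClientSession() as session:    # one shared connection pool
            async def call_chat(message):
                async with semaphore:
                    async with session.post("http://localhost/chat", json={"message": message}) as resp:
                        return await resp.json()

            responses = await asyncio.gather(*(call_chat(f"message {i}") for i in range(total)))
            print(f"Successfully processed {len(responses)} requests")

    asyncio.run(bounded_stress_test())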