akhaliq (HF Staff) committed
Commit 6d86572 · 1 Parent(s): f26e8e5

update glm to use hf inference with novita
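For context, the new code path goes through `huggingface_hub.InferenceClient`, which routes chat completions over HF Inference Providers (per the commit message, Novita is the provider serving GLM-4.5 there). A minimal sketch of the call the diff below adds, assuming `huggingface_hub` is installed and `HF_TOKEN` is set; the `bill_to="huggingface"` argument in the diff is org billing and is omitted here:

```python
import os

from huggingface_hub import InferenceClient

# provider="auto" lets the HF router pick an available provider for the
# model (e.g. Novita for zai-org/GLM-4.5).
client = InferenceClient(
    provider="auto",
    api_key=os.environ["HF_TOKEN"],
)

stream = client.chat.completions.create(
    model="zai-org/GLM-4.5",
    messages=[{"role": "user", "content": "Write a minimal HTML page."}],
    stream=True,
)

# Accumulate streamed deltas the same way app.py does.
content = ""
for chunk in stream:
    if chunk.choices[0].delta.content:
        content += chunk.choices[0].delta.content
print(content)
```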

Files changed (1):
  1. app.py +25 -296
app.py CHANGED
@@ -413,7 +413,7 @@ AVAILABLE_MODELS = [
     },
     {
         "name": "GLM-4.5",
-        "id": "GLM-4.5",
+        "id": "zai-org/GLM-4.5",
         "description": "GLM-4.5 model with thinking capabilities for advanced code generation"
     },
     {
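Note the `id` is now a full Hub repo id (`zai-org/GLM-4.5`) rather than a bare model name: Inference Providers resolve models by repo id. A hedged sketch for checking which providers serve a repo (assumes a recent `huggingface_hub` release that exposes the provider mapping on `model_info`):

```python
from huggingface_hub import model_info

# Ask the Hub which inference providers currently serve this repo.
# (Assumption: huggingface_hub new enough to support this expand field.)
info = model_info("zai-org/GLM-4.5", expand=["inferenceProviderMapping"])
print(info.inference_provider_mapping)
```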
@@ -1541,224 +1541,8 @@ The HTML code above contains the complete original website structure with all im
         return f"Error extracting website content: {str(e)}"
 
 
-# GLM-4.5 Model Implementation
 stop_generation = False
 
-def stream_from_vllm(messages, thinking_enabled=True, temperature=1.0):
-    global stop_generation
-
-    # Get GLM API configuration from environment variables
-    glm_api_key = os.getenv('OPENAI_API_KEY')
-    glm_base_url = os.getenv('GLM_BASE_URL', 'https://open.bigmodel.cn/api/paas/v4/')
-
-    if not glm_api_key:
-        # Return configuration error if no API key
-        error_msg = """
-GLM-4.5 API Key Not Configured
-
-To use GLM-4.5, please:
-1. Get your API key from: https://open.bigmodel.cn/
-2. Set environment variable: OPENAI_API_KEY=your_api_key_here
-3. Optionally set GLM_BASE_URL if using different endpoint
-
-Example HTML code generation with Gradio:
-```html
-<!DOCTYPE html>
-<html lang="en">
-<head>
-    <meta charset="UTF-8">
-    <meta name="viewport" content="width=device-width, initial-scale=1.0">
-    <title>Sample App</title>
-    <style>
-        body { font-family: Arial, sans-serif; margin: 40px; }
-        .container { max-width: 600px; margin: 0 auto; }
-        h1 { color: #333; }
-        .button {
-            background: #007acc;
-            color: white;
-            padding: 10px 20px;
-            border: none;
-            border-radius: 5px;
-            cursor: pointer;
-        }
-    </style>
-</head>
-<body>
-    <div class="container">
-        <h1>GLM-4.5 Configuration Required</h1>
-        <p>Please configure your GLM-4.5 API key to use this model.</p>
-        <button class="button" onclick="alert('Configure OPENAI_API_KEY environment variable')">Get Started</button>
-    </div>
-</body>
-</html>
-```
-"""
-        yield type('Delta', (), {'content': error_msg, 'reasoning_content': None})()
-        return
-
-    # Configure OpenAI client for GLM-4.5
-    try:
-        client = OpenAI(
-            base_url=glm_base_url,
-            api_key=glm_api_key,
-        )
-
-        response = client.chat.completions.create(
-            model="GLM-4.5",
-            messages=messages,
-            temperature=temperature,
-            stream=True,
-            max_tokens=65536,
-            extra_body={
-                "thinking": {
-                    "type": "enabled" if thinking_enabled else "disabled",
-                }
-            }
-        )
-
-        for chunk in response:
-            if stop_generation:
-                break
-            if chunk.choices and chunk.choices[0].delta:
-                yield chunk.choices[0].delta
-
-    except Exception as e:
-        # Fallback: if GLM-4.5 API fails, yield error with sample code
-        error_msg = f"""Error connecting to GLM-4.5: {str(e)}
-
-Please check:
-1. OPENAI_API_KEY environment variable is set correctly
-2. API key is valid and has credits
-3. Network connection is working
-4. GLM_BASE_URL is correct (current: {glm_base_url})
-
-Here's a sample HTML code to test the UI:
-```html
-<!DOCTYPE html>
-<html lang="en">
-<head>
-    <meta charset="UTF-8">
-    <meta name="viewport" content="width=device-width, initial-scale=1.0">
-    <title>GLM-4.5 Error - Sample Output</title>
-    <style>
-        body {{
-            font-family: Arial, sans-serif;
-            margin: 40px;
-            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
-            color: white;
-        }}
-        .container {{
-            max-width: 600px;
-            margin: 0 auto;
-            background: rgba(255,255,255,0.1);
-            padding: 30px;
-            border-radius: 15px;
-            backdrop-filter: blur(10px);
-        }}
-        h1 {{ color: #fff; text-align: center; }}
-        .error {{ background: rgba(255,0,0,0.2); padding: 15px; border-radius: 8px; margin: 20px 0; }}
-        .button {{
-            background: rgba(255,255,255,0.2);
-            color: white;
-            padding: 12px 24px;
-            border: 1px solid rgba(255,255,255,0.3);
-            border-radius: 8px;
-            cursor: pointer;
-            display: block;
-            margin: 20px auto;
-        }}
-        .button:hover {{ background: rgba(255,255,255,0.3); }}
-    </style>
-</head>
-<body>
-    <div class="container">
-        <h1>🤖 GLM-4.5 Configuration Error</h1>
-        <div class="error">
-            <strong>Error:</strong> {str(e)}
-        </div>
-        <p>This is a sample HTML output to demonstrate the UI while you configure GLM-4.5.</p>
-        <button class="button" onclick="window.open('https://open.bigmodel.cn/', '_blank')">Configure GLM-4.5 API</button>
-    </div>
-    <script>
-        console.log('GLM-4.5 API Error: {str(e)}');
-    </script>
-</body>
-</html>
-```"""
-        print(f"GLM-4.5 API Error: {e}")
-        yield type('Delta', (), {'content': error_msg, 'reasoning_content': None})()
-
-
-class GLM45Model:
-    def __init__(self):
-        self.accumulated_content = ""
-        self.accumulated_reasoning = ""
-
-    def reset_state(self):
-        self.accumulated_content = ""
-        self.accumulated_reasoning = ""
-
-    def _render_response(self, reasoning_content, regular_content, skip_think=False):
-        html_parts = []
-
-        if reasoning_content and not skip_think:
-            reasoning_escaped = html.escape(reasoning_content).replace("\n", "<br>")
-            think_html = (
-                "<details open><summary style='cursor:pointer;font-weight:bold;color:#007acc;'>Thinking</summary>"
-                "<div style='color:#555555;line-height:1.6;padding:15px;border-left:4px solid #007acc;margin:10px 0;background-color:#f0f7ff;border-radius:4px;'>"
-                + reasoning_escaped +
-                "</div></details>"
-            )
-            html_parts.append(think_html)
-
-        if regular_content:
-            content_escaped = html.escape(regular_content).replace("\n", "<br>")
-            content_html = f"<div style='margin:0.5em 0; white-space: pre-wrap; line-height:1.6;'>{content_escaped}</div>"
-            html_parts.append(content_html)
-
-        return "".join(html_parts)
-
-    def _build_messages(self, raw_hist, sys_prompt):
-        msgs = []
-        if sys_prompt.strip():
-            msgs.append({"role": "system", "content": sys_prompt.strip()})
-
-        for h in raw_hist:
-            if h["role"] == "user":
-                msgs.append({"role": "user", "content": h["content"]})
-            else:
-                msg = {"role": "assistant", "content": h.get("content", "")}
-                if h.get("reasoning_content"):
-                    msg["reasoning_content"] = h.get("reasoning_content")
-                msgs.append(msg)
-        return msgs
-
-    def stream_generate(self, raw_hist, sys_prompt, thinking_enabled=True, temperature=1.0):
-        global stop_generation
-        stop_generation = False
-        msgs = self._build_messages(raw_hist, sys_prompt)
-        self.reset_state()
-
-        try:
-            for delta in stream_from_vllm(msgs, thinking_enabled, temperature):
-                if stop_generation:
-                    break
-
-                if hasattr(delta, 'content') and delta.content:
-                    self.accumulated_content += delta.content
-
-                if hasattr(delta, 'reasoning_content') and delta.reasoning_content:
-                    self.accumulated_reasoning += delta.reasoning_content
-
-                yield self._render_response(self.accumulated_reasoning, self.accumulated_content, not thinking_enabled)
-
-        except Exception as e:
-            yield self._render_response("", f"Error: {str(e)}")
-
-
-# Global GLM-4.5 instance
-glm45 = GLM45Model()
-
 
 def generation_code(query: Optional[str], image: Optional[gr.Image], file: Optional[str], website_url: Optional[str], _setting: Dict[str, str], _history: Optional[History], _current_model: Dict, enable_search: bool = False, language: str = "html", provider: str = "auto"):
     if query is None:
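For comparison with the `InferenceClient` path added below, here is the removed call pattern in condensed, runnable form: an OpenAI-compatible client pointed at Z.AI's endpoint, with the thinking toggle passed through `extra_body` as in the deleted `stream_from_vllm` (requires the `openai` package and an API key from open.bigmodel.cn):

```python
import os

from openai import OpenAI

client = OpenAI(
    base_url=os.getenv("GLM_BASE_URL", "https://open.bigmodel.cn/api/paas/v4/"),
    api_key=os.environ["OPENAI_API_KEY"],
)

# GLM's OpenAI-compatible API takes a vendor extension to enable or
# disable the model's "thinking" phase; it is sent via extra_body.
response = client.chat.completions.create(
    model="GLM-4.5",
    messages=[{"role": "user", "content": "Write a minimal HTML page."}],
    stream=True,
    extra_body={"thinking": {"type": "enabled"}},
)

for chunk in response:
    delta = chunk.choices[0].delta
    # Reasoning tokens arrive on a separate delta attribute from content.
    if getattr(delta, "reasoning_content", None):
        print(delta.reasoning_content, end="", flush=True)
    elif delta.content:
        print(delta.content, end="", flush=True)
```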
@@ -1841,97 +1625,47 @@ This will help me create a better design for you."""
     # Enhance query with search if enabled
     enhanced_query = enhance_query_with_search(query, enable_search)
 
-    # Check if this is GLM-4.5 model and handle differently
-    if _current_model["id"] == "GLM-4.5":
-        # For GLM-4.5, use the specialized implementation with simpler streaming
+    # Check if this is GLM-4.5 model and handle with simple HuggingFace InferenceClient
+    if _current_model["id"] == "zai-org/GLM-4.5":
         if image is not None:
             messages.append(create_multimodal_message(enhanced_query, image))
         else:
             messages.append({'role': 'user', 'content': enhanced_query})
 
-        content = ""
-        reasoning_content = ""
-
         try:
-            # Use GLM-4.5 streaming directly
-            for delta in stream_from_vllm(messages, True, 1.0):
-                if stop_generation:
-                    break
-
-                if hasattr(delta, 'content') and delta.content:
-                    content += delta.content
-
-                if hasattr(delta, 'reasoning_content') and delta.reasoning_content:
-                    reasoning_content += delta.reasoning_content
-
-                # Show streaming content (extract just the code part)
-                clean_code = remove_code_block(content)
-                search_status = " (with web search)" if enable_search and tavily_client else ""
-
-                # Handle different language outputs for GLM-4.5 during streaming
-                if language == "transformers.js":
-                    files = parse_transformers_js_output(clean_code)
-                    if files['index.html'] and files['index.js'] and files['style.css']:
-                        formatted_output = format_transformers_js_output(files)
-                        yield {
-                            code_output: gr.update(value=formatted_output, language="html"),
-                            history_output: history_to_chatbot_messages(_history),
-                            sandbox: send_to_sandbox(files['index.html']) if files['index.html'] else "<div style='padding:1em;color:#888;text-align:center;'>Preview is only available for HTML. Please download your code using the download button above.</div>",
-                        }
-                    else:
-                        yield {
-                            code_output: gr.update(value=clean_code, language="html"),
-                            history_output: history_to_chatbot_messages(_history),
-                            sandbox: "<div style='padding:1em;color:#888;text-align:center;'>Generating transformers.js app...</div>",
-                        }
-                elif language == "svelte":
+            client = InferenceClient(
+                provider="auto",
+                api_key=os.environ["HF_TOKEN"],
+                bill_to="huggingface",
+            )
+
+            stream = client.chat.completions.create(
+                model="zai-org/GLM-4.5",
+                messages=messages,
+                stream=True,
+            )
+
+            content = ""
+            for chunk in stream:
+                if chunk.choices[0].delta.content:
+                    content += chunk.choices[0].delta.content
+                    clean_code = remove_code_block(content)
                     yield {
-                        code_output: gr.update(value=clean_code, language="html"),
+                        code_output: gr.update(value=clean_code, language=get_gradio_language(language)),
                         history_output: history_to_chatbot_messages(_history),
-                        sandbox: "<div style='padding:1em;color:#888;text-align:center;'>Generating Svelte app...</div>",
+                        sandbox: send_to_sandbox(clean_code) if language == "html" else "<div style='padding:1em;color:#888;text-align:center;'>Preview is only available for HTML. Please download your code using the download button above.</div>",
                     }
-                else:
-                    if has_existing_content:
-                        if clean_code.strip().startswith("<!DOCTYPE html>") or clean_code.strip().startswith("<html"):
-                            yield {
-                                code_output: gr.update(value=clean_code, language=get_gradio_language(language)),
-                                history_output: history_to_chatbot_messages(_history),
-                                sandbox: send_to_sandbox(clean_code) if language == "html" else "<div style='padding:1em;color:#888;text-align:center;'>Preview is only available for HTML. Please download your code using the download button above.</div>",
-                            }
-                        else:
-                            last_content = _history[-1][1] if _history and len(_history[-1]) > 1 else ""
-                            modified_content = apply_search_replace_changes(last_content, clean_code)
-                            clean_content = remove_code_block(modified_content)
-                            yield {
-                                code_output: gr.update(value=clean_content, language=get_gradio_language(language)),
-                                history_output: history_to_chatbot_messages(_history),
-                                sandbox: send_to_sandbox(clean_content) if language == "html" else "<div style='padding:1em;color:#888;text-align:center;'>Preview is only available for HTML. Please download your code using the download button above.</div>",
-                            }
-                    else:
-                        yield {
-                            code_output: gr.update(value=clean_code, language=get_gradio_language(language)),
-                            history_output: history_to_chatbot_messages(_history),
-                            sandbox: send_to_sandbox(clean_code) if language == "html" else "<div style='padding:1em;color:#888;text-align:center;'>Preview is only available for HTML. Please download your code using the download button above.</div>",
-                        }
-
+
         except Exception as e:
-            content = f"Error: {str(e)}"
-            print(f"GLM-4.5 Error: {e}")
+            content = f"Error with GLM-4.5: {str(e)}\n\nPlease make sure HF_TOKEN environment variable is set."
 
-        # Final processing for GLM-4.5
         clean_code = remove_code_block(content)
+        _history.append([query, content])
 
-        # Store content with thinking information if available
-        if reasoning_content:
-            full_response = f"**Thinking:**\n{reasoning_content}\n\n**Code:**\n{content}"
-        else:
-            full_response = content
-
         if language == "transformers.js":
             files = parse_transformers_js_output(clean_code)
             if files['index.html'] and files['index.js'] and files['style.css']:
                 formatted_output = format_transformers_js_output(files)
-                _history.append([query, full_response])
                 yield {
                     code_output: formatted_output,
                     history: _history,
@@ -1939,7 +1673,6 @@ This will help me create a better design for you."""
                     history_output: history_to_chatbot_messages(_history),
                 }
             else:
-                _history.append([query, full_response])
                 yield {
                     code_output: clean_code,
                    history: _history,
@@ -1950,7 +1683,6 @@ This will help me create a better design for you."""
            files = parse_svelte_output(clean_code)
            if files['src/App.svelte'] and files['src/app.css']:
                formatted_output = format_svelte_output(files)
-                _history.append([query, full_response])
                yield {
                    code_output: formatted_output,
                    history: _history,
@@ -1958,7 +1690,6 @@ This will help me create a better design for you."""
                    history_output: history_to_chatbot_messages(_history),
                }
            else:
-                _history.append([query, full_response])
                yield {
                    code_output: clean_code,
                    history: _history,
@@ -1970,7 +1701,6 @@ This will help me create a better design for you."""
                last_content = _history[-1][1] if _history and len(_history[-1]) > 1 else ""
                modified_content = apply_search_replace_changes(last_content, clean_code)
                clean_content = remove_code_block(modified_content)
-                _history.append([query, full_response])
                yield {
                    code_output: clean_content,
                    history: _history,
@@ -1978,7 +1708,6 @@ This will help me create a better design for you."""
                    history_output: history_to_chatbot_messages(_history),
                }
            else:
-                _history.append([query, full_response])
                yield {
                    code_output: clean_code,
                    history: _history,
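The streaming UI updates in the new GLM branch rely on Gradio's generator pattern: the event handler yields dicts keyed by output components, and each yield repaints just those components. A self-contained toy version of that pattern (component and function names here are illustrative, not the ones in app.py):

```python
import time

import gradio as gr

def stream_text(prompt):
    # Yielding a dict keyed by components updates them incrementally --
    # the same mechanism app.py uses to repaint code_output/sandbox
    # on every streamed chunk.
    accumulated = ""
    for word in f"Echoing: {prompt}".split():
        accumulated += word + " "
        time.sleep(0.1)
        yield {output_box: gr.update(value=accumulated)}

with gr.Blocks() as demo:
    prompt_box = gr.Textbox(label="Prompt")
    output_box = gr.Textbox(label="Output")
    prompt_box.submit(stream_text, inputs=prompt_box, outputs=[output_box])

demo.launch()
```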