File size: 4,753 Bytes
7fefeaf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d9b3577
7fefeaf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d9b3577
7fefeaf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9a56e2f
7fefeaf
0cf119e
7fefeaf
d40d743
0cf119e
7fefeaf
 
 
 
d9b3577
7fefeaf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d40d743
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
import express from "express"
import { LLM } from "llama-node"
import { LLamaCpp } from "llama-node/dist/llm/llama-cpp.js"

import { daisy } from "./daisy.mts"
import { alpine } from "./alpine.mts"

// Single shared LLM instance backed by the llama.cpp binding; every request
// handled by this server goes through it.
const llama = new LLM(LLamaCpp)

// If you plan to use a different model you also need to edit line 26 in the Dockerfile
const modelPath = "./models/airoboros-13b-gpt4.ggmlv3.q4_0.bin"

// Top-level await: the rest of the module only runs once the model is loaded.
await llama.load({
  modelPath,

  // numeric settings
  nCtx: 1024,
  seed: 0,
  nGpuLayers: 0,

  // the only flag that is switched on
  useMmap: true,

  // everything else is explicitly switched off
  enableLogging: false,
  f16Kv: false,
  logitsAll: false,
  vocabOnly: false,
  useMlock: false,
  embedding: false
})

// CSS and JS dependencies, pre-rendered once as HTML tags so they can be
// concatenated straight into the <head> of each generated page.
// NOTE(review): the "[email protected]" file names look like the result of an
// e-mail obfuscation filter applied to "name@version" asset names — confirm
// them against the actual files under public/css and public/js.
const css = [
  "/css/[email protected]",
].reduce(
  (html, href) => html + `<link href="${href}" rel="stylesheet" type="text/css"/>`,
  ""
)

const script = [
  "/js/[email protected]",
  "/js/[email protected]"
].reduce(
  (html, src) => html + `<script src="${src}"></script>`,
  ""
)

// --- server settings ---------------------------------------------------

// TCP port the HTTP server listens on
const port = 7860

// prompts shorter than this are rejected by the /app route
const minPromptSize = 16 // if you change this, you will need to also change in public/index.html

// maximum lifetime of a single generation request (15 minutes)
const timeoutInSec = 15 * 60

// --- express application -----------------------------------------------

const app = express()

// serve everything under ./public as static files
app.use(express.static("public"))
 
// How many generations may be in flight at the same time; the /app route
// refuses new work once the queue has reached this size.
const maxParallelRequests = 1

// Shape of the in-memory bookkeeping for generation requests.
type PendingRequests = {
  // number of requests accepted so far; also the source of request ids
  total: number;
  // ids of the requests that are currently in flight
  queue: string[];
  // per-request abort handle, keyed by request id
  aborts: Record<string, any>,
}

const pending: PendingRequests = {
  total: 0,
  queue: [],
  aborts: {},
}
 
/**
 * Stops a pending request and forgets about it.
 *
 * Does nothing when `id` is empty or is not currently in `pending.queue`,
 * so calling it several times for the same request is harmless.
 *
 * @param id     identifier of the request to end
 * @param reason human-readable explanation, only used for logging
 */
const endRequest = (id: string, reason: string) => {
  const isTracked = Boolean(id) && pending.queue.includes(id)
  if (!isTracked) {
    return
  }

  // step 1: politely ask the LLM to stop (best effort, failures are only logged)
  try {
    pending.aborts[id].abort()
  } catch (abortError) {
    console.log(`could not abort request ${id} (${abortError})`)
  }

  // step 2: drop every trace of the request from the bookkeeping
  try {
    const remaining = pending.queue.filter(queuedId => queuedId !== id)
    pending.queue = remaining
    delete pending.aborts[id]
    console.log(`cleaned up request ${id}`)
  } catch (cleanupError) {
    console.log(`failed to properly clean up request ${id}`)
  }

  console.log(`request ${id} ended (${reason})`)
}
// GET /debug — dumps the current request bookkeeping as a JSON string:
// how many requests were accepted in total, how many are in flight, and their ids.
app.get("/debug", (_req, res) => {
  const { total, queue } = pending
  const snapshot = {
    nbTotal: total,
    nbPending: queue.length,
    queue,
  }
  const body = JSON.stringify(snapshot)
  res.write(body)
  res.end()
})

/**
 * GET /app?prompt=... — streams an LLM-generated HTML page back to the caller.
 *
 * Flow:
 *  1. reject prompts shorter than `minPromptSize`
 *  2. reject the call when `maxParallelRequests` generations are already running
 *  3. register the request (id, AbortController) in `pending`
 *  4. write the HTML prefix, then forward every generated token to the response
 *  5. always unregister the request, cancel its watchdog timer and end the response
 *
 * The generation is aborted early when the client disconnects or when it
 * runs for longer than `timeoutInSec`.
 */
app.get("/app", async (req, res) => {

  // stringify once; a missing parameter becomes "undefined" and is rejected below
  const userPrompt = `${req.query.prompt}`

  if (userPrompt.length < minPromptSize) {
    res.write(`prompt too short, please enter at least ${minPromptSize} characters`)
    res.end()
    return
  }
  
  // naive implementation: we say we are out of capacity
  if (pending.queue.length >= maxParallelRequests) {
    res.write('Sorry, max nb of parallel requests reached. A new slot should be available in < 15 min.')
    res.end()
    return
  }
  // alternative approach: kill old queries
  // while (pending.queue.length > maxParallelRequests) {
  //   endRequest(pending.queue[0], 'max nb of parallel request reached')
  // }

  const id = `${pending.total++}`
  console.log(`new request ${id}`)

  pending.queue.push(id)
  pending.aborts[id] = new AbortController() 

  const prefix = `<html><head>${css}${script}`
  res.write(prefix)

  // Listen on the response rather than the request: the response's "close"
  // event fires both when the response completes and when the connection is
  // terminated prematurely, whereas on recent Node.js versions the request's
  // "close" event signals that the request has been completed.
  // After a normal completion the request is already unregistered, so
  // endRequest() is then a no-op.
  res.on("close", () => {
    endRequest(id, "browser ended the connection")
  })

  // watchdog: abort the generation if it runs for too long
  const watchdog = setTimeout(() => {
    endRequest(id, `timed out after ${timeoutInSec}s`)
  }, timeoutInSec * 1000)


  const finalPrompt = `# Context
Generate this webapp: ${userPrompt}.
# Documentation
${daisy}
# Guidelines
- Never repeat the instruction, instead directly write the final code within a script tag
- Use a color scheme consistent with the brief and theme
- You need to use Tailwind CSS and DaisyUI for the UI. Do not use JS for simple pages (eg. blogs or articles).
- All the JS code will be written directly inside the page, using <script type="text/javascript">...</script>
- You MUST use English not Latin! I repeat: do NOT write lorem ipsum!
- No need to write code comments, and try to make the code compact (short function names etc)
- Use a central layout by wrapping everything in a \`<div class="flex flex-col items-center">\`
# HTML Code
${prefix}`

  const options = {
    prompt: finalPrompt,
    nThreads: 6, // try to use the most of our vCPUs
    nTokPredict: 1024,
    topK: 40,
    topP: 0.1,
    temp: 0.3,
    repeatPenalty: 1,
  }
      
  try {
    // forward each generated token to the HTTP response as soon as it arrives
    await llama.createCompletion(options, (response) => {
      try {
        res.write(response.token)
      } catch (err) {
        console.log(`coudln't write the LLM response to the HTTP stream ${err}`)
      }
    }, pending.aborts[id].signal)
    endRequest(id, `normal end of the llama stream for request ${id}`)
  } catch (e) {
    endRequest(id, `premature end of the llama stream for request ${id} (${e})`)
  } finally {
    // the request is over either way: do not keep the watchdog timer (and
    // the closure it holds) alive for the remainder of the timeout
    clearTimeout(watchdog)
  }

  try {
    res.end()
  } catch (err) {
    console.log(`couldn't end the HTTP stream for request ${id} (${err})`)
  }
  
})

// Start the HTTP server and, once it is listening, print a ready-to-open example URL.
const announceStartup = () => {
  console.log(`Open http://localhost:${port}/?prompt=a%20webpage%20recipe%20for%20making%20chocolate%20chip%20cookies`)
}

app.listen(port, announceStartup)