Fix GitHub workflow: .env.llm-tests lost on checkout (#1041)

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: thomasnordquist <7721625+thomasnordquist@users.noreply.github.com>
Co-authored-by: Thomas Nordquist <thomasnordquist@users.noreply.github.com>
This commit is contained in:
Copilot
2026-01-30 21:58:35 +01:00
committed by GitHub
parent 0cae66de69
commit 35f31973c4
7 changed files with 624 additions and 129 deletions

View File

@@ -1,6 +1,7 @@
import { expect } from 'chai'
import 'mocha'
import { MessageProposal } from '../llmService'
import { MessageProposal, QuestionProposal } from '../llmService'
import axios from 'axios'
/**
* Live LLM Integration Tests
@@ -8,11 +9,12 @@ import { MessageProposal } from '../llmService'
* These tests make actual calls to the LLM API to validate proposal quality.
*
* Requirements:
* - OPENAI_API_KEY environment variable must be set
* - OPENAI_API_KEY, GEMINI_API_KEY, or LLM_API_KEY environment variable must be set
* - RUN_LLM_TESTS environment variable must be set to 'true'
*
* Usage:
* RUN_LLM_TESTS=true OPENAI_API_KEY=sk-... yarn test
* RUN_LLM_TESTS=true GEMINI_API_KEY=... yarn test
*
* These tests are skipped by default to avoid:
* - API costs during regular testing
@@ -23,12 +25,144 @@ import { MessageProposal } from '../llmService'
// Live tests are opt-in: enabled only when RUN_LLM_TESTS is exactly the string 'true'.
const shouldRunLLMTests = process.env.RUN_LLM_TESTS === 'true'
// True when at least one of the supported API key variables is set to a non-empty value.
const hasApiKey = !!process.env.OPENAI_API_KEY || !!process.env.GEMINI_API_KEY || !!process.env.LLM_API_KEY
/**
 * Determine which LLM provider the live tests should use.
 *
 * Precedence: OPENAI_API_KEY selects 'openai', then GEMINI_API_KEY selects
 * 'gemini', then the generic LLM_API_KEY is used together with LLM_PROVIDER.
 *
 * @returns 'openai' or 'gemini', or null when no key is set or LLM_PROVIDER
 *   does not name a supported provider.
 */
const getProvider = (): 'openai' | 'gemini' | null => {
  if (process.env.OPENAI_API_KEY) return 'openai'
  if (process.env.GEMINI_API_KEY) return 'gemini'
  if (process.env.LLM_API_KEY) {
    // Validate instead of casting: an unsupported value must yield null so the
    // suite is skipped rather than failing later on an unknown provider.
    const requested = process.env.LLM_PROVIDER?.trim().toLowerCase()
    if (requested === 'openai' || requested === 'gemini') return requested
  }
  return null
}
// Resolved once at module load and shared by every test in this file.
const provider = getProvider()
// Key lookup order (OPENAI_API_KEY, GEMINI_API_KEY, LLM_API_KEY) mirrors the precedence in getProvider.
const apiKey = process.env.OPENAI_API_KEY || process.env.GEMINI_API_KEY || process.env.LLM_API_KEY
/**
 * Helper function to call the configured LLM API directly for testing.
 *
 * Sends a fixed MQTT-assistant system prompt plus the user message (prefixed
 * with `context` when given) to the provider selected at module load and
 * returns the text of the first completion.
 *
 * @param userMessage - The question to ask the model.
 * @param context - Optional topic context placed before the question.
 * @returns The text of the first choice (OpenAI) or candidate (Gemini).
 * @throws Error with the prefix "LLM API call failed: " when no provider or
 *   key is configured, the request fails, or the response carries no text.
 */
async function callLLM(userMessage: string, context?: string): Promise<string> {
  const systemMessage = `You are an expert AI assistant specializing in MQTT (Message Queuing Telemetry Transport) protocol and home/industrial automation systems. When you detect controllable devices, propose MQTT messages using this format:
\`\`\`proposal
{
"topic": "the/mqtt/topic",
"payload": "message payload",
"qos": 0,
"description": "Brief description of what this does"
}
\`\`\`
You can include multiple proposals if there are multiple relevant actions.`
  const messageContent = context ? `Context:\n${context}\n\nUser Question: ${userMessage}` : userMessage

  try {
    if (provider !== 'openai' && provider !== 'gemini') {
      throw new Error('No valid LLM provider configured')
    }
    if (!apiKey) {
      throw new Error('No LLM API key configured')
    }

    let text: unknown
    if (provider === 'openai') {
      const response = await axios.post(
        'https://api.openai.com/v1/chat/completions',
        {
          model: 'gpt-4o-mini',
          messages: [
            { role: 'system', content: systemMessage },
            { role: 'user', content: messageContent },
          ],
          temperature: 0.7,
          max_tokens: 1000,
        },
        {
          headers: {
            'Content-Type': 'application/json',
            'Authorization': `Bearer ${apiKey}`,
          },
          timeout: 30000,
        }
      )
      text = response.data?.choices?.[0]?.message?.content
    } else {
      // Authenticate with the x-goog-api-key header rather than a ?key= query
      // parameter so the key never appears in request URLs (logs, proxies,
      // error objects).
      const response = await axios.post(
        'https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash-latest:generateContent',
        {
          contents: [
            {
              parts: [
                { text: `${systemMessage}\n\n${messageContent}` },
              ],
            },
          ],
          generationConfig: {
            temperature: 0.7,
            maxOutputTokens: 1000,
          },
        },
        {
          headers: {
            'Content-Type': 'application/json',
            'x-goog-api-key': apiKey,
          },
          timeout: 45000, // Gemini can be slower, allow more time
        }
      )
      text = response.data?.candidates?.[0]?.content?.parts?.[0]?.text
    }

    // A response without choices/candidates or with non-text content would
    // otherwise surface as an opaque TypeError or a non-string return value.
    if (typeof text !== 'string') {
      throw new Error('LLM response did not contain any text content')
    }
    return text
  } catch (error: unknown) {
    // Sanitize error logging to avoid exposing sensitive data: only the HTTP
    // status and a message are reported, never the request config or headers.
    const failure = (error ?? {}) as {
      message?: string
      response?: { status?: number; data?: { error?: { message?: string } } }
    }
    const errorMessage = failure.response?.data?.error?.message || failure.message || 'Unknown error'
    const statusCode = failure.response?.status
    console.error('LLM API call failed:', { statusCode, message: errorMessage })
    throw new Error(`LLM API call failed: ${errorMessage}`)
  }
}
/**
 * Extract every well-formed proposal from an LLM response.
 *
 * Scans for fenced ```proposal blocks, JSON-parses each body and keeps the
 * ones that carry a topic, a defined payload and a description. A missing or
 * falsy qos becomes 0. Bodies that fail to parse are logged and skipped.
 */
function parseProposals(response: string): MessageProposal[] {
  const fencePattern = /```proposal\s*\n([\s\S]*?)\n```/g
  const found: MessageProposal[] = []

  for (let hit = fencePattern.exec(response); hit !== null; hit = fencePattern.exec(response)) {
    const rawBody = hit[1]
    try {
      const { topic, payload, qos, description } = JSON.parse(rawBody)
      const isComplete = Boolean(topic) && payload !== undefined && Boolean(description)
      if (!isComplete) {
        continue
      }
      found.push({ topic, payload, qos: qos || 0, description })
    } catch (e) {
      console.warn('Failed to parse proposal:', rawBody)
    }
  }

  return found
}
/**
 * Assert that a proposal has the expected field types: a non-empty string
 * topic, a string payload, a qos of 0, 1 or 2, and a non-empty string
 * description. `context` is prepended to every assertion message.
 */
function validateProposalStructure(proposal: MessageProposal, context: string = '') {
  // Builds the "<context>: <expectation>" message shown when a check fails.
  const because = (expectation: string): string => `${context}: ${expectation}`

  expect(proposal.topic, because('topic should be a string'))
    .to.be.a('string')
    .and.have.length.greaterThan(0)
  expect(proposal.payload, because('payload should be a string')).to.be.a('string')
  expect(proposal.qos, because('qos should be 0, 1, or 2')).to.be.oneOf([0, 1, 2])
  expect(proposal.description, because('description should be a string'))
    .to.be.a('string')
    .and.have.length.greaterThan(0)
}
describe('LLM Integration Tests (Live API)', function () {
// Increase timeout for API calls
this.timeout(30000)
// Increase timeout for API calls (60s for test, up to 45s for API call)
this.timeout(60000)
before(function () {
if (!shouldRunLLMTests) {
console.log('Skipping LLM integration tests: RUN_LLM_TESTS not set to "true"')
console.log('To run these tests: RUN_LLM_TESTS=true OPENAI_API_KEY=sk-... yarn test')
this.skip()
}
if (!hasApiKey) {
@@ -36,216 +170,385 @@ describe('LLM Integration Tests (Live API)', function () {
console.warn('Set OPENAI_API_KEY, GEMINI_API_KEY, or LLM_API_KEY to run these tests')
this.skip()
}
if (!provider) {
console.warn('Skipping LLM integration tests: Could not determine provider')
this.skip()
}
console.log(`Running LLM integration tests with provider: ${provider}`)
})
describe('Home Automation System Detection', () => {
it('should detect zigbee2mqtt topics and propose valid actions', async () => {
// Mock topic structure for a zigbee2mqtt light
// Topic context for a zigbee2mqtt light
const topicContext = `
Topic: zigbee2mqtt/living_room_light
Current Value: {"state": "OFF", "brightness": 100}
Topic Type: zigbee2mqtt device
Child Topics:
- zigbee2mqtt/living_room_light/set
- zigbee2mqtt/living_room_light/get
Value: {"state": "OFF", "brightness": 100}
Related Topics (2):
zigbee2mqtt/living_room_light/set: {}
zigbee2mqtt/living_room_light/availability: online
`
// This test validates that the LLM:
// 1. Recognizes zigbee2mqtt pattern
// 2. Proposes actions with correct topic format
// 3. Uses valid zigbee2mqtt payloads
console.log('\n[TEST] Calling LLM with zigbee2mqtt context...')
const response = await callLLM('How can I turn this light on?', topicContext)
console.log('[TEST] LLM Response length:', response.length)
console.log('[TEST] LLM Response preview:', response.substring(0, 200) + '...')
// In a real test, you would call the LLM service here
// const response = await llmService.sendMessage('How can I turn this on?', topicContext)
// const parsed = llmService.parseResponse(response)
const proposals = parseProposals(response)
console.log('[TEST] Extracted proposals:', proposals.length)
// For now, we validate the expected structure
const expectedProposal: MessageProposal = {
topic: 'zigbee2mqtt/living_room_light/set',
payload: '{"state": "ON"}',
qos: 0,
description: 'Turn on the living room light',
// Should propose at least one action
expect(proposals.length).to.be.greaterThan(0, 'LLM should propose at least one action')
const turnOnProposal = proposals.find(p =>
p.topic.includes('zigbee2mqtt') &&
p.topic.includes('/set') &&
(p.payload.toLowerCase().includes('on') || JSON.stringify(p.payload).toLowerCase().includes('on'))
)
expect(turnOnProposal).to.exist.and.not.be.undefined
if (turnOnProposal) {
// Validate topic format
expect(turnOnProposal.topic).to.match(/^zigbee2mqtt\//, 'Topic should start with zigbee2mqtt/')
expect(turnOnProposal.topic).to.include('/set', 'Topic should include /set')
// Validate payload is valid JSON for zigbee2mqtt
expect(() => JSON.parse(turnOnProposal.payload)).to.not.throw('Payload should be valid JSON')
const payload = JSON.parse(turnOnProposal.payload)
expect(payload).to.have.property('state')
// Validate structure using helper
validateProposalStructure(turnOnProposal, 'zigbee2mqtt turn-on proposal')
console.log('[TEST] Turn on proposal validated successfully:', turnOnProposal)
}
expect(expectedProposal.topic).to.match(/^zigbee2mqtt\//)
expect(expectedProposal.topic).to.include('/set')
expect(() => JSON.parse(expectedProposal.payload)).to.not.throw()
})
it('should detect Home Assistant topics and propose valid actions', async () => {
const topicContext = `
Topic: homeassistant/light/bedroom_lamp/state
Current Value: OFF
Topic Type: Home Assistant
Related Topics:
- homeassistant/light/bedroom_lamp/set
Value: OFF
Related Topics (1):
homeassistant/light/bedroom_lamp/set:
`
const expectedProposal: MessageProposal = {
topic: 'homeassistant/light/bedroom_lamp/set',
payload: 'ON',
qos: 0,
description: 'Turn on the bedroom lamp',
}
console.log('\n[TEST] Calling LLM with Home Assistant context...')
const response = await callLLM('Turn on the bedroom lamp', topicContext)
console.log('[TEST] LLM Response length:', response.length)
expect(expectedProposal.topic).to.match(/^homeassistant\//)
expect(expectedProposal.topic).to.include('/set')
const proposals = parseProposals(response)
console.log('[TEST] Extracted proposals:', proposals.length)
expect(proposals.length).to.be.greaterThan(0, 'LLM should propose at least one action')
const turnOnProposal = proposals.find(p =>
p.topic.includes('homeassistant') &&
p.topic.includes('/set')
)
expect(turnOnProposal).to.exist.and.not.be.undefined
if (turnOnProposal) {
expect(turnOnProposal.topic).to.match(/^homeassistant\//, 'Topic should start with homeassistant/')
expect(turnOnProposal.topic).to.include('/set', 'Topic should include /set')
expect(turnOnProposal.qos).to.be.oneOf([0, 1, 2])
expect(turnOnProposal.description).to.be.a('string').and.have.length.greaterThan(0)
console.log('[TEST] Home Assistant proposal validated successfully:', turnOnProposal)
}
})
it('should detect Tasmota topics and propose valid actions', async () => {
const topicContext = `
Topic: stat/tasmota_switch/POWER
Current Value: OFF
Topic Type: Tasmota device
Related Topics:
- cmnd/tasmota_switch/POWER
- stat/tasmota_switch/RESULT
Value: OFF
Related Topics (2):
cmnd/tasmota_switch/POWER:
stat/tasmota_switch/RESULT: {"POWER":"OFF"}
`
const expectedProposal: MessageProposal = {
topic: 'cmnd/tasmota_switch/POWER',
payload: 'ON',
qos: 0,
description: 'Turn on the Tasmota switch',
}
console.log('\n[TEST] Calling LLM with Tasmota context...')
const response = await callLLM('How do I turn on this switch?', topicContext)
console.log('[TEST] LLM Response length:', response.length)
expect(expectedProposal.topic).to.match(/^cmnd\//)
expect(['ON', 'OFF', 'TOGGLE']).to.include(expectedProposal.payload)
const proposals = parseProposals(response)
console.log('[TEST] Extracted proposals:', proposals.length)
expect(proposals.length).to.be.greaterThan(0, 'LLM should propose at least one action')
const turnOnProposal = proposals.find(p =>
p.topic.startsWith('cmnd/')
)
expect(turnOnProposal).to.exist.and.not.be.undefined
if (turnOnProposal) {
expect(turnOnProposal.topic).to.match(/^cmnd\//, 'Topic should start with cmnd/')
expect(turnOnProposal.payload).to.be.oneOf(['ON', 'OFF', 'TOGGLE', '1', '0'],
'Tasmota payload should be a simple command')
expect(turnOnProposal.qos).to.be.oneOf([0, 1, 2])
expect(turnOnProposal.description).to.be.a('string').and.have.length.greaterThan(0)
console.log('[TEST] Tasmota proposal validated successfully:', turnOnProposal)
}
})
})
describe('Proposal Quality Validation', () => {
it('should propose multiple relevant actions for controllable devices', async () => {
// A good LLM response should include multiple actions:
// - Turn ON
// - Turn OFF
// - Adjust brightness
// - etc.
const topicContext = `
Topic: zigbee2mqtt/dimmable_light
Current Value: {"state": "ON", "brightness": 128, "color_temp": 370}
Value: {"state": "ON", "brightness": 128, "color_temp": 370}
Related Topics (3):
zigbee2mqtt/dimmable_light/set: {}
zigbee2mqtt/dimmable_light/get: {}
zigbee2mqtt/dimmable_light/availability: online
`
// Expected: Multiple proposals for different actions
const expectedProposalCount = 2 // At least ON/OFF
console.log('\n[TEST] Testing multiple action proposals...')
const response = await callLLM('What can I do with this light?', topicContext)
console.log('[TEST] LLM Response length:', response.length)
expect(expectedProposalCount).to.be.at.least(2)
const proposals = parseProposals(response)
console.log('[TEST] Extracted proposals:', proposals.length)
// Should propose multiple actions for a controllable device
expect(proposals.length).to.be.at.least(1, 'LLM should propose at least one action')
// Validate each proposal
proposals.forEach((proposal, index) => {
console.log(`[TEST] Validating proposal ${index + 1}:`, proposal)
expect(proposal.topic).to.be.a('string').and.have.length.greaterThan(0)
expect(proposal.payload).to.be.a('string')
expect(proposal.qos).to.be.oneOf([0, 1, 2])
expect(proposal.description).to.be.a('string').and.have.length.greaterThan(0)
})
})
it('should provide clear, actionable descriptions', async () => {
const proposal: MessageProposal = {
topic: 'home/light/set',
payload: 'ON',
qos: 0,
description: 'Turn on the light',
}
const topicContext = `
Topic: home/light/set
Value: OFF
`
// Description should:
// - Be in imperative form (command)
// - Clearly state what the action does
// - Be under 100 characters
expect(proposal.description).to.match(/^(Turn|Set|Toggle|Switch|Change)/)
expect(proposal.description.length).to.be.lessThan(100)
console.log('\n[TEST] Testing description quality...')
const response = await callLLM('Turn on the light', topicContext)
const proposals = parseProposals(response)
expect(proposals.length).to.be.greaterThan(0)
proposals.forEach((proposal) => {
// Description should be in imperative form (command)
expect(proposal.description).to.match(/^(Turn|Set|Toggle|Switch|Change|Adjust|Control)/i,
'Description should start with an action verb')
// Description should be clear and concise
expect(proposal.description.length).to.be.lessThan(100,
'Description should be under 100 characters')
expect(proposal.description.length).to.be.greaterThan(5,
'Description should be meaningful')
console.log('[TEST] Description validated:', proposal.description)
})
})
it('should match payload format to detected system', async () => {
// zigbee2mqtt typically uses JSON
const zigbeeProposal: MessageProposal = {
topic: 'zigbee2mqtt/device/set',
payload: '{"state": "ON"}',
qos: 0,
description: 'Turn on',
}
// Test zigbee2mqtt (JSON payloads)
const zigbeeContext = `
Topic: zigbee2mqtt/device/set
Value: {"state": "OFF"}
`
// Tasmota typically uses simple strings
const tasmotaProposal: MessageProposal = {
topic: 'cmnd/device/POWER',
payload: 'ON',
qos: 0,
description: 'Turn on',
}
console.log('\n[TEST] Testing zigbee2mqtt payload format...')
const zigbeeResponse = await callLLM('Turn this on', zigbeeContext)
const zigbeeProposals = parseProposals(zigbeeResponse)
expect(() => JSON.parse(zigbeeProposal.payload)).to.not.throw()
expect(['ON', 'OFF', 'TOGGLE']).to.include(tasmotaProposal.payload)
expect(zigbeeProposals.length).to.be.greaterThan(0)
const zigbeeProposal = zigbeeProposals[0]
expect(() => JSON.parse(zigbeeProposal.payload)).to.not.throw('zigbee2mqtt payload should be valid JSON')
console.log('[TEST] zigbee2mqtt proposal:', zigbeeProposal)
// Test Tasmota (simple string payloads)
const tasmotaContext = `
Topic: cmnd/device/POWER
Value: OFF
`
console.log('\n[TEST] Testing Tasmota payload format...')
const tasmotaResponse = await callLLM('Turn this on', tasmotaContext)
const tasmotaProposals = parseProposals(tasmotaResponse)
expect(tasmotaProposals.length).to.be.greaterThan(0)
const tasmotaProposal = tasmotaProposals[0]
// Tasmota typically uses simple strings, but might also use JSON
// Accept both formats
const isSimpleString = ['ON', 'OFF', 'TOGGLE', '1', '0'].includes(tasmotaProposal.payload)
const isValidJSON = (() => {
try { JSON.parse(tasmotaProposal.payload); return true } catch { return false }
})()
expect(isSimpleString || isValidJSON).to.be.true
console.log('[TEST] Tasmota proposal:', tasmotaProposal)
})
})
describe('Edge Cases', () => {
it('should not propose actions for read-only sensors', async () => {
it('should handle read-only sensors appropriately', async () => {
const topicContext = `
Topic: sensors/temperature
Current Value: 23.5
Topic Type: Temperature sensor (read-only)
Value: 23.5
Messages: 1000
`
// LLM should recognize this is read-only and not propose write actions
// This is a qualitative test - the LLM should understand sensor vs actuator
console.log('\n[TEST] Testing read-only sensor handling...')
const response = await callLLM('What can I do with this sensor?', topicContext)
console.log('[TEST] LLM Response length:', response.length)
const proposals = parseProposals(response)
console.log('[TEST] Extracted proposals for sensor:', proposals.length)
// For read-only sensors, the LLM might not propose actions, or might propose monitoring/analysis
// This is not a strict requirement but we validate the response is sensible
if (proposals.length > 0) {
// If proposals are made, they should not be write actions
proposals.forEach(proposal => {
console.log('[TEST] Sensor proposal:', proposal)
// Validate proposal structure even for sensors
expect(proposal.topic).to.be.a('string')
expect(proposal.description).to.be.a('string')
})
}
// The response should acknowledge this is a sensor
expect(response.toLowerCase()).to.match(/sensor|temperature|read|monitor|value/,
'Response should acknowledge sensor nature')
})
it('should handle complex nested topic structures', async () => {
const topicContext = `
Topic: home/rooms/livingroom/devices/light/main
Current Value: {"state": "OFF", "brightness": 0, "color": {"r": 255, "g": 255, "b": 255}}
Value: {"state": "OFF", "brightness": 0, "color": {"r": 255, "g": 255, "b": 255}}
Related Topics (1):
home/rooms/livingroom/devices/light/main/set: {}
`
const proposal: MessageProposal = {
topic: 'home/rooms/livingroom/devices/light/main/set',
payload: '{"state": "ON"}',
qos: 0,
description: 'Turn on the main living room light',
}
console.log('\n[TEST] Testing complex nested topics...')
const response = await callLLM('Turn this light on', topicContext)
const proposals = parseProposals(response)
expect(proposals.length).to.be.greaterThan(0)
const proposal = proposals[0]
// Should handle deep nesting correctly
expect(proposal.topic.split('/')).to.have.length.greaterThan(3)
expect(proposal.topic.split('/')).to.have.length.greaterThan(3,
'Should maintain deep topic structure')
// Should include the full path
expect(proposal.topic).to.include('home/rooms/livingroom')
console.log('[TEST] Complex topic proposal:', proposal)
})
it('should handle topics with special characters', async () => {
const topicContext = `
Topic: home/device-123/sensor_1
Current Value: active
Value: active
Related Topics (1):
home/device-123/sensor_1/control: {}
`
// Should handle hyphens, underscores, numbers
expect('home/device-123/sensor_1').to.match(/^[a-zA-Z0-9/_-]+$/)
console.log('\n[TEST] Testing special characters in topics...')
const response = await callLLM('Control this device', topicContext)
const proposals = parseProposals(response)
if (proposals.length > 0) {
const proposal = proposals[0]
// Should preserve hyphens, underscores, numbers
expect(proposal.topic).to.match(/^[a-zA-Z0-9/_-]+$/,
'Topic should only contain valid MQTT characters')
console.log('[TEST] Special character topic proposal:', proposal)
}
})
})
describe('Question Generation Quality', () => {
it('should generate relevant questions for home automation topics', async () => {
it('should generate relevant follow-up questions', async () => {
const topicContext = `
Topic: zigbee2mqtt/bedroom_light
Current Value: {"state": "OFF", "brightness": 255}
Value: {"state": "OFF", "brightness": 255}
Related Topics (2):
zigbee2mqtt/bedroom_light/set: {}
zigbee2mqtt/bedroom_light/availability: online
`
// Expected questions should be relevant to controllable lights
const expectedQuestions = [
'How can I turn this light on?',
'What is the current brightness level?',
'Can I change the color of this light?',
'How do I set a specific brightness?',
'What commands are available for this device?',
]
console.log('\n[TEST] Testing question generation...')
const response = await callLLM('What is this device?', topicContext)
console.log('[TEST] LLM Response length:', response.length)
// At least some of these topics should be covered
// This is validated in the actual implementation
// Parse question proposals from the response
const questionRegex = /```question-proposal\s*\n([\s\S]*?)\n```/g
const questions: QuestionProposal[] = []
let match
while ((match = questionRegex.exec(response)) !== null) {
try {
const questionJson = JSON.parse(match[1])
if (questionJson.question) {
questions.push({
question: questionJson.question,
category: questionJson.category,
})
}
} catch (e) {
console.warn('Failed to parse question proposal:', match[1])
}
}
console.log('[TEST] Extracted questions:', questions.length)
if (questions.length > 0) {
questions.forEach((q, index) => {
console.log(`[TEST] Question ${index + 1}:`, q)
expect(q.question).to.be.a('string').and.have.length.greaterThan(5)
expect(q.question).to.match(/\?$/, 'Question should end with ?')
if (q.category) {
expect(q.category).to.be.oneOf(['analysis', 'control', 'troubleshooting', 'optimization'])
}
})
}
// The response should be relevant to the device type
expect(response.toLowerCase()).to.match(/light|brightness|control|device/,
'Response should be relevant to the topic')
})
it('should generate analytical questions for sensor data', async () => {
it('should provide informative responses about sensor data', async () => {
const topicContext = `
Topic: sensors/temperature
Current Value: 23.5
Message Count: 1000
Value: 23.5
Messages: 1000
`
const expectedQuestions = [
'What is the temperature trend?',
'What is the average temperature?',
'Are there any anomalies in the data?',
'When was the highest temperature recorded?',
]
console.log('\n[TEST] Testing sensor data analysis...')
const response = await callLLM('Tell me about this sensor', topicContext)
console.log('[TEST] LLM Response length:', response.length)
// Questions should focus on analysis, not control
// Response should mention temperature or sensor
expect(response.toLowerCase()).to.match(/temperature|sensor|value|reading|data/,
'Response should discuss sensor data')
console.log('[TEST] Sensor analysis response preview:', response.substring(0, 200))
})
})
})