const express = require('express');
const axios = require('axios');
const xml2js = require('xml2js'); // NOTE(review): not referenced in this part of the file
const cheerio = require('cheerio');

const app = express();
const port = process.env.PORT || 8080;

// Middleware to parse JSON requests
app.use(express.json());

// Removes anything shaped like an HTML/XML tag, including tags whose quoted
// attribute values contain '>' characters.
const HTML_TAG_PATTERN = /<(?:"[^"]*"['"]*|'[^']*'['"]*|[^'">])+>/g;

/**
 * Strips tag-like markup from a string.
 *
 * @param {string} text - Text that may contain markup.
 * @returns {string} The text with every match of HTML_TAG_PATTERN removed.
 */
function stripTags(text) {
  return text.replace(HTML_TAG_PATTERN, '');
}

/**
 * Converts a JSON-supplied value to an integer.
 *
 * @param {*} value - A number or a numeric string.
 * @returns {number} The integer value, or NaN for anything else.
 */
function toInteger(value) {
  const isNumeric =
    typeof value === 'number' || (typeof value === 'string' && value.trim() !== '');
  const parsed = isNumeric ? Number(value) : NaN;
  return Number.isInteger(parsed) ? parsed : NaN;
}

/**
 * Validates the chunking options shared by textSplitter and both endpoints.
 *
 * @param {*} chunkSize - Maximum chunk length; must be a positive integer.
 * @param {*} chunkOverlap - Requested overlap; a non-negative integer,
 *   defaulting to 0 when null or undefined.
 * @returns {{size: number, overlap: number}|{error: string}} The normalized
 *   options, or an object carrying a human-readable validation message.
 */
function parseChunkOptions(chunkSize, chunkOverlap) {
  const size = toInteger(chunkSize);
  // `!(x > 0)` also rejects NaN, which a plain `x <= 0` would let through.
  if (!(size > 0)) {
    return { error: 'chunkSize must be a positive integer' };
  }
  const overlap = chunkOverlap == null ? 0 : toInteger(chunkOverlap);
  if (!(overlap >= 0)) {
    return { error: 'chunkOverlap must be a non-negative integer' };
  }
  return { size, overlap };
}

/**
 * Checks that a value is an absolute http: or https: URL.
 *
 * @param {*} value - Candidate URL.
 * @returns {boolean} True only for a string that parses as an http(s) URL.
 */
function isHttpUrl(value) {
  if (typeof value !== 'string') {
    return false;
  }
  try {
    const { protocol } = new URL(value);
    return protocol === 'http:' || protocol === 'https:';
  } catch (error) {
    // `new URL` throws on anything that is not an absolute URL.
    return false;
  }
}

/**
 * Splits text into chunks of at most `chunkSize` characters (UTF-16 code
 * units), preferring to break on a space so words stay intact, with each new
 * chunk starting roughly `overlap` characters before the end of the previous
 * one.
 *
 * Splitting always makes progress:
 * - When a chunk-sized window contains no usable space (a very long token, or
 *   text written without spaces), the text is cut at exactly `chunkSize`, and
 *   the overlap is applied by character count.
 * - When a space was found for the end of the chunk but none is available at
 *   or before the overlap position, the next chunk starts where this one
 *   ended, i.e. without overlap.
 *
 * @param {string} text - Text to split. Returned as-is (untrimmed) in a
 *   one-element array when it already fits in a single chunk.
 * @param {number|string} chunkSize - Maximum chunk length; positive integer.
 * @param {number|string} [overlap=0] - Requested overlap; non-negative integer.
 * @returns {string[]} Trimmed, non-empty chunks in document order.
 * @throws {TypeError} If `text` is not a string.
 * @throws {RangeError} If `chunkSize` or `overlap` is not a valid integer.
 */
function textSplitter(text, chunkSize, overlap) {
  if (typeof text !== 'string') {
    throw new TypeError('text must be a string');
  }
  const options = parseChunkOptions(chunkSize, overlap);
  if (options.error) {
    throw new RangeError(options.error);
  }
  const { size } = options;
  const overlapSize = options.overlap;

  if (text.length <= size) {
    return [text];
  }

  const chunks = [];
  let remaining = text;

  while (remaining.length > size) {
    // Last space at or before index `size`; a space at index 0 is useless
    // because cutting there would produce an empty chunk.
    const boundary = remaining.lastIndexOf(' ', size);
    let end;
    let next;

    if (boundary > 0) {
      end = boundary;
      // Start the next chunk on the last space at or before the overlap point.
      const overlapStart = remaining.lastIndexOf(' ', end - overlapSize);
      next = overlapStart > 0 ? overlapStart : end;
    } else {
      // No word boundary in this window: hard split.
      end = size;
      next = overlapSize < size ? size - overlapSize : size;
    }

    const chunk = remaining.substring(0, end).trim();
    if (chunk !== '') {
      chunks.push(chunk);
    }
    // `next` is always > 0, so `remaining` strictly shrinks and the loop ends.
    remaining = remaining.substring(next).trim();
  }

  if (remaining !== '') {
    chunks.push(remaining);
  }
  return chunks;
}

/**
 * POST /extract
 *
 * Body: { source, selector, chunkSize, chunkOverlap }.
 * Fetches `source`, takes the text of the elements matching `selector`,
 * strips tag-like markup and splits the result into chunks.
 *
 * Responses:
 * - 200 { content, chunks } on success.
 * - 400 { error } when the request body is invalid.
 * - 500 { error } when fetching, parsing or splitting fails.
 */
app.post('/extract', async (req, res) => {
  const { source, selector, chunkSize, chunkOverlap } = req.body || {};

  if (!isHttpUrl(source)) {
    return res.status(400).json({ error: 'source must be an absolute http(s) URL' });
  }
  if (typeof selector !== 'string' || selector.trim() === '') {
    return res.status(400).json({ error: 'selector must be a non-empty string' });
  }
  const options = parseChunkOptions(chunkSize, chunkOverlap);
  if (options.error) {
    return res.status(400).json({ error: options.error });
  }

  let htmlContent;
  try {
    // NOTE(review): this requests whatever http(s) URL the caller supplies,
    // including hosts that are only reachable from this server (SSRF).
    // Restrict the allowed targets before exposing this endpoint publicly.
    const response = await axios.get(source);
    htmlContent = response.data;
  } catch (error) {
    console.error(error);
    return res.status(500).json({
      error: `Error fetching content from ${source}: ${error.message}`,
    });
  }

  try {
    const $ = cheerio.load(htmlContent);
    const selectedContent = stripTags($(selector).text().trim());
    const chunks = textSplitter(selectedContent, options.size, options.overlap);
    return res.json({ content: selectedContent, chunks });
  } catch (error) {
    // Report the failure instead of answering 200 with empty chunks.
    console.error(error);
    return res.status(500).json({
      error: `Error extracting content from ${source}: ${error.message}`,
    });
  }
});

/**
 * POST /split
 *
 * Body: { content, chunkSize, chunkOverlap }.
 * Trims `content`, strips tag-like markup and splits the result into chunks.
 *
 * Responses:
 * - 200 { content, chunks } on success.
 * - 400 { error } when the request body is invalid.
 * - 500 { error } when splitting fails unexpectedly.
 */
app.post('/split', (req, res) => {
  const { content, chunkSize, chunkOverlap } = req.body || {};

  if (typeof content !== 'string') {
    return res.status(400).json({ error: 'content must be a string' });
  }
  const options = parseChunkOptions(chunkSize, chunkOverlap);
  if (options.error) {
    return res.status(400).json({ error: options.error });
  }

  try {
    const selectedContent = stripTags(content.trim());
    const chunks = textSplitter(selectedContent, options.size, options.overlap);
    return res.json({ content: selectedContent, chunks });
  } catch (error) {
    console.error(error);
    return res.status(500).json({ error: `Error splitting content: ${error.message}` });
  }
});

// Start the server
app.listen(port, () => {
  console.log(`Server is running on port ${port}`);
});