Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: auto-extract data #192

Draft
wants to merge 39 commits into
base: develop
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
d7cd86e
feat: add capture list manual and auto options buttons
RohitR311 Nov 20, 2024
9cafb6a
feat: getListAuto browser action
amhsirak Nov 20, 2024
45a2534
feat: remove handle pair delete
amhsirak Nov 20, 2024
8315d85
feat: invoke startGetListAuto on handleAutoCapture
amhsirak Nov 20, 2024
17e14d1
feat: handle get list auto for highlighter
amhsirak Nov 20, 2024
dcea137
feat: add listAuto functions
RohitR311 Nov 20, 2024
6e9d4e5
feat: add getListAuto context
RohitR311 Nov 20, 2024
82ba3fc
feat: add getListAuto functionality
RohitR311 Nov 20, 2024
a80760a
feat: add handle list auto settings object
RohitR311 Nov 20, 2024
c51e42d
feat: emit setGetListAuto socket event
amhsirak Nov 20, 2024
70de2a7
feat: send highlighter data for get list auto
amhsirak Nov 20, 2024
517216f
feat: set get list auto true
amhsirak Nov 20, 2024
6db7a19
feat: extract child data
amhsirak Nov 20, 2024
4af621f
feat: get data as per importance
amhsirak Nov 21, 2024
86ba1c8
feat: importance by element
amhsirak Nov 21, 2024
eb5ba66
feat: handle a and img urls
amhsirak Nov 21, 2024
ab16ae3
feat: generate identical non unique selectors
amhsirak Nov 21, 2024
b406dd7
chore: whitespace cleanup
amhsirak Nov 21, 2024
85f3c61
feat: handle selector null check
amhsirak Nov 21, 2024
38a1e56
chore: remove console log
amhsirak Nov 21, 2024
1410616
feat: get list auto ref
amhsirak Nov 21, 2024
baa7acf
feat: pass getListAuto ref in useEffect
amhsirak Nov 21, 2024
339de5b
feat: pass getListAuto in dependency array
amhsirak Nov 21, 2024
2f03f96
feat(temp): -rm getListAuto standalone
amhsirak Nov 21, 2024
086ffce
feat(temp): -rm setGetListAuto socket events
amhsirak Nov 21, 2024
3daa890
chore: lint
amhsirak Nov 21, 2024
a5455a6
feat: remove attribute selection for getListAuto
amhsirak Nov 21, 2024
7b6b4ee
feat: remove getListAuto condition for generic modal
amhsirak Nov 21, 2024
2d8ab35
feat: emit socket event for getList
amhsirak Nov 21, 2024
0a0e27e
feat: auto populate getListAuto fields
amhsirak Nov 21, 2024
e189685
feat: auto populate getListAuto fields
amhsirak Nov 21, 2024
fcc0a13
feat: return attribute and tag in auto extract
amhsirak Nov 21, 2024
a30c3f2
feat: auto extract UI
amhsirak Nov 22, 2024
73f2e98
wip: error handling
amhsirak Nov 22, 2024
2400aa3
feat: use child attribute & tag
amhsirak Nov 22, 2024
101b9b2
Merge branch 'develop' into add-capturelist-ui
amhsirak Nov 22, 2024
b92cfbf
fix: resolve merge conflicts
amhsirak Nov 22, 2024
099c83e
Merge branch 'add-capturelist-ui' of https://github.com/getmaxun/maxu…
amhsirak Nov 22, 2024
bf21b7f
fix: resolve merge conflicts
amhsirak Nov 22, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 20 additions & 4 deletions server/src/workflow-management/classes/Generator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@ import {
getChildSelectors,
getNonUniqueSelectors,
isRuleOvershadowing,
selectorAlreadyInWorkflow
selectorAlreadyInWorkflow,
extractChildData
} from "../selector";
import { CustomActions } from "../../../../src/shared/types";
import { workflow } from "../../routes";
Expand Down Expand Up @@ -61,6 +62,8 @@ export class WorkflowGenerator {
*/
private getList: boolean = false;

// private getListAuto: boolean = false;

private listSelector: string = '';

/**
Expand Down Expand Up @@ -116,6 +119,9 @@ export class WorkflowGenerator {
this.socket.on('setGetList', (data: { getList: boolean }) => {
this.getList = data.getList;
});
// this.socket.on('setGetListAuto', (data: { getListAuto: boolean }) => {
// this.getListAuto = data.getListAuto;
// });
this.socket.on('listSelector', (data: { selector: string }) => {
this.listSelector = data.selector;
})
Expand Down Expand Up @@ -559,12 +565,22 @@ export class WorkflowGenerator {
if (this.listSelector !== '') {
const childSelectors = await getChildSelectors(page, this.listSelector || '');
this.socket.emit('highlighter', { rect, selector: displaySelector, elementInfo, childSelectors })
console.log(`Child Selectors: ${childSelectors}`)
console.log(`Parent Selector: ${this.listSelector}`)
} else {
this.socket.emit('highlighter', { rect, selector: displaySelector, elementInfo });
}
} else {
}
// else if (this.getListAuto === true) {
// if (this.listSelector !== '') {
// console.log(`list selector is: ${this.listSelector}`)
// const childData = await extractChildData(page, this.listSelector || '');
// console.log(`Data From Backend: ${JSON.stringify({ rect, selector: displaySelector, elementInfo, childData })}`)
// this.socket.emit('highlighter', { rect, selector: displaySelector, elementInfo, childData });
// }
// else {
// this.socket.emit('highlighter', { ayo:'ayo', rect, selector: displaySelector, elementInfo });
// }
// }
else {
this.socket.emit('highlighter', { rect, selector: displaySelector, elementInfo });
}
}
Expand Down
141 changes: 141 additions & 0 deletions server/src/workflow-management/selector.ts
Original file line number Diff line number Diff line change
Expand Up @@ -790,6 +790,147 @@ export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates
}
};

/**
 * Collects candidate field values from every descendant of the first element
 * matching `parentSelector` on the given page.
 *
 * For each descendant (visited parent-first, in document order):
 *  - headings (h1-h6) and p/span/a/img contribute their whitespace-collapsed
 *    text, when non-empty;
 *  - `a` elements additionally contribute their `href`, and `img` elements
 *    their `src`, resolved against the page URL.
 *
 * Entries are de-duplicated by their `data` string. When the same value is
 * produced by several elements, the one from the more important tag wins
 * (headings over p/span/a/img); ties keep the first element seen. Output order
 * follows the first appearance of each distinct value.
 *
 * @param page - Page to run the extraction in.
 * @param parentSelector - CSS selector of the list container; only the first match is used.
 * @returns One record per distinct value: the value itself, a tag+class
 *   selector for the source element (not guaranteed unique), the attribute it
 *   was read from, and the lower-cased tag name. Returns an empty array when
 *   the parent is not found or when anything throws (the error is logged).
 */
export const extractChildData = async (
  page: Page,
  parentSelector: string
): Promise<{ data: string; selector: string; attribute: string; tag: string }[]> => {
  try {
    const baseURL = new URL(page.url());

    // Everything inside this callback is evaluated in the page, so every helper
    // it needs must be declared within the callback itself.
    const uniqueData = await page.evaluate(({ parentSelector, baseHref }: { parentSelector: string, baseHref: string }) => {
      interface ElementData {
        data: string;
        selector: string;
        attribute: string; // Attribute the value was read from
        tag: string; // Lower-cased tag name of the source element
        importance: number; // Used only for de-duplication; stripped before returning
      }

      const HEADING_TAGS = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'];
      const CONTENT_TAGS = ['p', 'span', 'a', 'img'];

      /** Builds a `tag.class1.class2` selector; deliberately not unique. */
      function getNonUniqueSelector(element: HTMLElement): string {
        const tagName = element.tagName.toLowerCase();

        // className is not a string on some elements (e.g. SVG); fall back to the tag alone.
        if (typeof element.className !== 'string') {
          return tagName;
        }

        // Classes starting with '!' or containing ':' are dropped from the selector.
        const validClasses = element.className
          .split(/\s+/)
          .filter((cls: string) => cls !== '' && !cls.startsWith('!') && !cls.includes(':'));

        if (validClasses.length === 0) {
          return tagName;
        }
        return tagName + '.' + validClasses.map((cls: string) => CSS.escape(cls)).join('.');
      }

      /** Ranks a tag: 2 for headings, 1 for p/span/a/img, 0 for everything else. */
      function determineImportance(tag: string): number {
        if (HEADING_TAGS.includes(tag)) return 2;
        if (CONTENT_TAGS.includes(tag)) return 1;
        return 0;
      }

      /** Collapses runs of whitespace to a single space and trims the ends. */
      function cleanText(text: string): string {
        return text.replace(/\s+/g, ' ').trim();
      }

      /** Resolves `url` against the page URL; returns it untouched if it cannot be parsed. */
      function resolveURL(url: string): string {
        try {
          return new URL(url, baseHref).href;
        } catch {
          return url;
        }
      }

      /** Appends the entries contributed by a single element to `out`. */
      function appendElementData(element: HTMLElement, out: ElementData[]): void {
        const tag = element.tagName.toLowerCase();
        const selector = getNonUniqueSelector(element);
        const importance = determineImportance(tag);

        // Text is only taken from tags that are ranked at all.
        if (importance >= 1) {
          // The value is read from textContent but labelled 'innerText',
          // which is the attribute name emitted to consumers of this data.
          const textContent = cleanText(element.textContent || '');
          if (textContent) {
            out.push({ data: textContent, selector, attribute: 'innerText', tag, importance });
          }
        }

        // Links contribute their target, images their source.
        const urlAttribute = tag === 'a' ? 'href' : tag === 'img' ? 'src' : null;
        if (urlAttribute !== null) {
          const rawUrl = element.getAttribute(urlAttribute);
          if (rawUrl) {
            out.push({ data: resolveURL(rawUrl), selector, attribute: urlAttribute, tag, importance });
          }
        }
      }

      /**
       * Walks the subtree below `root` parent-first, appending into one shared
       * array instead of re-copying the accumulated results at every node.
       */
      function collectDescendantData(root: Element, out: ElementData[]): void {
        const children = Array.from(root.children) as HTMLElement[];
        for (const child of children) {
          appendElementData(child, out);
          collectDescendantData(child, out);
        }
      }

      const parentElement = document.querySelector<HTMLElement>(parentSelector);
      if (!parentElement) return [];

      const allData: ElementData[] = [];
      collectDescendantData(parentElement, allData);

      // De-duplicate by value. Re-setting an existing key keeps its original
      // position in the Map, so output order is that of first appearance.
      const bestByData = new Map<string, ElementData>();
      for (const item of allData) {
        const existing = bestByData.get(item.data);
        if (!existing || item.importance > existing.importance) {
          bestByData.set(item.data, item);
        }
      }

      // Strip the internal importance field before handing results back.
      return Array.from(bestByData.values()).map((item) => ({
        data: item.data,
        selector: item.selector,
        attribute: item.attribute,
        tag: item.tag,
      }));
    }, { parentSelector, baseHref: baseURL.href });

    return uniqueData;
  } catch (error) {
    // Best-effort extraction: log and report "nothing found" rather than failing the caller.
    console.error('Error in extractChildData:', error);
    return [];
  }
};

export const getChildSelectors = async (page: Page, parentSelector: string): Promise<string[]> => {
try {
Expand Down
11 changes: 8 additions & 3 deletions src/components/atoms/canvas.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,10 @@ const Canvas = ({ width, height, onCreateRef }: CanvasProps) => {
const canvasRef = useRef<HTMLCanvasElement>(null);
const { socket } = useSocketStore();
const { setLastAction, lastAction } = useGlobalInfoStore();
const { getText, getList } = useActionContext();
const { getText, getList, getListAuto } = useActionContext();
const getTextRef = useRef(getText);
const getListRef = useRef(getList);
const getListAutoRef = useRef(getListAuto);

const notifyLastAction = (action: string) => {
if (lastAction !== action) {
Expand All @@ -42,7 +43,8 @@ const Canvas = ({ width, height, onCreateRef }: CanvasProps) => {
useEffect(() => {
getTextRef.current = getText;
getListRef.current = getList;
}, [getText, getList]);
getListAutoRef.current = getListAuto;
}, [getText, getList, getListAuto]);

const onMouseEvent = useCallback((event: MouseEvent) => {
if (socket && canvasRef.current) {
Expand All @@ -59,7 +61,10 @@ const Canvas = ({ width, height, onCreateRef }: CanvasProps) => {
console.log('Capturing Text...');
} else if (getListRef.current === true) {
console.log('Capturing List...');
} else {
} else if (getListAutoRef.current === true) {
console.log('Capturing List Automatically...');
}
else {
socket.emit('input:mousedown', clickCoordinates);
}
notifyLastAction('click');
Expand Down
30 changes: 28 additions & 2 deletions src/components/molecules/ActionDescriptionBox.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,11 @@ const Content = styled.div`
`;

const ActionDescriptionBox = () => {
const { getText, getScreenshot, getList, captureStage } = useActionContext() as {
const { getText, getScreenshot, getList, captureStage, getListAuto } = useActionContext() as {
getText: boolean;
getScreenshot: boolean;
getList: boolean;
getListAuto: boolean;
captureStage: 'initial' | 'pagination' | 'limit' | 'complete';
};

Expand Down Expand Up @@ -99,7 +100,32 @@ const ActionDescriptionBox = () => {
</Box>
</>
);
} else {
} else if (getListAuto) {
return (
<>
<Typography variant="subtitle2" gutterBottom>Capture List</Typography>
<Typography variant="body2" gutterBottom>
Hover over the list you want to extract
</Typography>
<Box>
{messages.map(({ stage, text }, index) => (
<FormControlLabel
key={stage}
control={
<Checkbox
checked={index < currentStageIndex} // Check the box if we are past this stage
disabled
/>
}
label={<Typography variant="body2" gutterBottom>{text}</Typography>}
/>
))}
</Box>
</>
);
}

else {
return (
<>
<Typography variant="subtitle2" gutterBottom>What data do you want to extract?</Typography>
Expand Down
1 change: 0 additions & 1 deletion src/components/molecules/RobotEdit.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,6 @@ interface RobotSettingsProps {
}

export const RobotEditModal = ({ isOpen, handleStart, handleClose, initialSettings }: RobotSettingsProps) => {
console.log("robot edit");
const [robot, setRobot] = useState<RobotSettings | null>(null);
const { recordingId, notify } = useGlobalInfoStore();

Expand Down
Loading