Sunday 17 July 2016

Lex Yacc Parser For HTML Processing

A Lex/Yacc parser for HTML web pages
LEX File
%{
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include "cali.tab.h"
int yywrap(void);
%}
%%
\<fieldset[^\>]*\>\<legend\>(Award|Fellow|Patent|Member.?.?Editorial.?Board|Copyrights|Text) {
    /* Fieldset opener whose legend begins with one of the listed headings.
     * The attribute part of the tag has no length limit, so the copy into
     * the fixed-size semantic value is bounded and always terminated. */
    snprintf(yylval.cval, sizeof yylval.cval, "%s", yytext);
    return UNWANTED;
}
\<fieldset[^\>]*\>\<legend\> {
    /* Any other fieldset opener followed by a legend tag. */
    snprintf(yylval.cval, sizeof yylval.cval, "%s", yytext);
    return BEGINFIELD;
}
\<\/fieldset\> {
    /* Closing fieldset tag. */
    snprintf(yylval.cval, sizeof yylval.cval, "%s", yytext);
    return ENDFIELD;
}
\<fieldset[^\>]*\>\<!--<legend\>.?[a-zA-Z\.]*[ \r]?[a-zA-Z\.]*[ \r]?[a-zA-Z\.]* {
    /* Fieldset opener followed by a commented-out legend and up to three
     * name words. The matched text is split on the closing angle bracket:
     * the first piece is the fieldset tag, the second the comment and legend
     * opener, the third the name itself. The name is echoed to stdout and
     * appended to out.txt as a Name record. */
    strcpy(yylval.cval, "<fieldset>");
    strtok(yytext, ">");
    strtok(NULL, ">");
    const char *name = strtok(NULL, ">");
    if (name == NULL) {
        /* Every name part of the pattern is optional, so there may be no
         * third piece; print an empty name rather than pass NULL to printf. */
        name = "";
    }
    FILE *fp = fopen("out.txt", "a");
    if (fp == NULL) {
        perror("out.txt");
    } else {
        fprintf(fp, "Name$%s\n", name);
        fclose(fp);
    }
    printf("Name$%s\n", name);
    return NAME;
}
Contact[ \r]?Addresses {
    /* Contact section heading: copy the match, terminator included. */
    memcpy(yylval.cval, yytext, (size_t)yyleng + 1);
    return CONTACT;
}
\+[0-9\- \r]{18} {
    /* A plus sign followed by exactly 18 digits, dashes, spaces or
     * carriage returns. */
    memcpy(yylval.cval, yytext, (size_t)yyleng + 1);
    return PHONE;
}
Research[ \r]?Areas {
    /* Research areas section heading. */
    memcpy(yylval.cval, yytext, (size_t)yyleng + 1);
    return RESAREA;
}
Member[ \r]?of[ \r]?Professional[ \r]?Bodies {
    /* Writes the section header to out.txt, then returns RESAREA so the
     * grammar treats this section with the same rule as research areas.
     * A failed open is reported instead of writing through a null stream. */
    FILE *fp = fopen("out.txt", "a");
    if (fp == NULL) {
        perror("out.txt");
    } else {
        fputs("\nMember of Professional Bodies$", fp);
        fclose(fp);
    }
    strcpy(yylval.cval, yytext);
    return RESAREA;
}
Current[ \r]?Sponsored[ \r]?Projects {
    /* Section header for sponsored projects. */
    FILE *fp = fopen("out.txt", "a");
    if (fp == NULL) {
        perror("out.txt");
    } else {
        fputs("\nCurrent Sponsored Projects", fp);
        fclose(fp);
    }
    strcpy(yylval.cval, yytext);
    return SPONSORED;
}
Project[ \r]?Title {
    /* Column heading: emits a field separator and returns no token, so the
     * scanner simply continues with the next match. */
    FILE *fp = fopen("out.txt", "a");
    if (fp == NULL) {
        perror("out.txt");
    } else {
        fputs("$", fp);
        fclose(fp);
    }
}
On-going[ \r]?Consultancy[ \r]?Projects {
    /* Section header for consultancy projects; shares the SPONSORED token. */
    FILE *fp = fopen("out.txt", "a");
    if (fp == NULL) {
        perror("out.txt");
    } else {
        fputs("\nOn-going Consultancy Projects", fp);
        fclose(fp);
    }
    strcpy(yylval.cval, yytext);
    return SPONSORED;
}
Project[ \r]?Name {
    /* Column heading: emits a field separator and returns no token. */
    FILE *fp = fopen("out.txt", "a");
    if (fp == NULL) {
        perror("out.txt");
    } else {
        fputs("$", fp);
        fclose(fp);
    }
}
[A-Z0-9\-\/ \r]{2,10} {
    /* Two to ten upper-case letters, digits, dashes, slashes, spaces or
     * carriage returns. */
    memcpy(yylval.cval, yytext, (size_t)yyleng + 1);
    return QTRNO;
}
Publications:([ \r]?[0-9]{4}[ \r]?\-[ \r]?[0-9]{4}) {
    /* Publications heading followed by a four-digit year range. */
    memcpy(yylval.cval, yytext, (size_t)yyleng + 1);
    return PUBLICATIONS;
}
Some[ \r]?Earlier[ \r]?Publications {
    /* Alternative publications heading; same token as the dated one. */
    memcpy(yylval.cval, yytext, (size_t)yyleng + 1);
    return PUBLICATIONS;
}
\([0-9]{4}\) {
    /* Four digits in parentheses. */
    memcpy(yylval.cval, yytext, (size_t)yyleng + 1);
    return YR;
}
\<[^\>]*\> {
    /* Any other markup tag. A tag can be longer than the semantic-value
     * buffer, so the copy is bounded and always terminated. */
    snprintf(yylval.cval, sizeof yylval.cval, "%s", yytext);
    return TAG;
}
[a-zA-Z0-9:\.\& \r\n\t\-\/]+ {
    /* A run of text characters of any length; bounded copy for the same
     * reason as the tag rule. Over-long text is truncated. */
    snprintf(yylval.cval, sizeof yylval.cval, "%s", yytext);
    return VALUE;
}
. ;
%%
/*
 * Called by the scanner when it reaches end of input.
 * Prints a notice and returns 1, which tells the scanner there is no
 * further input to switch to. The original fell off the end of a non-void
 * function, leaving the value read by the scanner undefined.
 */
int yywrap(void){
    printf("End of parsing \n");
    return 1;
}
/*
 * Parser error callback. Writes a fixed notice to stdout; the message text
 * supplied by the parser is intentionally not used.
 */
void yyerror(const char *str){
    (void)str;
    fputs(" Invalid Character... \n", stdout);
}
/*
 * Entry point: expects exactly one argument, the input file to parse.
 *
 * Truncates out.txt, runs the parser over the input, then calls process().
 * Returns 0 on success and EXIT_FAILURE on a usage error or when either
 * file cannot be opened.
 */
int main(int argc, char** argv) {
    /* Validate the command line first so a usage error does not wipe an
     * existing out.txt. */
    if (argc != 2) {
        printf("Provide input fiile\n");
        return EXIT_FAILURE;
    }

    /* Start from an empty out.txt; every action appends to it. */
    FILE *fp = fopen("out.txt", "w");
    if (fp == NULL) {
        perror("out.txt");
        return EXIT_FAILURE;
    }
    fclose(fp);

    printf("Start of parsing \n");
    yyin = fopen(argv[1], "r");
    if (yyin == NULL) {
        perror(argv[1]);
        return EXIT_FAILURE;
    }

    /* yyparse() returns early on a syntax error; calling it again resumes
     * with the remaining input until the stream reports end of file. */
    while (!feof(yyin)) {
        yyparse();
    }
    usleep(10);
    /* NOTE(review): process() is neither declared nor defined in the visible
     * source; confirm where it comes from before building. */
    process();
    return 0;
}
YACC File
%{
#include<stdio.h>
#include<string.h>
int yylex(void);
void yyerror(const char *s);
%}
/* Semantic value type: one fixed-size text buffer, filled in by the lexer
   actions and read by the grammar actions. */
%union
{
char cval[1500];
}
/* Terminals produced by the lexer; each one carries its text in cval. */
%token <cval> BEGINFIELD
%token <cval> ENDFIELD
%token <cval> NAME
%token <cval> TAG
%token <cval> VALUE
%token <cval> CONTACT
%token <cval> QTRNO
%token <cval> UNWANTED
%token <cval> PHONE
%token <cval> RESAREA
/* NOTE(review): no visible lexer rule returns MEMBER (the matching heading
   is returned as RESAREA); confirm whether this token is still needed. */
%token <cval> MEMBER
%token <cval> SPONSORED
%token <cval> PUBLICATIONS
%token <cval> YR
/* Nonterminals, one per kind of fieldset body, also typed as cval. */
%type <cval> S N C R U SP P
%%
/* Start symbol: the whole input is one sequence S. */
state : S
      ;

/* S is a left-recursive sequence of loose tokens and recognised fieldsets.
   Each fieldset form hands its body to a dedicated nonterminal. */
S : S VALUE
  | VALUE
  | S TAG
  | TAG
  | S QTRNO
  | QTRNO
  | S NAME N ENDFIELD
  | S BEGINFIELD CONTACT C ENDFIELD {
        /* Once the contact fieldset is closed, write the header for the
           section that follows. A failed open is reported rather than
           written through a null stream. */
        FILE *fp = fopen("out.txt", "a");
        if (fp == NULL) {
            perror("out.txt");
        } else {
            fputs("\nResearch Areas$", fp);
            fclose(fp);
        }
    }
  | S BEGINFIELD RESAREA R ENDFIELD
  | S UNWANTED U ENDFIELD
  | S BEGINFIELD SPONSORED SP ENDFIELD
  | S BEGINFIELD PUBLICATIONS P ENDFIELD
  | S CONTACT
  | CONTACT
  | S PHONE
  | PHONE
  | UNWANTED
  | PUBLICATIONS
  | S PUBLICATIONS
  | S YR
  | YR
  | S RESAREA
  | RESAREA
  | S MEMBER
  | MEMBER
  | SPONSORED
  | S SPONSORED
  ;
/* N: body of the name fieldset. A VALUE that mentions a known job title is
   reported as the designation; every TAG contributes a single space to
   out.txt. */
N : VALUE
  | N VALUE {
        /* Read the token text in place. The original copied it into a
           100-byte buffer with sprintf, which overflows for longer values
           (the value buffer holds up to 1500 bytes). */
        const char *post = $2;
        /* NOTE(review): strcasestr is a GNU/BSD extension; confirm the build
           enables its declaration. Only the Network Engineer test is
           case-sensitive. */
        if (strcasestr(post, "Professor") || strcasestr(post, "Director") ||
            strcasestr(post, "Dean") || strcasestr(post, "Faculty") ||
            strcasestr(post, "Lecturer") || strstr(post, "Network Engineer") ||
            strcasestr(post, "Superintendent") || strcasestr(post, "Registrar") ||
            strcasestr(post, "Librarian")) {
            /* The first two characters of the text are skipped on output;
               presumably leading separators - confirm against real input. */
            printf("Designation$%s\n", post + 2);
            FILE *fp = fopen("out.txt", "a");
            if (fp == NULL) {
                perror("out.txt");
            } else {
                fprintf(fp, "Designation$%s\n", post + 2);
                fclose(fp);
            }
        }
    }
  | N TAG {
        FILE *fp = fopen("out.txt", "a");
        if (fp == NULL) {
            perror("out.txt");
        } else {
            fputs(" ", fp);
            fclose(fp);
        }
    }
  | TAG {
        FILE *fp = fopen("out.txt", "a");
        if (fp == NULL) {
            perror("out.txt");
        } else {
            fputs(" ", fp);
            fclose(fp);
        }
    }
  | N QTRNO
  | QTRNO
  | N CONTACT
  | CONTACT
  | N PHONE
  | PHONE
  | N YR
  | YR
  ;
/* C: body of the contact fieldset. QTRNO and PHONE tokens become labelled
   records in out.txt; every TAG contributes a single space. Each write
   checks that out.txt could be opened. */
C : VALUE
  | C VALUE
  | C TAG {
        FILE *fp = fopen("out.txt", "a");
        if (fp == NULL) {
            perror("out.txt");
        } else {
            fputs(" ", fp);
            fclose(fp);
        }
    }
  | TAG {
        FILE *fp = fopen("out.txt", "a");
        if (fp == NULL) {
            perror("out.txt");
        } else {
            fputs(" ", fp);
            fclose(fp);
        }
    }
  | C QTRNO {
        FILE *fp = fopen("out.txt", "a");
        if (fp == NULL) {
            perror("out.txt");
        } else {
            fprintf(fp, "QTRNO$%s\n", $2);
            fclose(fp);
        }
    }
  | QTRNO {
        FILE *fp = fopen("out.txt", "a");
        if (fp == NULL) {
            perror("out.txt");
        } else {
            fprintf(fp, "QTRNO$%s\n", $1);
            fclose(fp);
        }
    }
  | PHONE {
        FILE *fp = fopen("out.txt", "a");
        if (fp == NULL) {
            perror("out.txt");
        } else {
            fprintf(fp, "PHONE$%s\n", $1);
            fclose(fp);
        }
    }
  | C PHONE {
        FILE *fp = fopen("out.txt", "a");
        if (fp == NULL) {
            perror("out.txt");
        } else {
            fprintf(fp, "PHONE$%s\n", $2);
            fclose(fp);
        }
    }
  | C YR
  | YR
  ;
/* R: body of a RESAREA fieldset. Each VALUE is appended to out.txt followed
   by a field separator; every TAG contributes a single space. Each write
   checks that out.txt could be opened. */
R : VALUE {
        FILE *fp = fopen("out.txt", "a");
        if (fp == NULL) {
            perror("out.txt");
        } else {
            fprintf(fp,"%s$", $1);
            fclose(fp);
        }
    }
  | R VALUE {
        FILE *fp = fopen("out.txt", "a");
        if (fp == NULL) {
            perror("out.txt");
        } else {
            fprintf(fp,"%s$", $2);
            fclose(fp);
        }
    }
  | QTRNO
  | R QTRNO
  | R TAG {
        FILE *fp = fopen("out.txt", "a");
        if (fp == NULL) {
            perror("out.txt");
        } else {
            fputs(" ", fp);
            fclose(fp);
        }
    }
  | TAG {
        FILE *fp = fopen("out.txt", "a");
        if (fp == NULL) {
            perror("out.txt");
        } else {
            fputs(" ", fp);
            fclose(fp);
        }
    }
  | R YR
  | YR
  ;
/* U: body of an UNWANTED fieldset. Every token kind is accepted and none
   has an action, so nothing is written for these sections. */
U : VALUE
  | U VALUE
  | U TAG
  | TAG
  | U QTRNO
  | QTRNO
  | U YR
  | YR
  | U PHONE
  | PHONE
  ;
/* SP: body of a SPONSORED fieldset. VALUE text is appended to out.txt
   verbatim; every TAG contributes a single space. Each write checks that
   out.txt could be opened. */
SP : VALUE {
        FILE *fp = fopen("out.txt", "a");
        if (fp == NULL) {
            perror("out.txt");
        } else {
            fprintf(fp,"%s", $1);
            fclose(fp);
        }
    }
   | SP VALUE {
        FILE *fp = fopen("out.txt", "a");
        if (fp == NULL) {
            perror("out.txt");
        } else {
            fprintf(fp,"%s", $2);
            fclose(fp);
        }
    }
   | SP TAG {
        FILE *fp = fopen("out.txt", "a");
        if (fp == NULL) {
            perror("out.txt");
        } else {
            fputs(" ", fp);
            fclose(fp);
        }
    }
   | TAG {
        FILE *fp = fopen("out.txt", "a");
        if (fp == NULL) {
            perror("out.txt");
        } else {
            fputs(" ", fp);
            fclose(fp);
        }
    }
   | SP YR
   | YR
   | SP QTRNO
   | QTRNO
   ;
/* P: body of a PUBLICATIONS fieldset. VALUE text is appended to out.txt
   verbatim, every TAG contributes a single space, and a YR token is written
   wrapped in field separators. Each write checks that out.txt could be
   opened. */
P : VALUE {
        FILE *fp = fopen("out.txt", "a");
        if (fp == NULL) {
            perror("out.txt");
        } else {
            fprintf(fp,"%s", $1);
            fclose(fp);
        }
    }
  | P VALUE {
        FILE *fp = fopen("out.txt", "a");
        if (fp == NULL) {
            perror("out.txt");
        } else {
            fprintf(fp,"%s", $2);
            fclose(fp);
        }
    }
  | P TAG {
        FILE *fp = fopen("out.txt", "a");
        if (fp == NULL) {
            perror("out.txt");
        } else {
            fputs(" ", fp);
            fclose(fp);
        }
    }
  | TAG {
        FILE *fp = fopen("out.txt", "a");
        if (fp == NULL) {
            perror("out.txt");
        } else {
            fputs(" ", fp);
            fclose(fp);
        }
    }
  | P YR {
        FILE *fp = fopen("out.txt", "a");
        if (fp == NULL) {
            perror("out.txt");
        } else {
            fprintf(fp, "%s%s%s", "$", $2, "$$");
            fclose(fp);
        }
    }
  | YR {
        FILE *fp = fopen("out.txt", "a");
        if (fp == NULL) {
            perror("out.txt");
        } else {
            fprintf(fp, "%s%s%s", "$", $1, "$$");
            fclose(fp);
        }
    }
  | P QTRNO
  | QTRNO
  ;
%%
Makefile
# Builds parser.out from the bison grammar (cali.y) and the lex scanner (cali.l).
# Recipe lines must begin with a tab character.
.PHONY: all yaccCompile lexCompile compile1

all : yaccCompile lexCompile compile1

# -d also writes cali.tab.h, which the scanner includes.
yaccCompile:
	bison -d cali.y

lexCompile:
	lex cali.l

# Needs both generated sources, so it depends on the two generator targets.
compile1: yaccCompile lexCompile
	gcc cali.tab.c lex.yy.c -o parser.out
Share:

0 comments:

Post a Comment

Contact Me

Name

Email *

Message *

Popular Posts

Blog Archive