[Bio] / Billogix / url_data.c Repository:
ViewVC logotype

View of /Billogix/url_data.c

Parent Directory Parent Directory | Revision Log Revision Log

Revision 1.2 - (download) (as text) (annotate)
Mon Jun 14 22:04:49 2004 UTC (15 years, 11 months ago) by olson
Branch: MAIN
CVS Tags: merge-bodev_news-3, merge-bobdev_news-2, caBIG-05Apr06-00, merge-trunktag-bobdev_news-2, merge-trunktag-bobdev_news-1, caBIG-00-00-00, myrast_33, caBIG-13Feb06-00, merge-bobdev_news-1, merge-trunktag-bodev_news-3, Root-bobdev_news, HEAD
Branch point for: Branch-bobdev_news
Changes since 1.1: +5 -3 lines
Make integration changes

/* This version uses a shell command and temporary file. */

#include <gprolog.h>

/*
  In the data fetched from the URL, assume:
    1. Fields are *SEPARATED* by \t.
    2. Records are *TERMINATED* by \n.

  We'll build a list (a Relation) of lists (Tuples) of fields as a Prolog term.

    1. This was originally meant to fetch relations (hence the terminology),
       but it is used for other kinds of data, too.
    2. The Tuples and the Relation get built (and returned) backward.
    3. Everything (including all whitespace except \t and \n) go into fields.
    4. Extra tabs cause empty fields, and
       extra newlines cause empty tuples.
    5. No checks are made to see if the data makes sense as a
       relation.  For example, tuples can be of different lengths,
       and HTML tags are not removed.
*/

#define FMAX 5000  /* maximum number of characters in a field */

PlTerm read_relation(FILE *fp)
  char field[FMAX+1];  /* allow for null char at end */
  int f;               /* current position in field */
  PlTerm tuple;        /* tuple (list of fields) being constructed */
  PlTerm relation;     /* relation (list of tuples) being constructed */

  int c;

  /* Initialize tuple and relation to empty lists. */

  tuple = Mk_Atom(atom_nil);
  relation = Mk_Atom(atom_nil);
  f = 0;

  while ((c = getc(fp)) != EOF) {

    if (c != '\t' && c != '\n') {
      /* Try to add the character to the field. */
      if (f == FMAX) {
	field[FMAX] = '\0';
	fprintf(stderr, "read_relation, field too big:%s...\n", field);
	return 0;  /* a PlTerm is really a long, so I assume this a nonterm */
	field[f++] = c;

    else {
      /* We have a field.  Prepend field to tuple. */
      PlTerm args[2];  /* for building binary terms */
      field[f] = '\0';

      args[0] = Mk_Atom(Create_Allocate_Atom(field));
      args[1] = tuple;
      tuple = Mk_List(args);

      if (c == '\n') {
	/* We have a record.  Prepend tuple to relation. */
	args[0] = tuple;
	args[1] = relation;
	relation = Mk_List(args);

	/* Initialize tuple for next record. */
	tuple = Mk_Atom(atom_nil);
      f = 0;  /* Start collecting next field. */

  if (f != 0) {  /* field not empty, so last character is neither \t nor \n. */
    fprintf(stderr, "\nread_relation: the data does not end with a newline.\n\n");
    return 0;
  else if (!Blt_Atom(tuple)) {  /* tuple not empty, so last character is \t. */
    fprintf(stderr, "\nread_relation: the data ends with a tab.\n\n");
    return 0;
  else { /* All is well. */
    return relation;
}  /* read_relation */



/*
  This is the routine that is called by gprolog.  Declare it in prolog as:

      :- foreign(backward_data_from_url(+string, -term)).

  The returned relation is backward, and each tuple is backward.

  Remember to enclose the url in SINGLE quotes, like this:

  backward_data_from_url('http://www-unix.mcs.anl.gov/~mccune/misc/test2.html', X).

  If the HTML server is not found or times out, the call fails.

  If the server responds "(404) File not found" or something similar,
  the call succeeds, returning the HTML of the error message.
*/



/*
   This version uses "system" to run curl or wget as a shell command,
   putting the data in a temporary file, then it reads the file
   and constructs the relation.

   Originally, I used libcurl to get the data directly.  That worked
   nicely in Linux, but I couldn't get it to work on my Mac (gprolog
   kept crashing).

   Note that curl does not do automatic redirects or URL encoding, so
   wget is probably better.  However, wget has to be installed on the Mac.

   See http://www.blooberry.com/indexdot/html/topics/urlencoding.htm
*/


/* Foreign function called from gprolog (see the comment above for the
 * :- foreign declaration).  Fetches url into a temporary file by running
 * wget (or curl, if USE_CURL is defined) through system(), then reads the
 * file with read_relation() and binds *relation to the resulting term.
 *
 * Returns TRUE on success, FALSE on any failure (command failed, temp
 * file unreadable, or read_relation() returned 0).
 */
Bool backward_data_from_url(char *url, PlTerm *relation)
{
  int rc;
  FILE *fp;
  char tmpfile[64];
  char *command;

  snprintf(tmpfile, sizeof tmpfile, "/tmp/gprolog_bdfu_%d", (int) getpid());

  /* Room for the fixed command text, the temp file name, and the URL.
   * (The old strlen(url)+50 was too small for the wget form and could
   * overflow the buffer.)
   */
  command = malloc(strlen(url) + strlen(tmpfile) + 64);
  if (command == NULL) {
    fprintf(stderr, "\nbackward_data_from_url: out of memory.\n\n");
    return FALSE;
  }

/* #define USE_CURL */
#ifdef USE_CURL
  sprintf(command, "curl \"%s\" > %s 2> /dev/null", url, tmpfile);
#else
  sprintf(command, "wget -O %s \"%s\" 2> /dev/null", tmpfile, url);
#endif

  rc = system(command);
  *relation = 0;

  if (rc != 0) {
    fprintf(stderr, "\ncommand \"%s\" fails with code %d.\n\n", command, rc);
  }
  else {
    fp = fopen(tmpfile, "r");
    if (fp == NULL) {
      fprintf(stderr, "\nerror opening URL data file %s.\n\n", tmpfile);
    }
    else {
      *relation = read_relation(fp);
      fclose(fp);  /* the original leaked this stream */
    }
  }

  /* clean up */

  remove(tmpfile);  /* no need to shell out to /bin/rm */
  free(command);

  if (*relation == 0)
    return FALSE;
  else
    return TRUE;
}  /* backward_data_from_url */

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3