Difference between revisions of "Narrative Website Import"

From Gramps
Jump to: navigation, search
m (clarify & reword)
(6 intermediate revisions by 2 users not shown)
Line 1: Line 1:
 
{{languages}}
 
{{languages}}
 +
{{man warn|Outdated code|The script below was written for the Gramps 2.2 NarrativeWeb report and needs to be updated to work with Gramps 3 or later versions.}}
 +
{{stub}}
 +
Use a '''Gramps-created [[Gramps_4.2_Wiki_Manual_-_Reports_-_part_7#Narrated_Web_Site|Narrative Website report]]''' to restore your Gramps database.
  
Use a '''GRAMPS-created Narrative Website report''' to restore your GRAMPS database.
+
The program below works by parsing the HTML website (also called "screen scraping") and places the data into a [[Gramps_4.2_Wiki_Manual_-_Manage_Family_Trees:_CSV_Import_and_Export#Gramps_Spreadsheet_Import.2FExport|comma-separated value spreadsheet]]. You can then import it directly into Gramps.
  
Use the following code [http://bubo.brynmawr.edu/~dblank/getnarrative.py getnarrative.py] as a good starting point.  
+
To run the program from the command line, provide the URL of the surname list, like:
 +
<pre>
 +
python getnarrative.py http://somewebsite.com/myfamily/ > import.csv
 +
</pre>
 +
Then, in Gramps you should be able to import the file "import.csv" into an empty database.
 +
 
 +
{{man tip|Code|Use the following code, [http://bubo.brynmawr.edu/~dblank/getnarrative.py getnarrative.py] (the original link now returns 404), as a good starting point. The script was written for the Gramps 2.2 NarrativeWeb report and needs to be updated to work with Gramps 3 or later versions.<br /><pre>
 +
# Python script for sucking a GRAMPS Narrative Website back into
 +
# GRAMPS.
  
Read the following discussion about this code at http://www.nabble.com/Lost-grdb-td13952238.html
+
# By Doug Blank <dblank@cs.brynmawr.edu>
 +
# License: GPL
 +
# (c) 2007
  
The program works by parsing the HTML website (also called "screen scrapping") and places the data into a [[Gramps 3.1 Wiki Manual - Manage Family Trees: CSV Import and Export|comma-separated value spreadsheet]]. You can then import it directly into GRAMPS.
 
  
To run the program from the command line, provide the URL of the surname list, like:
+
import os, sys, urllib, re
 +
 
 +
count = 0
 +
person = {None: None}
 +
family = {}
 +
family_pair = {}
 +
event = {}
 +
 
 +
def loadPerson(url, surname, firstname):
 +
    global count, person, family, event, family_pair
 +
    junk, handle = url.rsplit("/",1)
 +
    handle = handle.replace(".html", "")
 +
    print >> sys.stderr, "  ", count, surname, ", ", firstname
 +
    count += 1
 +
    pfp = urllib.urlopen(gurl + "/" + url)
 +
    contents = pfp.read()
 +
    state = None
 +
    pairs = []
 +
    pdata = {}
 +
    children = []
 +
    for line in contents.split("\n"):
 +
        matches = re.findall("""<td class="(.*?)">(.*?)</td>""", line)
 +
        for match in matches:
 +
            key, data = match
 +
            if key in ["box"]:
 +
                pass # ignore
 +
            elif key in ["field", "data", "category"]:
 +
                pairs.append((key, data))
 +
        if state == "Families" and line.startswith("<a href"): # child?
 +
            matches = re.match("""<a href="(.*?)">(.*?)</a>.*""", line)
 +
            if matches:
 +
                match = matches.groups()[0]
 +
                if "/ppl/" in match:
 +
                    junk, chandle = match.rsplit("/", 1)
 +
                    chandle = chandle.replace(".html","")
 +
                    children.append(chandle)
 +
        elif "<h" in line:
 +
            matches = re.match("<h.>(.*?)</h.>", line)
 +
            if matches:
 +
                if state != None:
 +
                    if state == "Parents":
 +
                        #print "      Parents:", pairs
 +
                        father, mother = None, None
 +
                        for i in range(len(pairs)):
 +
                            if pairs[i][1] == "Father":
 +
                                father = pairs[i+1][1]
 +
                            if pairs[i][1] == "Mother":
 +
                                mother = pairs[i+1][1]
 +
                        if father:
 +
                            father = father.replace("</a>", "")
 +
                            if "/" in father:
 +
                                junk, fhandle = father.rsplit("/", 1)
 +
                                father, name = fhandle.split(".html",1)
 +
                        if mother:
 +
                            mother = mother.replace("</a>", "")
 +
                            if "/" in mother:
 +
                                junk, mhandle = mother.rsplit("/", 1)
 +
                                mother, name = mhandle.split(".html",1)
 +
                        if (father, mother) in family:
 +
                            family[(father, mother)].append(handle)
 +
                        else:
 +
                            family[(father, mother)] = [handle]
 +
                    elif state == "Families":
 +
                        #print "      Families:", pairs
 +
                        mdata = {"me": handle}
 +
                        mhandle = None
 +
                        for (key, value) in pairs:
 +
                            if key == "category":
 +
                                mdata["type"] = value
 +
                            elif key == "field":
 +
                                mdata["spouse"] = value
 +
                            elif key == "data":
 +
                                value = value.replace("</a>", "")
 +
                                if "/" in value:
 +
                                    junk, handle_name = value.rsplit("/", 1)
 +
                                    mhandle, name = handle_name.split(".html",1)
 +
                        handles = [handle, mhandle]
 +
                        handles.sort()
 +
                        #print "adding", handles, mdata
 +
                        family_pair[tuple(handles)] = mdata
 +
                    elif state == "Events":
 +
                        #print "      Events:", pairs
 +
                        event[(handle, pairs[0][1])] = pairs
 +
                    elif state.strip() == (firstname + " " + surname).strip():
 +
                        pdata = {"surname": surname,
 +
                                "firstname": firstname,
 +
                                "children": children,
 +
                                "suffix": ""}
 +
                        for i in range(len(pairs)):
 +
                            if pairs[i][0] == "field":
 +
                                pdata[pairs[i][1]] = pairs[i+1][1]
 +
                            i += 1
 +
                        person[handle] = pdata
 +
                    elif state == "Pedigree":
 +
                        state = None
 +
                    elif state in ["Ancestors", "Narrative"]:
 +
                        pass
 +
                    else: # name didn't match exactly
 +
                        state = state.replace(surname, "")
 +
                        state = state.replace(firstname, "")
 +
                        suffix = state.strip()
 +
                        pdata = {"surname": surname,
 +
                                "firstname": firstname,
 +
                                "children": children,
 +
                                "suffix": ""}
 +
                        if suffix:
 +
                            pdata["suffix"] = suffix
 +
                        for i in range(len(pairs)):
 +
                            if pairs[i][0] == "field":
 +
                                pdata[pairs[i][1]] = pairs[i+1][1]
 +
                            i += 1
 +
                        person[handle] = pdata
 +
                else:
 +
                    pass # new person
 +
                pairs = []
 +
                state = matches.groups()[0]
 +
 
 +
def loadSurname(url, surname):
 +
    sfp = urllib.urlopen(gurl + "/" + url)
 +
    contents = sfp.read()
 +
    for line in contents.split("\n"):
 +
        list = re.findall("""<a href="(.*?)">(.*?)</a>""", line)
 +
        for surnameURL in list:
 +
            url, firstname = surnameURL
 +
            if url.endswith(".html") and "/ppl/" in url:
 +
                prefix, purl = url.split("/ppl/")
 +
                loadPerson("/ppl/" + purl, surname, firstname)
 +
 
 +
 
 +
gurl = sys.argv[1] # URL of surnames
 +
fp = urllib.urlopen(gurl)
 +
contents = fp.read() # read in website
 +
for line in contents.split("\n"):
 +
    list = re.findall("""<a href="(.*?)">(.*?)</a>""", line)
 +
    for surnameURL in list:
 +
        url, surname = surnameURL
 +
        if url.endswith(".html") and url.startswith("srn"):
 +
            print >> sys.stderr, "Processing surname", surname, "..."
 +
            loadSurname(*surnameURL)
 +
 
 +
print "person,firstname,lastname,suffix,gender"
 +
for h in person:
 +
    if h:
 +
        print '"%s","%s","%s","%s","%s"' % (h, person[h]["firstname"],
 +
                                            person[h]["surname"],
 +
                                            person[h]["suffix"],
 +
                                            person[h]["Gender"])
 +
 
 +
for fam in family_pair:
 +
    data = family_pair[fam]
 +
    h1, h2 = fam
 +
    p1, p2 = None, None
 +
    if h1 in person:
 +
        p1 = person[h1]
 +
    if h2 in person:
 +
        p2 = person[h2]
 +
    if p1 and p2:
 +
        if p1["Gender"] == "male" and p2["Gender"] == "female":
 +
            if (h1, h2) in family:
 +
                family[(h1,h2)].append(data["me"])
 +
            else:
 +
                family[(h1,h2)] = [data["me"]]
 +
        else:
 +
            if (h2, h1) in family:
 +
                family[(h2,h1)].append(data["me"])
 +
            else:
 +
                family[(h2,h1)] = [data["me"]]
 +
 
 +
print
 +
print "marriage,parent1,parent2"
 +
count = 1
 +
marriage = {}
 +
for pair in family:
 +
    marriage[pair] = "F%04d" % count
 +
    print '"%s","%s","%s"' % (marriage[pair], pair[0], pair[1])
 +
    count += 1
  
python getnarrative.py http://somewebsite.com/myfamily/ > import.csv
+
print
 +
print "family,child"
 +
for pair in family:
 +
    kids = family[pair]
 +
    kids = set(kids)
 +
    for kid in kids:
 +
        if (kid != pair[0]) or (kid != pair[1]):
 +
            print '"%s","%s"' % (marriage[pair], kid)
 +
</pre>
 +
}}
  
Then, in GRAMPS you should be able to import the file "import.csv" into an empty database.
+
==See also==
 +
Read the following discussion about this code at [http://comments.gmane.org/gmane.comp.genealogy.gramps.user/4986 Lost grdb] & [http://gramps.1791082.n4.nabble.com/Re-return-from-NAVWEB-to-GRAMPS-and-NOT-HIDE-td3780312.html#a3780390]
  
 
[[Category:Documentation]]
 
[[Category:Documentation]]
 
[[Category:Developers/General]]
 
[[Category:Developers/General]]

Revision as of 07:49, 5 January 2017

Gnome-important.png
Outdated code

The script below was written for the Gramps 2.2 NarrativeWeb report and needs to be updated to work with Gramps 3 or later versions.

Gramps-notes.png

This article's content is incomplete or a placeholder stub.
Please update or expand this section.


Use a Gramps-created Narrative Website report to restore your Gramps database.

The program below works by parsing the HTML website (also called "screen scraping") and places the data into a comma-separated value spreadsheet. You can then import it directly into Gramps.

To run the program from the command line, provide the URL of the surname list, like:

 python getnarrative.py http://somewebsite.com/myfamily/ > import.csv

Then, in Gramps you should be able to import the file "import.csv" into an empty database.

Tango-Dialog-information.png
Code

Use the following code, getnarrative.py (the original download link now returns 404), as a good starting point. The script was written for the Gramps 2.2 NarrativeWeb report and needs to be updated to work with Gramps 3 or later versions.
# Python script for sucking a GRAMPS Narrative Website back into
# GRAMPS.

# By Doug Blank <dblank@cs.brynmawr.edu>
# License: GPL
# (c) 2007


import os, sys, urllib, re

# Shared state filled in by loadPerson() while the website is scraped.

# Number of person pages requested so far; printed as a progress counter.
count = 0
# Person handle -> dict with "surname", "firstname", "children", "suffix"
# plus one entry per labelled field found on the person's page.
# NOTE(review): the None -> None entry looks like a placeholder for
# "no person"; the output loop skips falsy keys -- confirm it is needed.
person = {None: None}
# (father, mother) pair -> list of handles recorded for that family.
family = {}
# Sorted (person handle, partner handle) pair -> dict built from a
# "Families" section ("me", and where present "type" and "spouse").
family_pair = {}
# (person handle, first cell value of an "Events" section) -> that
# section's list of (cell class, cell content) pairs.
event = {}

def _link_handle(cell):
    """Return the page handle found in a table cell that holds a link.

    The handle is the link target's file name without ".html".  Returns
    None when the cell does not look like a link to an ".html" page, so
    callers can treat the cell as plain text instead of crashing.
    """
    cell = cell.replace("</a>", "")
    if "/" not in cell:
        return None
    # What follows the last "/" has the form 'HANDLE.html">Shown Name'.
    tail = cell.rsplit("/", 1)[1]
    if ".html" not in tail:
        return None
    return tail.split(".html", 1)[0]


def _handle_or_text(cell):
    """Return the handle linked from *cell*, or its text if it has no link."""
    found = _link_handle(cell)
    if found is None:
        return cell.replace("</a>", "")
    return found


def _person_record(surname, firstname, suffix, children, pairs):
    """Build the dict stored in ``person`` for one individual.

    Every "field" cell is used as a key and the cell that follows it as
    the value.  A "field" cell with nothing after it is ignored.
    """
    record = {"surname": surname,
              "firstname": firstname,
              "children": children,
              "suffix": suffix}
    for (kind, label), (next_kind, value) in zip(pairs, pairs[1:]):
        if kind == "field":
            record[label] = value
    return record


def loadPerson(url, surname, firstname):
    """Download one person page and record what it says in the globals.

    url       -- path of the page, appended to the global ``gurl``
    surname   -- surname under which the person was listed
    firstname -- link text of the person on the surname page

    The page is read line by line.  Table cells with class "field",
    "data" or "category" are collected until the next heading line; the
    heading that opened the section decides where the cells go:

    * "Parents"  -> the person is appended to family[(father, mother)]
    * "Families" -> family_pair[(sorted pair of handles)] is set
    * "Events"   -> event[(handle, first cell value)] is set
    * "Pedigree", "Ancestors", "Narrative" -> ignored
    * anything else is taken to be the person's name heading and
      produces the person[handle] record; text left over after removing
      the surname and first name becomes the suffix.

    NOTE(review): a section is only stored when the *next* heading is
    seen, so cells after the last heading of the page are dropped.
    """
    global count
    handle = url.rsplit("/", 1)[-1].replace(".html", "")
    print >> sys.stderr, "   ", count, surname, ", ", firstname
    count += 1
    pfp = urllib.urlopen(gurl + "/" + url)
    try:
        contents = pfp.read()
    finally:
        pfp.close()
    state = None      # heading text of the section being collected
    pairs = []        # (cell class, cell content) pairs of that section
    # The same list object is stored in the person record and keeps
    # growing while the later "Families" section is read.
    children = []
    for line in contents.split("\n"):
        cells = re.findall("""<td class="(.*?)">(.*?)</td>""", line)
        for key, data in cells:
            # Other classes (for example "box") carry nothing we need.
            if key in ("field", "data", "category"):
                pairs.append((key, data))
        if state == "Families" and line.startswith("<a href"): # child?
            link = re.match("""<a href="(.*?)">(.*?)</a>.*""", line)
            if link:
                target = link.group(1)
                if "/ppl/" in target:
                    chandle = target.rsplit("/", 1)[1]
                    children.append(chandle.replace(".html", ""))
        elif "<h" in line:
            heading = re.match("<h.>(.*?)</h.>", line)
            if not heading:
                continue
            if state is None:
                pass # first heading of the page: nothing collected yet
            elif state == "Parents":
                father, mother = None, None
                # The cell after a "Father"/"Mother" label holds the link.
                for (kind, label), (next_kind, value) in zip(pairs,
                                                             pairs[1:]):
                    if label == "Father":
                        father = value
                    if label == "Mother":
                        mother = value
                if father:
                    father = _handle_or_text(father)
                if mother:
                    mother = _handle_or_text(mother)
                family.setdefault((father, mother), []).append(handle)
            elif state == "Families":
                mdata = {"me": handle}
                mhandle = None
                for key, value in pairs:
                    if key == "category":
                        mdata["type"] = value
                    elif key == "field":
                        mdata["spouse"] = value
                    elif key == "data":
                        found = _link_handle(value)
                        if found is not None:
                            mhandle = found
                # Key the couple by its two handles in sorted order, an
                # unknown partner (None) first, so both partners' pages
                # produce the same key.
                if mhandle is None or mhandle < handle:
                    couple = (mhandle, handle)
                else:
                    couple = (handle, mhandle)
                family_pair[couple] = mdata
            elif state == "Events":
                if pairs:
                    event[(handle, pairs[0][1])] = pairs
            elif state.strip() == (firstname + " " + surname).strip():
                person[handle] = _person_record(surname, firstname, "",
                                                children, pairs)
            elif state in ("Pedigree", "Ancestors", "Narrative"):
                pass
            else: # name didn't match exactly
                leftover = state.replace(surname, "")
                leftover = leftover.replace(firstname, "")
                person[handle] = _person_record(surname, firstname,
                                                leftover.strip(),
                                                children, pairs)
            pairs = []
            state = heading.group(1)

def loadSurname(url, surname):
    """Download one surname page and load every person it links to.

    url     -- path of the surname page, appended to the global ``gurl``
    surname -- surname the page belongs to; passed on to loadPerson()

    Every link on the page whose target ends in ".html" and contains
    "/ppl/" is treated as a person page.  The link text is handed to
    loadPerson() as the person's first name.
    """
    sfp = urllib.urlopen(gurl + "/" + url)
    try:
        contents = sfp.read()
    finally:
        sfp.close()
    for line in contents.split("\n"):
        links = re.findall("""<a href="(.*?)">(.*?)</a>""", line)
        for link, firstname in links:
            if link.endswith(".html") and "/ppl/" in link:
                # Drop whatever relative prefix precedes "/ppl/"; the
                # maxsplit keeps this safe if the marker occurs twice.
                purl = link.split("/ppl/", 1)[1]
                loadPerson("/ppl/" + purl, surname, firstname)


gurl = sys.argv[1] # URL of surnames
fp = urllib.urlopen(gurl) 
contents = fp.read() # read in website
for line in contents.split("\n"):
    list = re.findall("""<a href="(.*?)">(.*?)</a>""", line)
    for surnameURL in list:
        url, surname = surnameURL
        if url.endswith(".html") and url.startswith("srn"):
            print >> sys.stderr, "Processing surname", surname, "..."
            loadSurname(*surnameURL)

print "person,firstname,lastname,suffix,gender"
for h in person:
    if h:
        print '"%s","%s","%s","%s","%s"' % (h, person[h]["firstname"], 
                                            person[h]["surname"], 
                                            person[h]["suffix"], 
                                            person[h]["Gender"])

for fam in family_pair:
    data = family_pair[fam]
    h1, h2 = fam
    p1, p2 = None, None
    if h1 in person:
        p1 = person[h1]
    if h2 in person:
        p2 = person[h2]
    if p1 and p2:
        if p1["Gender"] == "male" and p2["Gender"] == "female":
            if (h1, h2) in family:
                family[(h1,h2)].append(data["me"])
            else:
                family[(h1,h2)] = [data["me"]]
        else:
            if (h2, h1) in family:
                family[(h2,h1)].append(data["me"])
            else:
                family[(h2,h1)] = [data["me"]]

print
print "marriage,parent1,parent2"
count = 1
marriage = {}
for pair in family:
    marriage[pair] = "F%04d" % count
    print '"%s","%s","%s"' % (marriage[pair], pair[0], pair[1])
    count += 1

print
print "family,child"
for pair in family:
    kids = family[pair]
    kids = set(kids)
    for kid in kids:
        if (kid != pair[0]) or (kid != pair[1]):
            print '"%s","%s"' % (marriage[pair], kid)


See also

Read the following discussion about this code at Lost grdb & [1]