Extract all URL links out of Google Slides presentation with Google Apps Script

Question

I am trying to create a function that, when passed a Google Slides presentation ID, can parse the presentation and write all URL links it finds to a Google Sheet. I have adapted the following function, which does the same for a Google Docs document input, based on this answer from @Yuval:

/**
 * Collects every hyperlink found in the text of a Google Docs document and
 * records each one in the "Extracted Links" sheet of the active spreadsheet.
 *
 * For every link reported, a row [document name, URL] is appended to the
 * sheet and both values are written to the log.
 *
 * @param {string} docId ID of the document to scan.
 * @param {boolean} [mergeAdjacent] When truthy, consecutive style segments of
 *     the same text element that carry the same URL are reported as a single
 *     link instead of one link per segment.
 * @return {Object[]} One record per reported link with the keys `section`,
 *     `isFirstPageSection`, `paragraph`, `textEl`, `startOffset`,
 *     `endOffsetInclusive` (null when the link's segment is the last style
 *     segment of the text element) and `url`.
 * @throws {Error} If a section does not expose getParagraphs().
 */
function getAllLinks(docId, mergeAdjacent) {
  const links = [];

  const doc = DocumentApp.openById(docId);
  const parentDocName = doc.getName();
  // NOTE(review): getSheetByName() yields null when the sheet is missing, in
  // which case the first appendRow() below throws - confirm the sheet exists.
  const sh = SpreadsheetApp.getActive().getSheetByName('Extracted Links');

  iterateSections(doc, (section, sectionIndex, isFirstPageSection) => {
    if (!("getParagraphs" in section)) {
      // as we're using some undocumented API, adding this to avoid cryptic
      // messages upon possible API changes.
      throw new Error("An API change has caused this script to stop " +
                      "working.\n" +
                      "Section #" + sectionIndex + " of type " +
                      section.getType() + " has no .getParagraphs() method. " +
                      "Stopping script.");
    }

    section.getParagraphs().forEach((par) => {
      // skip empty paragraphs
      if (par.getNumChildren() === 0) {
        return;
      }

      // go over all text elements in paragraph / list-item
      for (let el = par.getChild(0); el != null; el = el.getNextSibling()) {
        if (el.getType() != DocumentApp.ElementType.TEXT) {
          continue;
        }

        // go over all styling segments in text element; a link always starts
        // on a segment boundary, so probing each start offset is sufficient
        let lastLink = null;
        el.getTextAttributeIndices().forEach((startOffset, i, attributeIndices) => {
          const url = el.getLinkUrl(startOffset);
          if (url == null) {
            return;
          }

          // the segment ends right before the next one starts; the last
          // segment has no known end and is marked with null
          const endOffsetInclusive = (i + 1 < attributeIndices.length
            ? attributeIndices[i + 1] - 1
            : null);

          // check if this and the last found link are continuous
          if (mergeAdjacent && lastLink != null && lastLink.url == url &&
              lastLink.endOffsetInclusive == startOffset - 1) {
            // extend the previous record instead of reporting a new link
            lastLink.endOffsetInclusive = endOffsetInclusive;
            return;
          }

          lastLink = {
            "section": section,
            "isFirstPageSection": isFirstPageSection,
            "paragraph": par,
            "textEl": el,
            "startOffset": startOffset,
            "endOffsetInclusive": endOffsetInclusive,
            "url": url
          };

          // one call appends the row after the last row that has content
          sh.appendRow([parentDocName, url]);
          Logger.log(parentDocName);
          Logger.log(url);
          links.push(lastLink);
        });
      }
    });
  });

  return links;
}

/**
 * Visits every section of a document (body, headers, footers, ...) and hands
 * each one to a callback. Sections are the direct children of the element
 * returned by doc.getBody().getParent().
 *
 * @param {Document} doc The document whose sections should be visited, e.g.
 *     the result of DocumentApp.getActiveDocument() or DocumentApp.openById().
 * @param {Function} func Callback invoked once per section, in child order,
 *     with these arguments:
 *       - {ContainerElement} section - the section element itself
 *       - {Number} sectionIndex - the section's child index, i.e.
 *         doc.getBody().getParent().getChild(sectionIndex) == section
 *       - {Boolean} isFirstPageSection - true when the section is a header or
 *         footer section other than the ones returned by doc.getHeader() and
 *         doc.getFooter().
 */
function iterateSections(doc, func) {
  // Reaching the root element through the body's parent is undocumented API.
  const root = doc.getBody().getParent();

  // Child positions of the regular header/footer; -1 when the document has none.
  let headerIdx = -1;
  if (doc.getHeader() != null) {
    headerIdx = root.getChildIndex(doc.getHeader());
  }
  let footerIdx = -1;
  if (doc.getFooter() != null) {
    footerIdx = root.getChildIndex(doc.getFooter());
  }

  let idx = 0;
  // The child count is re-read on every pass, as the callback receives live elements.
  while (idx < root.getNumChildren()) {
    const child = root.getChild(idx);
    const kind = child.getType();

    // A header/footer section that is not the regular one is treated as a
    // first-page section.
    let firstPageOnly = false;
    if (!(idx === headerIdx || idx === footerIdx)) {
      firstPageOnly = (kind == DocumentApp.ElementType.HEADER_SECTION ||
                       kind == DocumentApp.ElementType.FOOTER_SECTION);
    }

    func(child, idx, firstPageOnly);
    idx += 1;
  }
}

When I try to create the same for a Google Slides presentation as the input, I am getting stuck on the step of how to parse through the document and extract all of the text bits (in order to check them for links). It seems like I would need to use getSlides(), and then getPageElements() and iterate through those, but I am unclear on how to get to the actual text on the slides. Any tips on how to iterate through the actual text on slides (and potentially how to extract the link URL out of that text if it has one) would be much appreciated. Thank you!

NightEye · Accepted Answer

If you just want to get the links from the slides, see the code below:

Code:

/**
 * Logs and returns the URLs of all hyperlinked text found in the shapes of a
 * Google Slides presentation.
 *
 * Only text inside shapes (slide.getShapes()) is inspected.
 *
 * @param {string} [presentationId] ID of the presentation to scan. When it is
 *     omitted (or is not a string, e.g. an event object passed by a trigger),
 *     the active presentation is scanned, as before.
 * @return {string[]} The URLs in slide, shape and text order.
 */
function getLinksFromSlides(presentationId) {
  const presentation = (typeof presentationId === 'string'
    ? SlidesApp.openById(presentationId)
    : SlidesApp.getActivePresentation());

  const urls = [];
  // traverse each slide
  presentation.getSlides().forEach((slide) => {
    // traverse each shape
    slide.getShapes().forEach((shape) => {
      // each entry returned by getLinks() is a text range covered by one link
      shape.getText().getLinks().forEach((linkRange) => {
        const url = linkRange.getTextStyle().getLink().getUrl();
        // getUrl() is null for links that do not point to a web page (such as
        // links to another slide); those carry no URL to report
        if (url == null) {
          return;
        }
        Logger.log(url);
        urls.push(url);
      });
    });
  });

  return urls;
}

Sample:

sample

Output:

output

Note:

This only extracts hyperlinks. It doesn't extract any links/URLs that aren't hyperlinked, as shown in the sample data (e.g. https://www.facebook.com).
If you want the non-hyperlinked URLs as well, you might have to try a regex.

Extract all URL links out of Google Slides presentation with Google Apps Script

Tags:

javascript

google-apps-script

google-slides

Gabriel Tero

1 Answer

Code:

Sample:

Output:

Note:

NightEye

Recent Activity

Donate For Us

Extract all URL links out of Google Slides presentation with Google Apps Script

Tags:

javascript

google-apps-script

google-slides

Gabriel Tero

1 Answer

Code:

Sample:

Output:

Note:

NightEye

Related questions

Recent Activity

Donate For Us